rust-filepatterns: use bytes instead of String
In my initial patch, I introduced an unnecessary hard constraint on UTF-8
filenames and patterns which I forgot to remove. Although the performance
penalty for using String might be negligible, we don't want to break
compatibility with non-UTF-8 encodings for no reason.
Moreover, this change allows for a cleaner Rust core API.
This patch introduces a new utils module that is used with this fix.
Finally, PatternError was not put inside the Python module generated by
Rust, which would have raised a NameError.
Differential Revision: https://phab.mercurial-scm.org/D6485
--- a/rust/hg-core/src/filepatterns.rs Sat Jun 01 01:24:49 2019 +0200
+++ b/rust/hg-core/src/filepatterns.rs Thu Jun 06 15:30:56 2019 +0200
@@ -1,9 +1,11 @@
use crate::{LineNumber, PatternError, PatternFileError};
-use regex::Regex;
+use regex::bytes::Regex;
use std::collections::HashMap;
use std::fs::File;
use std::io::Read;
use std::vec::Vec;
+use utils::files::get_path_from_bytes;
+use utils::{replace_slice, SliceExt};
lazy_static! {
static ref reescape: Vec<Vec<u8>> = {
@@ -192,11 +194,11 @@
/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
/// that don't need to be transformed into a regex.
pub fn build_single_regex(
- kind: &str,
+ kind: &[u8],
pat: &[u8],
globsuffix: &[u8],
) -> Result<Vec<u8>, PatternError> {
- let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+ let enum_kind = parse_pattern_syntax(kind)?;
if enum_kind == PatternSyntax::RootGlob
&& pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
{
@@ -207,43 +209,42 @@
}
lazy_static! {
- static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+ static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = {
let mut m = HashMap::new();
- m.insert("re", "relre:");
- m.insert("regexp", "relre:");
- m.insert("glob", "relglob:");
- m.insert("rootglob", "rootglob:");
- m.insert("include", "include");
- m.insert("subinclude", "subinclude");
+ m.insert(b"re".as_ref(), b"relre:".as_ref());
+ m.insert(b"regexp".as_ref(), b"relre:".as_ref());
+ m.insert(b"glob".as_ref(), b"relglob:".as_ref());
+ m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
+ m.insert(b"include".as_ref(), b"include".as_ref());
+ m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
m
};
}
-pub type PatternTuple = (String, LineNumber, String);
+pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
type WarningTuple = (String, String);
pub fn parse_pattern_file_contents(
- lines: &str,
- file_path: &str,
+ lines: &[u8],
+ file_path: &[u8],
warn: bool,
) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
let mut inputs: Vec<PatternTuple> = vec![];
let mut warnings: Vec<WarningTuple> = vec![];
- let mut current_syntax = "relre:";
+ let mut current_syntax = b"relre:".as_ref();
- let mut line = String::new();
- for (line_number, line_str) in lines.split('\n').enumerate() {
+ for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
let line_number = line_number + 1;
- line.replace_range(.., line_str);
- if line.contains('#') {
- if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
- line = line[..cap.get(1).unwrap().end()].to_string()
+ if line.contains(&('#' as u8)) {
+ if let Some(cap) = comment_regex.captures(line) {
+ line = &line[..cap.get(1).unwrap().end()]
}
- line = line.replace(r"\#", "#");
+ let mut line = line.to_owned();
+ replace_slice(&mut line, br"\#", b"#");
}
let mut line = line.trim_end();
@@ -252,25 +253,28 @@
continue;
}
- if line.starts_with("syntax:") {
- let syntax = line["syntax:".len()..].trim();
+ if line.starts_with(b"syntax:") {
+ let syntax = line[b"syntax:".len()..].trim();
if let Some(rel_syntax) = SYNTAXES.get(syntax) {
current_syntax = rel_syntax;
} else if warn {
- warnings.push((file_path.to_string(), syntax.to_string()));
+ warnings.push((
+ String::from_utf8_lossy(file_path).to_string(),
+ String::from_utf8_lossy(syntax).to_string(),
+ ));
}
continue;
}
- let mut line_syntax: &str = ¤t_syntax;
+ let mut line_syntax: &[u8] = ¤t_syntax;
for (s, rels) in SYNTAXES.iter() {
if line.starts_with(rels) {
line_syntax = rels;
line = &line[rels.len()..];
break;
- } else if line.starts_with(&format!("{}:", s)) {
+ } else if line.starts_with(&[s, b":".as_ref()].concat()) {
line_syntax = rels;
line = &line[s.len() + 1..];
break;
@@ -278,24 +282,24 @@
}
inputs.push((
- format!("{}{}", line_syntax, line),
+ [line_syntax, line].concat(),
line_number,
- line.to_string(),
+ line.to_owned(),
));
}
(inputs, warnings)
}
pub fn read_pattern_file(
- file_path: String,
+ file_path: &[u8],
warn: bool,
) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
- let mut f = File::open(&file_path)?;
- let mut contents = String::new();
+ let mut f = File::open(get_path_from_bytes(file_path))?;
+ let mut contents = Vec::new();
- f.read_to_string(&mut contents)?;
+ f.read_to_end(&mut contents)?;
- Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+ Ok(parse_pattern_file_contents(&contents, file_path, warn))
}
#[cfg(test)]
@@ -328,18 +332,23 @@
#[test]
fn test_parse_pattern_file_contents() {
- let lines = "syntax: glob\n*.elc";
+ let lines = b"syntax: glob\n*.elc";
assert_eq!(
- vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())],
- parse_pattern_file_contents(lines, "file_path", false).0,
+ vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
+ parse_pattern_file_contents(lines, b"file_path", false).0,
);
- let lines = "syntax: include\nsyntax: glob";
+ let lines = b"syntax: include\nsyntax: glob";
assert_eq!(
- parse_pattern_file_contents(lines, "file_path", false).0,
+ parse_pattern_file_contents(lines, b"file_path", false).0,
vec![]
);
+ let lines = b"glob:**.o";
+ assert_eq!(
+ parse_pattern_file_contents(lines, b"file_path", false).0,
+ vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
+ );
}
}
--- a/rust/hg-core/src/lib.rs Sat Jun 01 01:24:49 2019 +0200
+++ b/rust/hg-core/src/lib.rs Thu Jun 06 15:30:56 2019 +0200
@@ -19,6 +19,7 @@
CopyVec, CopyVecEntry, DirstateEntry, DirstateParents, DirstateVec,
};
mod filepatterns;
+mod utils;
pub use filepatterns::{
build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/utils/files.rs Thu Jun 06 15:30:56 2019 +0200
@@ -0,0 +1,17 @@
+use std::path::Path;
+
+pub fn get_path_from_bytes(bytes: &[u8]) -> &Path {
+ let os_str;
+ #[cfg(unix)]
+ {
+ use std::os::unix::ffi::OsStrExt;
+ os_str = std::ffi::OsStr::from_bytes(bytes);
+ }
+ #[cfg(windows)]
+ {
+ use std::os::windows::ffi::OsStrExt;
+ os_str = std::ffi::OsString::from_wide(bytes);
+ }
+
+ Path::new(os_str)
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/utils/mod.rs Thu Jun 06 15:30:56 2019 +0200
@@ -0,0 +1,45 @@
+pub mod files;
+
+pub fn replace_slice<T>(buf: &mut [T], from: &[T], to: &[T])
+where
+ T: Clone + PartialEq,
+{
+ if buf.len() < from.len() || from.len() != to.len() {
+ return;
+ }
+ for i in 0..=buf.len() - from.len() {
+ if buf[i..].starts_with(from) {
+ buf[i..(i + from.len())].clone_from_slice(to);
+ }
+ }
+}
+
+pub trait SliceExt {
+ fn trim(&self) -> &Self;
+ fn trim_end(&self) -> &Self;
+}
+
+fn is_not_whitespace(c: &u8) -> bool {
+ !(*c as char).is_whitespace()
+}
+
+impl SliceExt for [u8] {
+ fn trim(&self) -> &[u8] {
+ if let Some(first) = self.iter().position(is_not_whitespace) {
+ if let Some(last) = self.iter().rposition(is_not_whitespace) {
+ &self[first..last + 1]
+ } else {
+ unreachable!();
+ }
+ } else {
+ &[]
+ }
+ }
+ fn trim_end(&self) -> &[u8] {
+ if let Some(last) = self.iter().rposition(is_not_whitespace) {
+ &self[..last + 1]
+ } else {
+ &[]
+ }
+ }
+}
--- a/rust/hg-cpython/src/filepatterns.rs Sat Jun 01 01:24:49 2019 +0200
+++ b/rust/hg-cpython/src/filepatterns.rs Thu Jun 06 15:30:56 2019 +0200
@@ -11,14 +11,10 @@
//! and can be used as replacement for the the pure `filepatterns` Python module.
//!
use cpython::{
- exc, PyDict, PyErr, PyModule, PyResult, PyString, PyTuple, Python,
- ToPyObject,
+ PyBytes, PyDict, PyModule, PyObject, PyResult, PyTuple, Python, ToPyObject,
};
-use hg::{build_single_regex, read_pattern_file, PatternTuple};
-use exceptions::{
- PatternError,
- PatternFileError,
-};
+use exceptions::{PatternError, PatternFileError};
+use hg::{build_single_regex, read_pattern_file, LineNumber, PatternTuple};
/// Rust does not like functions with different return signatures.
/// The 3-tuple version is always returned by the hg-core function,
@@ -32,17 +28,22 @@
/// for more details.
fn read_pattern_file_wrapper(
py: Python,
- file_path: String,
+ file_path: PyObject,
warn: bool,
source_info: bool,
) -> PyResult<PyTuple> {
- match read_pattern_file(file_path, warn) {
+ match read_pattern_file(file_path.extract::<PyBytes>(py)?.data(py), warn) {
Ok((patterns, warnings)) => {
if source_info {
- return Ok((patterns, warnings).to_py_object(py));
+ let itemgetter = |x: &PatternTuple| {
+ (PyBytes::new(py, &x.0), x.1, PyBytes::new(py, &x.2))
+ };
+ let results: Vec<(PyBytes, LineNumber, PyBytes)> =
+ patterns.iter().map(itemgetter).collect();
+ return Ok((results, warnings).to_py_object(py));
}
- let itemgetter = |x: &PatternTuple| x.0.to_py_object(py);
- let results: Vec<PyString> =
+ let itemgetter = |x: &PatternTuple| PyBytes::new(py, &x.0);
+ let results: Vec<PyBytes> =
patterns.iter().map(itemgetter).collect();
Ok((results, warnings).to_py_object(py))
}
@@ -52,22 +53,16 @@
fn build_single_regex_wrapper(
py: Python,
- kind: String,
- pat: String,
- globsuffix: String,
-) -> PyResult<PyString> {
+ kind: PyObject,
+ pat: PyObject,
+ globsuffix: PyObject,
+) -> PyResult<PyBytes> {
match build_single_regex(
- kind.as_ref(),
- pat.as_bytes(),
- globsuffix.as_bytes(),
+ kind.extract::<PyBytes>(py)?.data(py),
+ pat.extract::<PyBytes>(py)?.data(py),
+ globsuffix.extract::<PyBytes>(py)?.data(py),
) {
- Ok(regex) => match String::from_utf8(regex) {
- Ok(regex) => Ok(regex.to_py_object(py)),
- Err(e) => Err(PyErr::new::<exc::UnicodeDecodeError, _>(
- py,
- e.to_string(),
- )),
- },
+ Ok(regex) => Ok(PyBytes::new(py, ®ex)),
Err(e) => Err(PatternError::pynew(py, e)),
}
}
@@ -88,9 +83,9 @@
py_fn!(
py,
build_single_regex_wrapper(
- kind: String,
- pat: String,
- globsuffix: String
+ kind: PyObject,
+ pat: PyObject,
+ globsuffix: PyObject
)
),
)?;
@@ -100,13 +95,13 @@
py_fn!(
py,
read_pattern_file_wrapper(
- file_path: String,
+ file_path: PyObject,
warn: bool,
source_info: bool
)
),
)?;
-
+ m.add(py, "PatternError", py.get_type::<PatternError>())?;
let sys = PyModule::import(py, "sys")?;
let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?;
sys_modules.set_item(py, dotted_name, &m)?;