Mercurial > hg
changeset 42437:9609430d3625
rust-filepatterns: use bytes instead of String
In my initial patch, I introduced an unnecessary hard constraint on UTF-8
filenames and patterns which I forgot to remove. Although the performance
penalty for using String might be negligible, we don't want to break
compatibility with non-UTF-8 encodings for no reason.
Moreover, this change allows for a cleaner Rust core API.
This patch introduces a new utils module that is used with this fix.
Finally, PatternError was not put inside the Python module generated by
Rust, which would have raised a NameError.
Differential Revision: https://phab.mercurial-scm.org/D6485
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 06 Jun 2019 15:30:56 +0200 |
parents | dc5bd66a8270 |
children | 48f1f864d928 |
files | rust/hg-core/src/filepatterns.rs rust/hg-core/src/lib.rs rust/hg-core/src/utils/files.rs rust/hg-core/src/utils/mod.rs rust/hg-cpython/src/filepatterns.rs |
diffstat | 5 files changed, 136 insertions(+), 69 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/filepatterns.rs Sat Jun 01 01:24:49 2019 +0200 +++ b/rust/hg-core/src/filepatterns.rs Thu Jun 06 15:30:56 2019 +0200 @@ -1,9 +1,11 @@ use crate::{LineNumber, PatternError, PatternFileError}; -use regex::Regex; +use regex::bytes::Regex; use std::collections::HashMap; use std::fs::File; use std::io::Read; use std::vec::Vec; +use utils::files::get_path_from_bytes; +use utils::{replace_slice, SliceExt}; lazy_static! { static ref reescape: Vec<Vec<u8>> = { @@ -192,11 +194,11 @@ /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs /// that don't need to be transformed into a regex. pub fn build_single_regex( - kind: &str, + kind: &[u8], pat: &[u8], globsuffix: &[u8], ) -> Result<Vec<u8>, PatternError> { - let enum_kind = parse_pattern_syntax(kind.as_bytes())?; + let enum_kind = parse_pattern_syntax(kind)?; if enum_kind == PatternSyntax::RootGlob && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) { @@ -207,43 +209,42 @@ } lazy_static! { - static ref SYNTAXES: HashMap<&'static str, &'static str> = { + static ref SYNTAXES: HashMap<&'static [u8], &'static [u8]> = { let mut m = HashMap::new(); - m.insert("re", "relre:"); - m.insert("regexp", "relre:"); - m.insert("glob", "relglob:"); - m.insert("rootglob", "rootglob:"); - m.insert("include", "include"); - m.insert("subinclude", "subinclude"); + m.insert(b"re".as_ref(), b"relre:".as_ref()); + m.insert(b"regexp".as_ref(), b"relre:".as_ref()); + m.insert(b"glob".as_ref(), b"relglob:".as_ref()); + m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref()); + m.insert(b"include".as_ref(), b"include".as_ref()); + m.insert(b"subinclude".as_ref(), b"subinclude".as_ref()); m }; } -pub type PatternTuple = (String, LineNumber, String); +pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>); type WarningTuple = (String, String); pub fn parse_pattern_file_contents( - lines: &str, - file_path: &str, + lines: &[u8], + file_path: &[u8], warn: bool, ) -> (Vec<PatternTuple>, Vec<WarningTuple>) { let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); let mut inputs: Vec<PatternTuple> = vec![]; let mut warnings: Vec<WarningTuple> = vec![]; - let mut current_syntax = "relre:"; + let mut current_syntax = b"relre:".as_ref(); - let mut line = String::new(); - for (line_number, line_str) in lines.split('\n').enumerate() { + for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() { let line_number = line_number + 1; - line.replace_range(.., line_str); - if line.contains('#') { - if let Some(cap) = comment_regex.captures(line.clone().as_ref()) { - line = line[..cap.get(1).unwrap().end()].to_string() + if line.contains(&('#' as u8)) { + if let Some(cap) = comment_regex.captures(line) { + line = &line[..cap.get(1).unwrap().end()] } - line = line.replace(r"\#", "#"); + let mut line = line.to_owned(); + replace_slice(&mut line, br"\#", b"#"); } let mut line = line.trim_end(); @@ -252,25 +253,28 @@ continue; } - if line.starts_with("syntax:") { - let syntax = line["syntax:".len()..].trim(); + if line.starts_with(b"syntax:") { + let syntax = line[b"syntax:".len()..].trim(); if let Some(rel_syntax) = SYNTAXES.get(syntax) { current_syntax = rel_syntax; } else if warn { - warnings.push((file_path.to_string(), syntax.to_string())); + warnings.push(( + String::from_utf8_lossy(file_path).to_string(), + String::from_utf8_lossy(syntax).to_string(), + )); } continue; } - let mut line_syntax: &str = ¤t_syntax; + let mut line_syntax: &[u8] = ¤t_syntax; for (s, rels) in SYNTAXES.iter() { if line.starts_with(rels) { line_syntax = rels; line = &line[rels.len()..]; break; - } else if line.starts_with(&format!("{}:", s)) { + } else if line.starts_with(&[s, b":".as_ref()].concat()) { line_syntax = rels; line = &line[s.len() + 1..]; break; @@ -278,24 +282,24 @@ } inputs.push(( - format!("{}{}", line_syntax, line), + [line_syntax, line].concat(), line_number, - line.to_string(), + line.to_owned(), )); } (inputs, warnings) } pub fn read_pattern_file( - file_path: String, + file_path: &[u8], warn: bool, ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { - let mut f = File::open(&file_path)?; - let mut contents = String::new(); + let mut f = File::open(get_path_from_bytes(file_path))?; + let mut contents = Vec::new(); - f.read_to_string(&mut contents)?; + f.read_to_end(&mut contents)?; - Ok(parse_pattern_file_contents(&contents, &file_path, warn)) + Ok(parse_pattern_file_contents(&contents, file_path, warn)) } #[cfg(test)] @@ -328,18 +332,23 @@ #[test] fn test_parse_pattern_file_contents() { - let lines = "syntax: glob\n*.elc"; + let lines = b"syntax: glob\n*.elc"; assert_eq!( - vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())], - parse_pattern_file_contents(lines, "file_path", false).0, + vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())], + parse_pattern_file_contents(lines, b"file_path", false).0, ); - let lines = "syntax: include\nsyntax: glob"; + let lines = b"syntax: include\nsyntax: glob"; assert_eq!( - parse_pattern_file_contents(lines, "file_path", false).0, + parse_pattern_file_contents(lines, b"file_path", false).0, vec![] ); + let lines = b"glob:**.o"; + assert_eq!( + parse_pattern_file_contents(lines, b"file_path", false).0, + vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())] + ); } }
--- a/rust/hg-core/src/lib.rs Sat Jun 01 01:24:49 2019 +0200 +++ b/rust/hg-core/src/lib.rs Thu Jun 06 15:30:56 2019 +0200 @@ -19,6 +19,7 @@ CopyVec, CopyVecEntry, DirstateEntry, DirstateParents, DirstateVec, }; mod filepatterns; +mod utils; pub use filepatterns::{ build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/utils/files.rs Thu Jun 06 15:30:56 2019 +0200 @@ -0,0 +1,17 @@ +use std::path::Path; + +pub fn get_path_from_bytes(bytes: &[u8]) -> &Path { + let os_str; + #[cfg(unix)] + { + use std::os::unix::ffi::OsStrExt; + os_str = std::ffi::OsStr::from_bytes(bytes); + } + #[cfg(windows)] + { + use std::os::windows::ffi::OsStrExt; + os_str = std::ffi::OsString::from_wide(bytes); + } + + Path::new(os_str) +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/utils/mod.rs Thu Jun 06 15:30:56 2019 +0200 @@ -0,0 +1,45 @@ +pub mod files; + +pub fn replace_slice<T>(buf: &mut [T], from: &[T], to: &[T]) +where + T: Clone + PartialEq, +{ + if buf.len() < from.len() || from.len() != to.len() { + return; + } + for i in 0..=buf.len() - from.len() { + if buf[i..].starts_with(from) { + buf[i..(i + from.len())].clone_from_slice(to); + } + } +} + +pub trait SliceExt { + fn trim(&self) -> &Self; + fn trim_end(&self) -> &Self; +} + +fn is_not_whitespace(c: &u8) -> bool { + !(*c as char).is_whitespace() +} + +impl SliceExt for [u8] { + fn trim(&self) -> &[u8] { + if let Some(first) = self.iter().position(is_not_whitespace) { + if let Some(last) = self.iter().rposition(is_not_whitespace) { + &self[first..last + 1] + } else { + unreachable!(); + } + } else { + &[] + } + } + fn trim_end(&self) -> &[u8] { + if let Some(last) = self.iter().rposition(is_not_whitespace) { + &self[..last + 1] + } else { + &[] + } + } +}
--- a/rust/hg-cpython/src/filepatterns.rs Sat Jun 01 01:24:49 2019 +0200 +++ b/rust/hg-cpython/src/filepatterns.rs Thu Jun 06 15:30:56 2019 +0200 @@ -11,14 +11,10 @@ //! and can be used as replacement for the the pure `filepatterns` Python module. //! use cpython::{ - exc, PyDict, PyErr, PyModule, PyResult, PyString, PyTuple, Python, - ToPyObject, + PyBytes, PyDict, PyModule, PyObject, PyResult, PyTuple, Python, ToPyObject, }; -use hg::{build_single_regex, read_pattern_file, PatternTuple}; -use exceptions::{ - PatternError, - PatternFileError, -}; +use exceptions::{PatternError, PatternFileError}; +use hg::{build_single_regex, read_pattern_file, LineNumber, PatternTuple}; /// Rust does not like functions with different return signatures. /// The 3-tuple version is always returned by the hg-core function, @@ -32,17 +28,22 @@ /// for more details. fn read_pattern_file_wrapper( py: Python, - file_path: String, + file_path: PyObject, warn: bool, source_info: bool, ) -> PyResult<PyTuple> { - match read_pattern_file(file_path, warn) { + match read_pattern_file(file_path.extract::<PyBytes>(py)?.data(py), warn) { Ok((patterns, warnings)) => { if source_info { - return Ok((patterns, warnings).to_py_object(py)); + let itemgetter = |x: &PatternTuple| { + (PyBytes::new(py, &x.0), x.1, PyBytes::new(py, &x.2)) + }; + let results: Vec<(PyBytes, LineNumber, PyBytes)> = + patterns.iter().map(itemgetter).collect(); + return Ok((results, warnings).to_py_object(py)); } - let itemgetter = |x: &PatternTuple| x.0.to_py_object(py); - let results: Vec<PyString> = + let itemgetter = |x: &PatternTuple| PyBytes::new(py, &x.0); + let results: Vec<PyBytes> = patterns.iter().map(itemgetter).collect(); Ok((results, warnings).to_py_object(py)) } @@ -52,22 +53,16 @@ fn build_single_regex_wrapper( py: Python, - kind: String, - pat: String, - globsuffix: String, -) -> PyResult<PyString> { + kind: PyObject, + pat: PyObject, + globsuffix: PyObject, +) -> PyResult<PyBytes> { match build_single_regex( - kind.as_ref(), - pat.as_bytes(), - globsuffix.as_bytes(), + kind.extract::<PyBytes>(py)?.data(py), + pat.extract::<PyBytes>(py)?.data(py), + globsuffix.extract::<PyBytes>(py)?.data(py), ) { - Ok(regex) => match String::from_utf8(regex) { - Ok(regex) => Ok(regex.to_py_object(py)), - Err(e) => Err(PyErr::new::<exc::UnicodeDecodeError, _>( - py, - e.to_string(), - )), - }, + Ok(regex) => Ok(PyBytes::new(py, ®ex)), Err(e) => Err(PatternError::pynew(py, e)), } } @@ -88,9 +83,9 @@ py_fn!( py, build_single_regex_wrapper( - kind: String, - pat: String, - globsuffix: String + kind: PyObject, + pat: PyObject, + globsuffix: PyObject ) ), )?; @@ -100,13 +95,13 @@ py_fn!( py, read_pattern_file_wrapper( - file_path: String, + file_path: PyObject, warn: bool, source_info: bool ) ), )?; - + m.add(py, "PatternError", py.get_type::<PatternError>())?; let sys = PyModule::import(py, "sys")?; let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?; sys_modules.set_item(py, dotted_name, &m)?;