Mercurial > hg
changeset 42327:e8f3740cc067
rust-filepatterns: add a Rust implementation of pattern-related utils
This change introduces Rust implementations of two functions related to
pattern handling, all located in `match.py`:
- `_regex`
- `readpatternfile`
These utils are useful in the long-term effort to improve `hg status`'s
performance using Rust. Experimental work done by Valentin Gatien-Baron
shows very promising improvements, but is too different from the current
Mercurial core code structure to be used "as-is".
This is the first - albeit very small - step towards the code revamp
needed down the line.
Two dependencies were added: `regex` and `lazy_static`. Both of them
will be useful for a majority of the Rust code that will be written,
are well known and maintained either by the Rust core team, or by
very frequent contributors.
Differential Revision: https://phab.mercurial-scm.org/D6271
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Wed, 24 Apr 2019 11:34:09 +0200 |
parents | 5d4ec64a6fcb |
children | 94f3a73b6672 |
files | rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/src/ancestors.rs rust/hg-core/src/filepatterns.rs rust/hg-core/src/lib.rs |
diffstat | 5 files changed, 381 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/Cargo.lock Wed May 15 22:11:41 2019 -0700 +++ b/rust/Cargo.lock Wed Apr 24 11:34:09 2019 +0200 @@ -50,9 +50,11 @@ version = "0.1.0" dependencies = [ "byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)", "rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -76,7 +78,7 @@ [[package]] name = "lazy_static" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] @@ -262,7 +264,7 @@ version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ - "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -302,7 +304,7 @@ "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f" "checksum cpython 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b489034e723e7f5109fecd19b719e664f89ef925be785885252469e9822fa940" "checksum fuchsia-cprng 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "81f7f8eb465745ea9b02e2704612a9946a59fa40572086c6fd49d6ddcf30bf31" -"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14" "checksum libc 0.2.45 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2857ec59fadc0773853c664d2d18e7198e83883e7060b63c924cb077bd5c74" "checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39" "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"
--- a/rust/hg-core/Cargo.toml Wed May 15 22:11:41 2019 -0700 +++ b/rust/hg-core/Cargo.toml Wed Apr 24 11:34:09 2019 +0200 @@ -13,4 +13,6 @@ [dependencies] memchr = "2.2.0" -byteorder = "1.3.1" \ No newline at end of file +byteorder = "1.3.1" +lazy_static = "1.3.0" +regex = "^1.1" \ No newline at end of file
--- a/rust/hg-core/src/ancestors.rs Wed May 15 22:11:41 2019 -0700 +++ b/rust/hg-core/src/ancestors.rs Wed Apr 24 11:34:09 2019 +0200 @@ -8,9 +8,9 @@ //! Rust versions of generic DAG ancestors algorithms for Mercurial use super::{Graph, GraphError, Revision, NULL_REVISION}; +use crate::dagops; use std::cmp::max; use std::collections::{BinaryHeap, HashSet}; -use crate::dagops; /// Iterator over the ancestors of a given list of revisions /// This is a generic type, defined and implemented for any Graph, so that
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/filepatterns.rs Wed Apr 24 11:34:09 2019 +0200 @@ -0,0 +1,345 @@ +use crate::{LineNumber, PatternError, PatternFileError}; +use regex::Regex; +use std::collections::HashMap; +use std::fs::File; +use std::io::Read; +use std::vec::Vec; + +lazy_static! { + static ref reescape: Vec<Vec<u8>> = { + let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect(); + let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c"; + for byte in to_escape { + v[*byte as usize].insert(0, b'\\'); + } + v + }; +} + +/// These are matched in order +const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = + &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")]; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum PatternSyntax { + Regexp, + /// Glob that matches at the front of the path + RootGlob, + /// Glob that matches at any suffix of the path (still anchored at slashes) + Glob, + Path, + RelPath, + RelGlob, + RelRegexp, + RootFiles, +} + +/// Transforms a glob pattern into a regex +fn glob_to_re(pat: &[u8]) -> Vec<u8> { + let mut input = pat; + let mut res: Vec<u8> = vec![]; + let mut group_depth = 0; + + while let Some((c, rest)) = input.split_first() { + input = rest; + + match c { + b'*' => { + for (source, repl) in GLOB_REPLACEMENTS { + if input.starts_with(source) { + input = &input[source.len()..]; + res.extend(*repl); + break; + } + } + } + b'?' => res.extend(b"."), + b'[' => { + match input.iter().skip(1).position(|b| *b == b']') { + None => res.extend(b"\\["), + Some(end) => { + // Account for the one we skipped + let end = end + 1; + + res.extend(b"["); + + for (i, b) in input[..end].iter().enumerate() { + if *b == b'!' && i == 0 { + res.extend(b"^") + } else if *b == b'^' && i == 0 { + res.extend(b"\\^") + } else if *b == b'\\' { + res.extend(b"\\\\") + } else { + res.push(*b) + } + } + res.extend(b"]"); + input = &input[end + 1..]; + } + } + } + b'{' => { + group_depth += 1; + res.extend(b"(?:") + } + b'}' if group_depth > 0 => { + group_depth -= 1; + res.extend(b")"); + } + b',' if group_depth > 0 => res.extend(b"|"), + b'\\' => { + let c = { + if let Some((c, rest)) = input.split_first() { + input = rest; + c + } else { + c + } + }; + res.extend(&reescape[*c as usize]) + } + _ => res.extend(&reescape[*c as usize]), + } + } + res +} + +fn escape_pattern(pattern: &[u8]) -> Vec<u8> { + pattern + .iter() + .flat_map(|c| reescape[*c as usize].clone()) + .collect() +} + +fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> { + match kind { + b"re" => Ok(PatternSyntax::Regexp), + b"path" => Ok(PatternSyntax::Path), + b"relpath" => Ok(PatternSyntax::RelPath), + b"rootfilesin" => Ok(PatternSyntax::RootFiles), + b"relglob" => Ok(PatternSyntax::RelGlob), + b"relre" => Ok(PatternSyntax::RelRegexp), + b"glob" => Ok(PatternSyntax::Glob), + b"rootglob" => Ok(PatternSyntax::RootGlob), + _ => Err(PatternError::UnsupportedSyntax( + String::from_utf8_lossy(kind).to_string(), + )), + } +} + +/// Builds the regex that corresponds to the given pattern. +/// If within a `syntax: regexp` context, returns the pattern, +/// otherwise, returns the corresponding regex. +fn _build_single_regex( + syntax: PatternSyntax, + pattern: &[u8], + globsuffix: &[u8], +) -> Vec<u8> { + if pattern.is_empty() { + return vec![]; + } + match syntax { + PatternSyntax::Regexp => pattern.to_owned(), + PatternSyntax::RelRegexp => { + if pattern[0] == b'^' { + return pattern.to_owned(); + } + let mut res = b".*".to_vec(); + res.extend(pattern); + res + } + PatternSyntax::Path | PatternSyntax::RelPath => { + if pattern == b"." { + return vec![]; + } + let mut pattern = escape_pattern(pattern); + pattern.extend(b"(?:/|$)"); + pattern + } + PatternSyntax::RootFiles => { + let mut res = if pattern == b"." { + vec![] + } else { + // Pattern is a directory name. + let mut as_vec: Vec<u8> = escape_pattern(pattern); + as_vec.push(b'/'); + as_vec + }; + + // Anything after the pattern must be a non-directory. + res.extend(b"[^/]+$"); + res + } + PatternSyntax::Glob + | PatternSyntax::RelGlob + | PatternSyntax::RootGlob => { + let mut res: Vec<u8> = vec![]; + if syntax == PatternSyntax::RelGlob { + res.extend(b"(?:|.*/)"); + } + + res.extend(glob_to_re(pattern)); + res.extend(globsuffix.iter()); + res + } + } +} + +const GLOB_SPECIAL_CHARACTERS: [u8; 7] = + [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; + +/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs +/// that don't need to be transformed into a regex. +pub fn build_single_regex( + kind: &str, + pat: &[u8], + globsuffix: &[u8], +) -> Result<Vec<u8>, PatternError> { + let enum_kind = parse_pattern_syntax(kind.as_bytes())?; + if enum_kind == PatternSyntax::RootGlob + && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) + { + Ok(pat.to_vec()) + } else { + Ok(_build_single_regex(enum_kind, pat, globsuffix)) + } +} + +lazy_static! { + static ref SYNTAXES: HashMap<&'static str, &'static str> = { + let mut m = HashMap::new(); + + m.insert("re", "relre:"); + m.insert("regexp", "relre:"); + m.insert("glob", "relglob:"); + m.insert("rootglob", "rootglob:"); + m.insert("include", "include"); + m.insert("subinclude", "subinclude"); + m + }; +} + +pub type PatternTuple = (String, LineNumber, String); +type WarningTuple = (String, String); + +pub fn parse_pattern_file_contents( + lines: &str, + file_path: &str, + warn: bool, +) -> (Vec<PatternTuple>, Vec<WarningTuple>) { + let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); + let mut inputs: Vec<PatternTuple> = vec![]; + let mut warnings: Vec<WarningTuple> = vec![]; + + let mut current_syntax = "relre:"; + + let mut line = String::new(); + for (line_number, line_str) in lines.split('\n').enumerate() { + let line_number = line_number + 1; + line.replace_range(.., line_str); + + if line.contains('#') { + if let Some(cap) = comment_regex.captures(line.clone().as_ref()) { + line = line[..cap.get(1).unwrap().end()].to_string() + } + line = line.replace(r"\#", "#"); + } + + let mut line = line.trim_end(); + + if line.is_empty() { + continue; + } + + if line.starts_with("syntax:") { + let syntax = line["syntax:".len()..].trim(); + + if let Some(rel_syntax) = SYNTAXES.get(syntax) { + current_syntax = rel_syntax; + } else if warn { + warnings.push((file_path.to_string(), syntax.to_string())); + } + continue; + } + + let mut line_syntax: &str = ¤t_syntax; + + for (s, rels) in SYNTAXES.iter() { + if line.starts_with(rels) { + line_syntax = rels; + line = &line[rels.len()..]; + break; + } else if line.starts_with(&format!("{}:", s)) { + line_syntax = rels; + line = &line[s.len() + 1..]; + break; + } + } + + inputs.push(( + format!("{}{}", line_syntax, line), + line_number, + line.to_string(), + )); + } + (inputs, warnings) +} + +pub fn read_pattern_file( + file_path: String, + warn: bool, +) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { + let mut f = File::open(&file_path)?; + let mut contents = String::new(); + + f.read_to_string(&mut contents)?; + + Ok(parse_pattern_file_contents(&contents, &file_path, warn)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn escape_pattern_test() { + let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#; + assert_eq!(escape_pattern(untouched), untouched.to_vec()); + // All escape codes + assert_eq!( + escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#), + br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"# + .to_vec() + ); + } + + #[test] + fn glob_test() { + assert_eq!(glob_to_re(br#"?"#), br#"."#); + assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#); + assert_eq!(glob_to_re(br#"**"#), br#".*"#); + assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#); + assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#); + assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#); + assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#); + assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#); + } + + #[test] + fn test_parse_pattern_file_contents() { + let lines = "syntax: glob\n*.elc"; + + assert_eq!( + vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())], + parse_pattern_file_contents(lines, "file_path", false).0, + ); + + let lines = "syntax: include\nsyntax: glob"; + + assert_eq!( + parse_pattern_file_contents(lines, "file_path", false).0, + vec![] + ); + } +}
--- a/rust/hg-core/src/lib.rs Wed May 15 22:11:41 2019 -0700 +++ b/rust/hg-core/src/lib.rs Wed Apr 24 11:34:09 2019 +0200 @@ -4,6 +4,9 @@ // GNU General Public License version 2 or any later version. extern crate byteorder; extern crate memchr; +#[macro_use] +extern crate lazy_static; +extern crate regex; mod ancestors; pub mod dagops; @@ -15,6 +18,11 @@ pack_dirstate, parse_dirstate, CopyVec, CopyVecEntry, DirstateEntry, DirstateParents, DirstateVec, }; +mod filepatterns; + +pub use filepatterns::{ + build_single_regex, read_pattern_file, PatternSyntax, PatternTuple, +}; /// Mercurial revision numbers /// @@ -42,6 +50,8 @@ fn parents(&self, Revision) -> Result<[Revision; 2], GraphError>; } +pub type LineNumber = usize; + #[derive(Clone, Debug, PartialEq)] pub enum GraphError { ParentOutOfRange(Revision), @@ -73,3 +83,20 @@ DirstateParseError::CorruptedEntry(e.to_string()) } } + +#[derive(Debug)] +pub enum PatternError { + UnsupportedSyntax(String), +} + +#[derive(Debug)] +pub enum PatternFileError { + IO(std::io::Error), + Pattern(PatternError, LineNumber), +} + +impl From<std::io::Error> for PatternFileError { + fn from(e: std::io::Error) -> Self { + PatternFileError::IO(e) + } +}