rust-filepatterns: add a Rust implementation of pattern-related utils
This change introduces Rust implementations of two functions related to
pattern handling, all located in `match.py`:
- `_regex`
- `readpatternfile`
These utils are useful in the long-term effort to improve `hg status`'s
performance using Rust. Experimental work done by Valentin Gatien-Baron
shows very promising improvements, but is too different from the current
Mercurial core code structure to be used "as-is".
This is the first - albeit very small - step towards the code revamp
needed down the line.
Two dependencies were added: `regex` and `lazy_static`. Both of them
will be useful for a majority of the Rust code that will be written,
are well known and maintained either by the Rust core team, or by
very frequent contributors.
Differential Revision: https://phab.mercurial-scm.org/D6271
--- a/rust/Cargo.lock Wed May 15 22:11:41 2019 -0700
+++ b/rust/Cargo.lock Wed Apr 24 11:34:09 2019 +0200
@@ -50,9 +50,11 @@
version = "0.1.0"
dependencies = [
"byteorder 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
"memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
"rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
"rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -76,7 +78,7 @@
[[package]]
name = "lazy_static"
-version = "1.2.0"
+version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
@@ -262,7 +264,7 @@
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
- "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
]
[[package]]
@@ -302,7 +304,7 @@
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
"checksum cpython 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b489034e723e7f5109fecd19b719e664f89ef925be785885252469e9822fa940"
"checksum fuchsia-cprng 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "81f7f8eb465745ea9b02e2704612a9946a59fa40572086c6fd49d6ddcf30bf31"
-"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
+"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
"checksum libc 0.2.45 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2857ec59fadc0773853c664d2d18e7198e83883e7060b63c924cb077bd5c74"
"checksum memchr 2.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2efc7bc57c883d4a4d6e3246905283d8dae951bb3bd32f49d6ef297f546e1c39"
"checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"
--- a/rust/hg-core/Cargo.toml Wed May 15 22:11:41 2019 -0700
+++ b/rust/hg-core/Cargo.toml Wed Apr 24 11:34:09 2019 +0200
@@ -13,4 +13,6 @@
[dependencies]
memchr = "2.2.0"
-byteorder = "1.3.1"
\ No newline at end of file
+byteorder = "1.3.1"
+lazy_static = "1.3.0"
+regex = "^1.1"
\ No newline at end of file
--- a/rust/hg-core/src/ancestors.rs Wed May 15 22:11:41 2019 -0700
+++ b/rust/hg-core/src/ancestors.rs Wed Apr 24 11:34:09 2019 +0200
@@ -8,9 +8,9 @@
//! Rust versions of generic DAG ancestors algorithms for Mercurial
use super::{Graph, GraphError, Revision, NULL_REVISION};
+use crate::dagops;
use std::cmp::max;
use std::collections::{BinaryHeap, HashSet};
-use crate::dagops;
/// Iterator over the ancestors of a given list of revisions
/// This is a generic type, defined and implemented for any Graph, so that
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/filepatterns.rs Wed Apr 24 11:34:09 2019 +0200
@@ -0,0 +1,345 @@
+use crate::{LineNumber, PatternError, PatternFileError};
+use regex::Regex;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::vec::Vec;
+
+lazy_static! {
+ static ref reescape: Vec<Vec<u8>> = {
+ let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
+ let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
+ for byte in to_escape {
+ v[*byte as usize].insert(0, b'\\');
+ }
+ v
+ };
+}
+
+/// These are matched in order
+const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
+ &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum PatternSyntax {
+ Regexp,
+ /// Glob that matches at the front of the path
+ RootGlob,
+ /// Glob that matches at any suffix of the path (still anchored at slashes)
+ Glob,
+ Path,
+ RelPath,
+ RelGlob,
+ RelRegexp,
+ RootFiles,
+}
+
+/// Transforms a glob pattern into a regex
+fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+ let mut input = pat;
+ let mut res: Vec<u8> = vec![];
+ let mut group_depth = 0;
+
+ while let Some((c, rest)) = input.split_first() {
+ input = rest;
+
+ match c {
+ b'*' => {
+ for (source, repl) in GLOB_REPLACEMENTS {
+ if input.starts_with(source) {
+ input = &input[source.len()..];
+ res.extend(*repl);
+ break;
+ }
+ }
+ }
+ b'?' => res.extend(b"."),
+ b'[' => {
+ match input.iter().skip(1).position(|b| *b == b']') {
+ None => res.extend(b"\\["),
+ Some(end) => {
+ // Account for the one we skipped
+ let end = end + 1;
+
+ res.extend(b"[");
+
+ for (i, b) in input[..end].iter().enumerate() {
+ if *b == b'!' && i == 0 {
+ res.extend(b"^")
+ } else if *b == b'^' && i == 0 {
+ res.extend(b"\\^")
+ } else if *b == b'\\' {
+ res.extend(b"\\\\")
+ } else {
+ res.push(*b)
+ }
+ }
+ res.extend(b"]");
+ input = &input[end + 1..];
+ }
+ }
+ }
+ b'{' => {
+ group_depth += 1;
+ res.extend(b"(?:")
+ }
+ b'}' if group_depth > 0 => {
+ group_depth -= 1;
+ res.extend(b")");
+ }
+ b',' if group_depth > 0 => res.extend(b"|"),
+ b'\\' => {
+ let c = {
+ if let Some((c, rest)) = input.split_first() {
+ input = rest;
+ c
+ } else {
+ c
+ }
+ };
+ res.extend(&reescape[*c as usize])
+ }
+ _ => res.extend(&reescape[*c as usize]),
+ }
+ }
+ res
+}
+
+fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+ pattern
+ .iter()
+ .flat_map(|c| reescape[*c as usize].clone())
+ .collect()
+}
+
+fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+ match kind {
+ b"re" => Ok(PatternSyntax::Regexp),
+ b"path" => Ok(PatternSyntax::Path),
+ b"relpath" => Ok(PatternSyntax::RelPath),
+ b"rootfilesin" => Ok(PatternSyntax::RootFiles),
+ b"relglob" => Ok(PatternSyntax::RelGlob),
+ b"relre" => Ok(PatternSyntax::RelRegexp),
+ b"glob" => Ok(PatternSyntax::Glob),
+ b"rootglob" => Ok(PatternSyntax::RootGlob),
+ _ => Err(PatternError::UnsupportedSyntax(
+ String::from_utf8_lossy(kind).to_string(),
+ )),
+ }
+}
+
+/// Builds the regex that corresponds to the given pattern.
+/// If within a `syntax: regexp` context, returns the pattern,
+/// otherwise, returns the corresponding regex.
+fn _build_single_regex(
+ syntax: PatternSyntax,
+ pattern: &[u8],
+ globsuffix: &[u8],
+) -> Vec<u8> {
+ if pattern.is_empty() {
+ return vec![];
+ }
+ match syntax {
+ PatternSyntax::Regexp => pattern.to_owned(),
+ PatternSyntax::RelRegexp => {
+ if pattern[0] == b'^' {
+ return pattern.to_owned();
+ }
+ let mut res = b".*".to_vec();
+ res.extend(pattern);
+ res
+ }
+ PatternSyntax::Path | PatternSyntax::RelPath => {
+ if pattern == b"." {
+ return vec![];
+ }
+ let mut pattern = escape_pattern(pattern);
+ pattern.extend(b"(?:/|$)");
+ pattern
+ }
+ PatternSyntax::RootFiles => {
+ let mut res = if pattern == b"." {
+ vec![]
+ } else {
+ // Pattern is a directory name.
+ let mut as_vec: Vec<u8> = escape_pattern(pattern);
+ as_vec.push(b'/');
+ as_vec
+ };
+
+ // Anything after the pattern must be a non-directory.
+ res.extend(b"[^/]+$");
+ res
+ }
+ PatternSyntax::Glob
+ | PatternSyntax::RelGlob
+ | PatternSyntax::RootGlob => {
+ let mut res: Vec<u8> = vec![];
+ if syntax == PatternSyntax::RelGlob {
+ res.extend(b"(?:|.*/)");
+ }
+
+ res.extend(glob_to_re(pattern));
+ res.extend(globsuffix.iter());
+ res
+ }
+ }
+}
+
+const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
+ [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
+
+/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
+/// that don't need to be transformed into a regex.
+pub fn build_single_regex(
+ kind: &str,
+ pat: &[u8],
+ globsuffix: &[u8],
+) -> Result<Vec<u8>, PatternError> {
+ let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+ if enum_kind == PatternSyntax::RootGlob
+ && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+ {
+ Ok(pat.to_vec())
+ } else {
+ Ok(_build_single_regex(enum_kind, pat, globsuffix))
+ }
+}
+
+lazy_static! {
+ static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+ let mut m = HashMap::new();
+
+ m.insert("re", "relre:");
+ m.insert("regexp", "relre:");
+ m.insert("glob", "relglob:");
+ m.insert("rootglob", "rootglob:");
+ m.insert("include", "include");
+ m.insert("subinclude", "subinclude");
+ m
+ };
+}
+
+pub type PatternTuple = (String, LineNumber, String);
+type WarningTuple = (String, String);
+
+pub fn parse_pattern_file_contents(
+ lines: &str,
+ file_path: &str,
+ warn: bool,
+) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+ let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
+ let mut inputs: Vec<PatternTuple> = vec![];
+ let mut warnings: Vec<WarningTuple> = vec![];
+
+ let mut current_syntax = "relre:";
+
+ let mut line = String::new();
+ for (line_number, line_str) in lines.split('\n').enumerate() {
+ let line_number = line_number + 1;
+ line.replace_range(.., line_str);
+
+ if line.contains('#') {
+ if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
+ line = line[..cap.get(1).unwrap().end()].to_string()
+ }
+ line = line.replace(r"\#", "#");
+ }
+
+ let mut line = line.trim_end();
+
+ if line.is_empty() {
+ continue;
+ }
+
+ if line.starts_with("syntax:") {
+ let syntax = line["syntax:".len()..].trim();
+
+ if let Some(rel_syntax) = SYNTAXES.get(syntax) {
+ current_syntax = rel_syntax;
+ } else if warn {
+ warnings.push((file_path.to_string(), syntax.to_string()));
+ }
+ continue;
+ }
+
+ let mut line_syntax: &str = ¤t_syntax;
+
+ for (s, rels) in SYNTAXES.iter() {
+ if line.starts_with(rels) {
+ line_syntax = rels;
+ line = &line[rels.len()..];
+ break;
+ } else if line.starts_with(&format!("{}:", s)) {
+ line_syntax = rels;
+ line = &line[s.len() + 1..];
+ break;
+ }
+ }
+
+ inputs.push((
+ format!("{}{}", line_syntax, line),
+ line_number,
+ line.to_string(),
+ ));
+ }
+ (inputs, warnings)
+}
+
+pub fn read_pattern_file(
+ file_path: String,
+ warn: bool,
+) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
+ let mut f = File::open(&file_path)?;
+ let mut contents = String::new();
+
+ f.read_to_string(&mut contents)?;
+
+ Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn escape_pattern_test() {
+ let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+ assert_eq!(escape_pattern(untouched), untouched.to_vec());
+ // All escape codes
+ assert_eq!(
+ escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
+ br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
+ .to_vec()
+ );
+ }
+
+ #[test]
+ fn glob_test() {
+ assert_eq!(glob_to_re(br#"?"#), br#"."#);
+ assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
+ assert_eq!(glob_to_re(br#"**"#), br#".*"#);
+ assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
+ assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
+ assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
+ assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
+ assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
+ }
+
+ #[test]
+ fn test_parse_pattern_file_contents() {
+ let lines = "syntax: glob\n*.elc";
+
+ assert_eq!(
+ vec![("relglob:*.elc".to_string(), 2, "*.elc".to_string())],
+ parse_pattern_file_contents(lines, "file_path", false).0,
+ );
+
+ let lines = "syntax: include\nsyntax: glob";
+
+ assert_eq!(
+ parse_pattern_file_contents(lines, "file_path", false).0,
+ vec![]
+ );
+ }
+}
--- a/rust/hg-core/src/lib.rs Wed May 15 22:11:41 2019 -0700
+++ b/rust/hg-core/src/lib.rs Wed Apr 24 11:34:09 2019 +0200
@@ -4,6 +4,9 @@
// GNU General Public License version 2 or any later version.
extern crate byteorder;
extern crate memchr;
+#[macro_use]
+extern crate lazy_static;
+extern crate regex;
mod ancestors;
pub mod dagops;
@@ -15,6 +18,11 @@
pack_dirstate, parse_dirstate, CopyVec, CopyVecEntry, DirstateEntry,
DirstateParents, DirstateVec,
};
+mod filepatterns;
+
+pub use filepatterns::{
+ build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
+};
/// Mercurial revision numbers
///
@@ -42,6 +50,8 @@
fn parents(&self, Revision) -> Result<[Revision; 2], GraphError>;
}
+pub type LineNumber = usize;
+
#[derive(Clone, Debug, PartialEq)]
pub enum GraphError {
ParentOutOfRange(Revision),
@@ -73,3 +83,20 @@
DirstateParseError::CorruptedEntry(e.to_string())
}
}
+
+#[derive(Debug)]
+pub enum PatternError {
+ UnsupportedSyntax(String),
+}
+
+#[derive(Debug)]
+pub enum PatternFileError {
+ IO(std::io::Error),
+ Pattern(PatternError, LineNumber),
+}
+
+impl From<std::io::Error> for PatternFileError {
+ fn from(e: std::io::Error) -> Self {
+ PatternFileError::IO(e)
+ }
+}