rust-matchers: use the `regex` crate
Instead of falling back to Python when a code path with "ignore" functionality
is reached and `Re2` is not installed, the default compilation (i.e. without
the `with-re2` feature) will use the `regex` crate for all regular expressions
business.
As with the introduction of `Re2` in a previous series, this yields a big
performance boost compared to the Python + C code in `status`, `diff`, `commit`,
`update`, and maybe others.
For now `Re2` looks to be faster at compiling the DFA (1.5ms vs 5ms for
Netbeans' `.hgignore`) and a bit faster in actual use: (123ms vs 137ms for
the parallel traversal of Netbeans' clean repo). I am in talks with the author
of `regex` to see whether that performance difference is a bug, a "won't fix",
or a tuning issue.
The `regex` crate is already one of our dependencies and using this code does
not require any additional work from the end-user than to use the Rust
extensions.
Differential Revision: https://phab.mercurial-scm.org/D8323
--- a/rust/hg-core/src/filepatterns.rs Sun Mar 15 16:11:58 2020 +0900
+++ b/rust/hg-core/src/filepatterns.rs Tue Mar 24 17:55:59 2020 +0100
@@ -176,9 +176,14 @@
return vec![];
}
match syntax {
- PatternSyntax::Regexp => pattern.to_owned(),
+ // The `regex` crate adds `.*` to the start and end of expressions
+ // if there are no anchors, so add them.
+ PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
PatternSyntax::RelRegexp => {
- if pattern[0] == b'^' {
+ // The `regex` crate accepts `**` while `re2` and Python's `re`
+ // do not. Checking for `*` correctly triggers the same error all
+ // engines.
+ if pattern[0] == b'^' || pattern[0] == b'*' {
return pattern.to_owned();
}
[&b".*"[..], pattern].concat()
@@ -191,14 +196,15 @@
}
PatternSyntax::RootFiles => {
let mut res = if pattern == b"." {
- vec![]
+ vec![b'^']
} else {
// Pattern is a directory name.
- [escape_pattern(pattern).as_slice(), b"/"].concat()
+ [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
};
// Anything after the pattern must be a non-directory.
res.extend(b"[^/]+$");
+ res.push(b'$');
res
}
PatternSyntax::RelGlob => {
@@ -206,11 +212,11 @@
if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
[b".*", rest, GLOB_SUFFIX].concat()
} else {
- [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
+ [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
}
}
PatternSyntax::Glob | PatternSyntax::RootGlob => {
- [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
+ [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
}
PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
}
@@ -282,7 +288,10 @@
if *syntax == PatternSyntax::RootGlob
&& !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
{
- let mut escaped = escape_pattern(&pattern);
+ // The `regex` crate adds `.*` to the start and end of expressions
+ // if there are no anchors, so add the start anchor.
+ let mut escaped = vec![b'^'];
+ escaped.extend(escape_pattern(&pattern));
escaped.extend(GLOB_SUFFIX);
Ok(escaped)
} else {
@@ -619,7 +628,7 @@
Path::new("")
))
.unwrap(),
- br"(?:|.*/)rust/target(?:/|$)".to_vec(),
+ br"(?:.*/)?rust/target(?:/|$)".to_vec(),
);
}
@@ -632,7 +641,7 @@
Path::new("")
))
.unwrap(),
- br"\.(?:/|$)".to_vec(),
+ br"^\.(?:/|$)".to_vec(),
);
assert_eq!(
build_single_regex(&IgnorePattern::new(
@@ -641,7 +650,7 @@
Path::new("")
))
.unwrap(),
- br"whatever(?:/|$)".to_vec(),
+ br"^whatever(?:/|$)".to_vec(),
);
assert_eq!(
build_single_regex(&IgnorePattern::new(
@@ -650,7 +659,7 @@
Path::new("")
))
.unwrap(),
- br"[^/]*\.o(?:/|$)".to_vec(),
+ br"^[^/]*\.o(?:/|$)".to_vec(),
);
}
}
--- a/rust/hg-core/src/matchers.rs Sun Mar 15 16:11:58 2020 +0900
+++ b/rust/hg-core/src/matchers.rs Tue Mar 24 17:55:59 2020 +0100
@@ -331,8 +331,37 @@
}
#[cfg(not(feature = "with-re2"))]
-fn re_matcher(_: &[u8]) -> PatternResult<Box<dyn Fn(&HgPath) -> bool + Sync>> {
- Err(PatternError::Re2NotInstalled)
+/// Returns a function that matches an `HgPath` against the given regex
+/// pattern.
+///
+/// This can fail when the pattern is invalid or not supported by the
+/// underlying engine (the `regex` crate), for instance anything with
+/// back-references.
+fn re_matcher(
+ pattern: &[u8],
+) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> {
+ use std::io::Write;
+
+ let mut escaped_bytes = vec![];
+ for byte in pattern {
+ if *byte > 127 {
+ write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
+ } else {
+ escaped_bytes.push(*byte);
+ }
+ }
+
+ // Avoid the cost of UTF8 checking
+ //
+ // # Safety
+ // This is safe because we escaped all non-ASCII bytes.
+ let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
+ let re = regex::bytes::RegexBuilder::new(&pattern_string)
+ .unicode(false)
+ .build()
+ .map_err(|e| PatternError::UnsupportedSyntax(e.to_string()))?;
+
+ Ok(move |path: &HgPath| re.is_match(path.as_bytes()))
}
/// Returns the regex pattern and a function that matches an `HgPath` against