rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex
This optimization yields a significant speedup in `rootglob`-heavy
repositories.
I built a test repository with the following structure:
```
root
/<uuid>/build/empty_file
... repeat for 4000 entries
```
and a `.hgignore` containing the corresponding 4000 `rootglob` entries pointing
to all `build/` folders.
Rust+C `hg status` goes from 350ms down to 110ms.
Differential Revision: https://phab.mercurial-scm.org/D8491
--- a/rust/hg-core/src/filepatterns.rs Wed Apr 15 16:43:05 2020 -0400
+++ b/rust/hg-core/src/filepatterns.rs Wed May 06 11:17:27 2020 +0200
@@ -271,7 +271,7 @@
/// that don't need to be transformed into a regex.
pub fn build_single_regex(
entry: &IgnorePattern,
-) -> Result<Vec<u8>, PatternError> {
+) -> Result<Option<Vec<u8>>, PatternError> {
let IgnorePattern {
pattern, syntax, ..
} = entry;
@@ -288,16 +288,11 @@
if *syntax == PatternSyntax::RootGlob
&& !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
{
- // The `regex` crate adds `.*` to the start and end of expressions
- // if there are no anchors, so add the start anchor.
- let mut escaped = vec![b'^'];
- escaped.extend(escape_pattern(&pattern));
- escaped.extend(GLOB_SUFFIX);
- Ok(escaped)
+ Ok(None)
} else {
let mut entry = entry.clone();
entry.pattern = pattern;
- Ok(_build_single_regex(&entry))
+ Ok(Some(_build_single_regex(&entry)))
}
}
@@ -628,7 +623,7 @@
Path::new("")
))
.unwrap(),
- br"(?:.*/)?rust/target(?:/|$)".to_vec(),
+ Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
);
}
@@ -641,7 +636,7 @@
Path::new("")
))
.unwrap(),
- br"^\.(?:/|$)".to_vec(),
+ None,
);
assert_eq!(
build_single_regex(&IgnorePattern::new(
@@ -650,7 +645,7 @@
Path::new("")
))
.unwrap(),
- br"^whatever(?:/|$)".to_vec(),
+ None,
);
assert_eq!(
build_single_regex(&IgnorePattern::new(
@@ -659,7 +654,7 @@
Path::new("")
))
.unwrap(),
- br"^[^/]*\.o(?:/|$)".to_vec(),
+ Some(br"^[^/]*\.o(?:/|$)".to_vec()),
);
}
}
--- a/rust/hg-core/src/matchers.rs Wed Apr 15 16:43:05 2020 -0400
+++ b/rust/hg-core/src/matchers.rs Wed May 06 11:17:27 2020 +0200
@@ -24,6 +24,7 @@
PatternSyntax,
};
+use crate::filepatterns::normalize_path_bytes;
use std::borrow::ToOwned;
use std::collections::HashSet;
use std::fmt::{Display, Error, Formatter};
@@ -373,15 +374,32 @@
fn build_regex_match<'a>(
ignore_patterns: &'a [&'a IgnorePattern],
) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> {
- let regexps: Result<Vec<_>, PatternError> = ignore_patterns
- .into_iter()
- .map(|k| build_single_regex(*k))
- .collect();
- let regexps = regexps?;
+ let mut regexps = vec![];
+ let mut exact_set = HashSet::new();
+
+ for pattern in ignore_patterns {
+ if let Some(re) = build_single_regex(pattern)? {
+ regexps.push(re);
+ } else {
+ let exact = normalize_path_bytes(&pattern.pattern);
+ exact_set.insert(HgPathBuf::from_bytes(&exact));
+ }
+ }
+
let full_regex = regexps.join(&b'|');
- let matcher = re_matcher(&full_regex)?;
- let func = Box::new(move |filename: &HgPath| matcher(filename));
+ // An empty pattern would cause the regex engine to incorrectly match the
+ // (empty) root directory
+ let func = if !(regexps.is_empty()) {
+ let matcher = re_matcher(&full_regex)?;
+ let func = move |filename: &HgPath| {
+ exact_set.contains(filename) || matcher(filename)
+ };
+ Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+ } else {
+ let func = move |filename: &HgPath| exact_set.contains(filename);
+ Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+ };
Ok((full_regex, func))
}