Mercurial > hg
changeset 44802:e0414fcd35e0
rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex
This optimization yields some very interesting results in `rootglob`-heavy
repositories.
I built a test repository with the following structure:
```
root
/<uuid>/build/empty_file
... repeat for 4000 entries
```
and a `.hgignore` containing the corresponding 4000 `rootglob` entries pointing
to all `build/` folders.
Rust+C `hg status` goes from 350ms down to 110ms.
Differential Revision: https://phab.mercurial-scm.org/D8491
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Wed, 06 May 2020 11:17:27 +0200 |
parents | 373dd22ae60e |
children | de0fb4463a3d |
files | rust/hg-core/src/filepatterns.rs rust/hg-core/src/matchers.rs |
diffstat | 2 files changed, 32 insertions(+), 19 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/filepatterns.rs Wed Apr 15 16:43:05 2020 -0400 +++ b/rust/hg-core/src/filepatterns.rs Wed May 06 11:17:27 2020 +0200 @@ -271,7 +271,7 @@ /// that don't need to be transformed into a regex. pub fn build_single_regex( entry: &IgnorePattern, -) -> Result<Vec<u8>, PatternError> { +) -> Result<Option<Vec<u8>>, PatternError> { let IgnorePattern { pattern, syntax, .. } = entry; @@ -288,16 +288,11 @@ if *syntax == PatternSyntax::RootGlob && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) { - // The `regex` crate adds `.*` to the start and end of expressions - // if there are no anchors, so add the start anchor. - let mut escaped = vec![b'^']; - escaped.extend(escape_pattern(&pattern)); - escaped.extend(GLOB_SUFFIX); - Ok(escaped) + Ok(None) } else { let mut entry = entry.clone(); entry.pattern = pattern; - Ok(_build_single_regex(&entry)) + Ok(Some(_build_single_regex(&entry))) } } @@ -628,7 +623,7 @@ Path::new("") )) .unwrap(), - br"(?:.*/)?rust/target(?:/|$)".to_vec(), + Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()), ); } @@ -641,7 +636,7 @@ Path::new("") )) .unwrap(), - br"^\.(?:/|$)".to_vec(), + None, ); assert_eq!( build_single_regex(&IgnorePattern::new( @@ -650,7 +645,7 @@ Path::new("") )) .unwrap(), - br"^whatever(?:/|$)".to_vec(), + None, ); assert_eq!( build_single_regex(&IgnorePattern::new( @@ -659,7 +654,7 @@ Path::new("") )) .unwrap(), - br"^[^/]*\.o(?:/|$)".to_vec(), + Some(br"^[^/]*\.o(?:/|$)".to_vec()), ); } }
--- a/rust/hg-core/src/matchers.rs Wed Apr 15 16:43:05 2020 -0400 +++ b/rust/hg-core/src/matchers.rs Wed May 06 11:17:27 2020 +0200 @@ -24,6 +24,7 @@ PatternSyntax, }; +use crate::filepatterns::normalize_path_bytes; use std::borrow::ToOwned; use std::collections::HashSet; use std::fmt::{Display, Error, Formatter}; @@ -373,15 +374,32 @@ fn build_regex_match<'a>( ignore_patterns: &'a [&'a IgnorePattern], ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> { - let regexps: Result<Vec<_>, PatternError> = ignore_patterns - .into_iter() - .map(|k| build_single_regex(*k)) - .collect(); - let regexps = regexps?; + let mut regexps = vec![]; + let mut exact_set = HashSet::new(); + + for pattern in ignore_patterns { + if let Some(re) = build_single_regex(pattern)? { + regexps.push(re); + } else { + let exact = normalize_path_bytes(&pattern.pattern); + exact_set.insert(HgPathBuf::from_bytes(&exact)); + } + } + let full_regex = regexps.join(&b'|'); - let matcher = re_matcher(&full_regex)?; - let func = Box::new(move |filename: &HgPath| matcher(filename)); + // An empty pattern would cause the regex engine to incorrectly match the + // (empty) root directory + let func = if !(regexps.is_empty()) { + let matcher = re_matcher(&full_regex)?; + let func = move |filename: &HgPath| { + exact_set.contains(filename) || matcher(filename) + }; + Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync> + } else { + let func = move |filename: &HgPath| exact_set.contains(filename); + Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync> + }; Ok((full_regex, func)) }