# HG changeset patch # User Raphaël Gomès # Date 1588756647 -7200 # Node ID e0414fcd35e0fda8757c0fd0ab601be32c21ff77 # Parent 373dd22ae60ee45bf9f4d4652b3f59aabd1f7bf6 rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex This optimization yields some very interesting results in `rootglob`-heavy repositories. I build a test repository of the following structure: ``` root //build/empty_file ... repeat for 4000 entries ``` and a `.hgignore` containing the corresponding 4000 `rootglob` entries pointing to all `build/` folders. Rust+c `hg status` goes from 350ms down to 110ms. Differential Revision: https://phab.mercurial-scm.org/D8491 diff -r 373dd22ae60e -r e0414fcd35e0 rust/hg-core/src/filepatterns.rs --- a/rust/hg-core/src/filepatterns.rs Wed Apr 15 16:43:05 2020 -0400 +++ b/rust/hg-core/src/filepatterns.rs Wed May 06 11:17:27 2020 +0200 @@ -271,7 +271,7 @@ /// that don't need to be transformed into a regex. pub fn build_single_regex( entry: &IgnorePattern, -) -> Result, PatternError> { +) -> Result>, PatternError> { let IgnorePattern { pattern, syntax, .. } = entry; @@ -288,16 +288,11 @@ if *syntax == PatternSyntax::RootGlob && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) { - // The `regex` crate adds `.*` to the start and end of expressions - // if there are no anchors, so add the start anchor. - let mut escaped = vec![b'^']; - escaped.extend(escape_pattern(&pattern)); - escaped.extend(GLOB_SUFFIX); - Ok(escaped) + Ok(None) } else { let mut entry = entry.clone(); entry.pattern = pattern; - Ok(_build_single_regex(&entry)) + Ok(Some(_build_single_regex(&entry))) } } @@ -628,7 +623,7 @@ Path::new("") )) .unwrap(), - br"(?:.*/)?rust/target(?:/|$)".to_vec(), + Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()), ); } @@ -641,7 +636,7 @@ Path::new("") )) .unwrap(), - br"^\.(?:/|$)".to_vec(), + None, ); assert_eq!( build_single_regex(&IgnorePattern::new( @@ -650,7 +645,7 @@ Path::new("") )) .unwrap(), - br"^whatever(?:/|$)".to_vec(), + None, ); assert_eq!( build_single_regex(&IgnorePattern::new( @@ -659,7 +654,7 @@ Path::new("") )) .unwrap(), - br"^[^/]*\.o(?:/|$)".to_vec(), + Some(br"^[^/]*\.o(?:/|$)".to_vec()), ); } } diff -r 373dd22ae60e -r e0414fcd35e0 rust/hg-core/src/matchers.rs --- a/rust/hg-core/src/matchers.rs Wed Apr 15 16:43:05 2020 -0400 +++ b/rust/hg-core/src/matchers.rs Wed May 06 11:17:27 2020 +0200 @@ -24,6 +24,7 @@ PatternSyntax, }; +use crate::filepatterns::normalize_path_bytes; use std::borrow::ToOwned; use std::collections::HashSet; use std::fmt::{Display, Error, Formatter}; @@ -373,15 +374,32 @@ fn build_regex_match<'a>( ignore_patterns: &'a [&'a IgnorePattern], ) -> PatternResult<(Vec, Box bool + Sync>)> { - let regexps: Result, PatternError> = ignore_patterns - .into_iter() - .map(|k| build_single_regex(*k)) - .collect(); - let regexps = regexps?; + let mut regexps = vec![]; + let mut exact_set = HashSet::new(); + + for pattern in ignore_patterns { + if let Some(re) = build_single_regex(pattern)? { + regexps.push(re); + } else { + let exact = normalize_path_bytes(&pattern.pattern); + exact_set.insert(HgPathBuf::from_bytes(&exact)); + } + } + let full_regex = regexps.join(&b'|'); - let matcher = re_matcher(&full_regex)?; - let func = Box::new(move |filename: &HgPath| matcher(filename)); + // An empty pattern would cause the regex engine to incorrectly match the + // (empty) root directory + let func = if !(regexps.is_empty()) { + let matcher = re_matcher(&full_regex)?; + let func = move |filename: &HgPath| { + exact_set.contains(filename) || matcher(filename) + }; + Box::new(func) as Box bool + Sync> + } else { + let func = move |filename: &HgPath| exact_set.contains(filename); + Box::new(func) as Box bool + Sync> + }; Ok((full_regex, func)) }