rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex
authorRaphaël Gomès <rgomes@octobus.net>
Wed, 06 May 2020 11:17:27 +0200
changeset 44879 e0414fcd35e0
parent 44878 373dd22ae60e
child 44880 de0fb4463a3d
rust-filepatterns: match exact `rootglob`s with a `HashSet`, not in the regex This optimization yields some very interesting results in `rootglob`-heavy repositories. I build a test repository of the following structure: ``` root /<uuid>/build/empty_file ... repeat for 4000 entries ``` and a `.hgignore` containing the corresponding 4000 `rootglob` entries pointing to all `build/` folders. Rust+c `hg status` goes from 350ms down to 110ms. Differential Revision: https://phab.mercurial-scm.org/D8491
rust/hg-core/src/filepatterns.rs
rust/hg-core/src/matchers.rs
--- a/rust/hg-core/src/filepatterns.rs	Wed Apr 15 16:43:05 2020 -0400
+++ b/rust/hg-core/src/filepatterns.rs	Wed May 06 11:17:27 2020 +0200
@@ -271,7 +271,7 @@
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
     entry: &IgnorePattern,
-) -> Result<Vec<u8>, PatternError> {
+) -> Result<Option<Vec<u8>>, PatternError> {
     let IgnorePattern {
         pattern, syntax, ..
     } = entry;
@@ -288,16 +288,11 @@
     if *syntax == PatternSyntax::RootGlob
         && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        // The `regex` crate adds `.*` to the start and end of expressions
-        // if there are no anchors, so add the start anchor.
-        let mut escaped = vec![b'^'];
-        escaped.extend(escape_pattern(&pattern));
-        escaped.extend(GLOB_SUFFIX);
-        Ok(escaped)
+        Ok(None)
     } else {
         let mut entry = entry.clone();
         entry.pattern = pattern;
-        Ok(_build_single_regex(&entry))
+        Ok(Some(_build_single_regex(&entry)))
     }
 }
 
@@ -628,7 +623,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"(?:.*/)?rust/target(?:/|$)".to_vec(),
+            Some(br"(?:.*/)?rust/target(?:/|$)".to_vec()),
         );
     }
 
@@ -641,7 +636,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"^\.(?:/|$)".to_vec(),
+            None,
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -650,7 +645,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"^whatever(?:/|$)".to_vec(),
+            None,
         );
         assert_eq!(
             build_single_regex(&IgnorePattern::new(
@@ -659,7 +654,7 @@
                 Path::new("")
             ))
             .unwrap(),
-            br"^[^/]*\.o(?:/|$)".to_vec(),
+            Some(br"^[^/]*\.o(?:/|$)".to_vec()),
         );
     }
 }
--- a/rust/hg-core/src/matchers.rs	Wed Apr 15 16:43:05 2020 -0400
+++ b/rust/hg-core/src/matchers.rs	Wed May 06 11:17:27 2020 +0200
@@ -24,6 +24,7 @@
     PatternSyntax,
 };
 
+use crate::filepatterns::normalize_path_bytes;
 use std::borrow::ToOwned;
 use std::collections::HashSet;
 use std::fmt::{Display, Error, Formatter};
@@ -373,15 +374,32 @@
 fn build_regex_match<'a>(
     ignore_patterns: &'a [&'a IgnorePattern],
 ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> {
-    let regexps: Result<Vec<_>, PatternError> = ignore_patterns
-        .into_iter()
-        .map(|k| build_single_regex(*k))
-        .collect();
-    let regexps = regexps?;
+    let mut regexps = vec![];
+    let mut exact_set = HashSet::new();
+
+    for pattern in ignore_patterns {
+        if let Some(re) = build_single_regex(pattern)? {
+            regexps.push(re);
+        } else {
+            let exact = normalize_path_bytes(&pattern.pattern);
+            exact_set.insert(HgPathBuf::from_bytes(&exact));
+        }
+    }
+
     let full_regex = regexps.join(&b'|');
 
-    let matcher = re_matcher(&full_regex)?;
-    let func = Box::new(move |filename: &HgPath| matcher(filename));
+    // An empty pattern would cause the regex engine to incorrectly match the
+    // (empty) root directory
+    let func = if !(regexps.is_empty()) {
+        let matcher = re_matcher(&full_regex)?;
+        let func = move |filename: &HgPath| {
+            exact_set.contains(filename) || matcher(filename)
+        };
+        Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+    } else {
+        let func = move |filename: &HgPath| exact_set.contains(filename);
+        Box::new(func) as Box<dyn Fn(&HgPath) -> bool + Sync>
+    };
 
     Ok((full_regex, func))
 }