rust: Parse "subinclude"d files along the way, not later
authorSimon Sapin <simon.sapin@octobus.net>
Wed, 02 Jun 2021 18:03:43 +0200
changeset 47379 f6bb181c75f8
parent 47378 777c3d231913
child 47380 fad504cfc94b
rust: Parse "subinclude"d files along the way, not later When parsing a `.hgignore` file and encountering an `include:` line, the included file is parsed recursively right then in a depth-first fashion. With `subinclude:` however included files were parsed (recursively) much later. This changes it to be expanded during parsing, like `.hgignore`. The motivation for this is an upcoming changeset that needs to detect changes in which files are ignored or not. The plan is to hash all ignore files while they are being read, and store that hash in the dirstate (in v2 format). In order to allow a potential alternative implementations to read that format, the algorithm to compute that hash must be documented. Having a well-defined depth-first ordering for the tree of (sub-)included files makes that easier. Differential Revision: https://phab.mercurial-scm.org/D10834
rust/hg-core/src/filepatterns.rs
rust/hg-core/src/matchers.rs
rust/hg-cpython/src/dirstate/status.rs
--- a/rust/hg-core/src/filepatterns.rs	Wed Jun 02 18:14:44 2021 +0200
+++ b/rust/hg-core/src/filepatterns.rs	Wed Jun 02 18:03:43 2021 +0200
@@ -41,7 +41,7 @@
 /// Appended to the regexp of globs
 const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum PatternSyntax {
     /// A regular expression
     Regexp,
@@ -65,6 +65,14 @@
     Include,
     /// A file of patterns to match against files under the same directory
     SubInclude,
+    /// SubInclude with the result of parsing the included file
+    ///
+    /// Note: there is no ExpandedInclude because that expansion can be done
+    /// in place by replacing the Include pattern by the included patterns.
+    /// SubInclude requires more handling.
+    ///
+    /// Note: `Box` is used to minimize size impact on other enum variants
+    ExpandedSubInclude(Box<SubInclude>),
 }
 
 /// Transforms a glob pattern into a regex
@@ -218,7 +226,9 @@
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
             [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
-        PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
+        PatternSyntax::Include
+        | PatternSyntax::SubInclude
+        | PatternSyntax::ExpandedSubInclude(_) => unreachable!(),
     }
 }
 
@@ -441,10 +451,10 @@
 pub type PatternResult<T> = Result<T, PatternError>;
 
 /// Wrapper for `read_pattern_file` that also recursively expands `include:`
-/// patterns.
+/// and `subinclude:` patterns.
 ///
-/// `subinclude:` is not treated as a special pattern here: unraveling them
-/// needs to occur in the "ignore" phase.
+/// The former are expanded in place, while `PatternSyntax::ExpandedSubInclude`
+/// is used for the latter to form a tree of patterns.
 pub fn get_patterns_from_file(
     pattern_file: &Path,
     root_dir: &Path,
@@ -453,18 +463,35 @@
     let patterns = patterns
         .into_iter()
         .flat_map(|entry| -> PatternResult<_> {
-            let IgnorePattern {
-                syntax, pattern, ..
-            } = &entry;
-            Ok(match syntax {
+            Ok(match &entry.syntax {
                 PatternSyntax::Include => {
                     let inner_include =
-                        root_dir.join(get_path_from_bytes(&pattern));
+                        root_dir.join(get_path_from_bytes(&entry.pattern));
                     let (inner_pats, inner_warnings) =
                         get_patterns_from_file(&inner_include, root_dir)?;
                     warnings.extend(inner_warnings);
                     inner_pats
                 }
+                PatternSyntax::SubInclude => {
+                    let mut sub_include = SubInclude::new(
+                        &root_dir,
+                        &entry.pattern,
+                        &entry.source,
+                    )?;
+                    let (inner_patterns, inner_warnings) =
+                        get_patterns_from_file(
+                            &sub_include.path,
+                            &sub_include.root,
+                        )?;
+                    sub_include.included_patterns = inner_patterns;
+                    warnings.extend(inner_warnings);
+                    vec![IgnorePattern {
+                        syntax: PatternSyntax::ExpandedSubInclude(Box::new(
+                            sub_include,
+                        )),
+                        ..entry
+                    }]
+                }
                 _ => vec![entry],
             })
         })
@@ -475,6 +502,7 @@
 }
 
 /// Holds all the information needed to handle a `subinclude:` pattern.
+#[derive(Debug, PartialEq, Eq, Clone)]
 pub struct SubInclude {
     /// Will be used for repository (hg) paths that start with this prefix.
     /// It is relative to the current working directory, so comparing against
@@ -484,6 +512,8 @@
     pub path: PathBuf,
     /// Folder in the filesystem where this it applies
     pub root: PathBuf,
+
+    pub included_patterns: Vec<IgnorePattern>,
 }
 
 impl SubInclude {
@@ -513,29 +543,25 @@
             })?,
             path: path.to_owned(),
             root: new_root.to_owned(),
+            included_patterns: Vec::new(),
         })
     }
 }
 
 /// Separate and pre-process subincludes from other patterns for the "ignore"
 /// phase.
-pub fn filter_subincludes<'a>(
-    ignore_patterns: &'a [IgnorePattern],
-    root_dir: &Path,
-) -> Result<(Vec<SubInclude>, Vec<&'a IgnorePattern>), HgPathError> {
+pub fn filter_subincludes(
+    ignore_patterns: Vec<IgnorePattern>,
+) -> Result<(Vec<Box<SubInclude>>, Vec<IgnorePattern>), HgPathError> {
     let mut subincludes = vec![];
     let mut others = vec![];
 
-    for ignore_pattern in ignore_patterns.iter() {
-        let IgnorePattern {
-            syntax,
-            pattern,
-            source,
-        } = ignore_pattern;
-        if *syntax == PatternSyntax::SubInclude {
-            subincludes.push(SubInclude::new(root_dir, pattern, &source)?);
+    for pattern in ignore_patterns {
+        if let PatternSyntax::ExpandedSubInclude(sub_include) = pattern.syntax
+        {
+            subincludes.push(sub_include);
         } else {
-            others.push(ignore_pattern)
+            others.push(pattern)
         }
     }
     Ok((subincludes, others))
--- a/rust/hg-core/src/matchers.rs	Wed Jun 02 18:14:44 2021 +0200
+++ b/rust/hg-core/src/matchers.rs	Wed Jun 02 18:03:43 2021 +0200
@@ -11,7 +11,7 @@
     dirstate::dirs_multiset::DirsChildrenMultiset,
     filepatterns::{
         build_single_regex, filter_subincludes, get_patterns_from_file,
-        PatternFileWarning, PatternResult, SubInclude,
+        PatternFileWarning, PatternResult,
     },
     utils::{
         files::find_dirs,
@@ -237,7 +237,7 @@
 /// ///
 /// let ignore_patterns =
 /// vec![IgnorePattern::new(PatternSyntax::RootGlob, b"this*", Path::new(""))];
-/// let (matcher, _) = IncludeMatcher::new(ignore_patterns, "".as_ref()).unwrap();
+/// let matcher = IncludeMatcher::new(ignore_patterns).unwrap();
 /// ///
 /// assert_eq!(matcher.matches(HgPath::new(b"testing")), false);
 /// assert_eq!(matcher.matches(HgPath::new(b"this should work")), true);
@@ -341,8 +341,8 @@
 
 /// Returns the regex pattern and a function that matches an `HgPath` against
 /// said regex formed by the given ignore patterns.
-fn build_regex_match<'a>(
-    ignore_patterns: &'a [&'a IgnorePattern],
+fn build_regex_match(
+    ignore_patterns: &[IgnorePattern],
 ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> {
     let mut regexps = vec![];
     let mut exact_set = HashSet::new();
@@ -478,32 +478,25 @@
 /// Returns a function that checks whether a given file (in the general sense)
 /// should be matched.
 fn build_match<'a, 'b>(
-    ignore_patterns: &'a [IgnorePattern],
-    root_dir: &Path,
-) -> PatternResult<(
-    Vec<u8>,
-    Box<dyn Fn(&HgPath) -> bool + 'b + Sync>,
-    Vec<PatternFileWarning>,
-)> {
+    ignore_patterns: Vec<IgnorePattern>,
+) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + 'b + Sync>)> {
     let mut match_funcs: Vec<Box<dyn Fn(&HgPath) -> bool + Sync>> = vec![];
     // For debugging and printing
     let mut patterns = vec![];
-    let mut all_warnings = vec![];
 
-    let (subincludes, ignore_patterns) =
-        filter_subincludes(ignore_patterns, root_dir)?;
+    let (subincludes, ignore_patterns) = filter_subincludes(ignore_patterns)?;
 
     if !subincludes.is_empty() {
         // Build prefix-based matcher functions for subincludes
         let mut submatchers = FastHashMap::default();
         let mut prefixes = vec![];
 
-        for SubInclude { prefix, root, path } in subincludes.into_iter() {
-            let (match_fn, warnings) =
-                get_ignore_function(vec![path.to_path_buf()], &root)?;
-            all_warnings.extend(warnings);
-            prefixes.push(prefix.to_owned());
-            submatchers.insert(prefix.to_owned(), match_fn);
+        for sub_include in subincludes {
+            let matcher = IncludeMatcher::new(sub_include.included_patterns)?;
+            let match_fn =
+                Box::new(move |path: &HgPath| matcher.matches(path));
+            prefixes.push(sub_include.prefix.clone());
+            submatchers.insert(sub_include.prefix.clone(), match_fn);
         }
 
         let match_subinclude = move |filename: &HgPath| {
@@ -556,14 +549,13 @@
     }
 
     Ok(if match_funcs.len() == 1 {
-        (patterns, match_funcs.remove(0), all_warnings)
+        (patterns, match_funcs.remove(0))
     } else {
         (
             patterns,
             Box::new(move |f: &HgPath| -> bool {
                 match_funcs.iter().any(|match_func| match_func(f))
             }),
-            all_warnings,
         )
     })
 }
@@ -588,8 +580,7 @@
         all_patterns.extend(patterns.to_owned());
         all_warnings.extend(warnings);
     }
-    let (matcher, warnings) = IncludeMatcher::new(all_patterns, root_dir)?;
-    all_warnings.extend(warnings);
+    let matcher = IncludeMatcher::new(all_patterns)?;
     Ok((
         Box::new(move |path: &HgPath| matcher.matches(path)),
         all_warnings,
@@ -597,34 +588,26 @@
 }
 
 impl<'a> IncludeMatcher<'a> {
-    pub fn new(
-        ignore_patterns: Vec<IgnorePattern>,
-        root_dir: &Path,
-    ) -> PatternResult<(Self, Vec<PatternFileWarning>)> {
-        let (patterns, match_fn, warnings) =
-            build_match(&ignore_patterns, root_dir)?;
+    pub fn new(ignore_patterns: Vec<IgnorePattern>) -> PatternResult<Self> {
         let RootsDirsAndParents {
             roots,
             dirs,
             parents,
         } = roots_dirs_and_parents(&ignore_patterns)?;
-
         let prefix = ignore_patterns.iter().any(|k| match k.syntax {
             PatternSyntax::Path | PatternSyntax::RelPath => true,
             _ => false,
         });
+        let (patterns, match_fn) = build_match(ignore_patterns)?;
 
-        Ok((
-            Self {
-                patterns,
-                match_fn,
-                prefix,
-                roots,
-                dirs,
-                parents,
-            },
-            warnings,
-        ))
+        Ok(Self {
+            patterns,
+            match_fn,
+            prefix,
+            roots,
+            dirs,
+            parents,
+        })
     }
 
     fn get_all_parents_children(&self) -> DirsChildrenMultiset {
@@ -810,14 +793,11 @@
     #[test]
     fn test_includematcher() {
         // VisitchildrensetPrefix
-        let (matcher, _) = IncludeMatcher::new(
-            vec![IgnorePattern::new(
-                PatternSyntax::RelPath,
-                b"dir/subdir",
-                Path::new(""),
-            )],
-            "".as_ref(),
-        )
+        let matcher = IncludeMatcher::new(vec![IgnorePattern::new(
+            PatternSyntax::RelPath,
+            b"dir/subdir",
+            Path::new(""),
+        )])
         .unwrap();
 
         let mut set = HashSet::new();
@@ -848,14 +828,11 @@
         );
 
         // VisitchildrensetRootfilesin
-        let (matcher, _) = IncludeMatcher::new(
-            vec![IgnorePattern::new(
-                PatternSyntax::RootFiles,
-                b"dir/subdir",
-                Path::new(""),
-            )],
-            "".as_ref(),
-        )
+        let matcher = IncludeMatcher::new(vec![IgnorePattern::new(
+            PatternSyntax::RootFiles,
+            b"dir/subdir",
+            Path::new(""),
+        )])
         .unwrap();
 
         let mut set = HashSet::new();
@@ -886,14 +863,11 @@
         );
 
         // VisitchildrensetGlob
-        let (matcher, _) = IncludeMatcher::new(
-            vec![IgnorePattern::new(
-                PatternSyntax::Glob,
-                b"dir/z*",
-                Path::new(""),
-            )],
-            "".as_ref(),
-        )
+        let matcher = IncludeMatcher::new(vec![IgnorePattern::new(
+            PatternSyntax::Glob,
+            b"dir/z*",
+            Path::new(""),
+        )])
         .unwrap();
 
         let mut set = HashSet::new();
--- a/rust/hg-cpython/src/dirstate/status.rs	Wed Jun 02 18:14:44 2021 +0200
+++ b/rust/hg-cpython/src/dirstate/status.rs	Wed Jun 02 18:03:43 2021 +0200
@@ -211,12 +211,9 @@
                 .collect();
 
             let ignore_patterns = ignore_patterns?;
-            let mut all_warnings = vec![];
 
-            let (matcher, warnings) =
-                IncludeMatcher::new(ignore_patterns, &root_dir)
-                    .map_err(|e| handle_fallback(py, e.into()))?;
-            all_warnings.extend(warnings);
+            let matcher = IncludeMatcher::new(ignore_patterns)
+                .map_err(|e| handle_fallback(py, e.into()))?;
 
             let (status_res, warnings) = dmap
                 .status(
@@ -234,9 +231,7 @@
                 )
                 .map_err(|e| handle_fallback(py, e))?;
 
-            all_warnings.extend(warnings);
-
-            build_response(py, status_res, all_warnings)
+            build_response(py, status_res, warnings)
         }
         e => Err(PyErr::new::<ValueError, _>(
             py,