dirstate-v2: Store a hash of ignore patterns (.hgignore)
authorSimon Sapin <simon.sapin@octobus.net>
Wed, 02 Jun 2021 11:25:18 +0200
changeset 47415 0ef8231e413f
parent 47414 7954ee2d7cf7
child 47416 65333ecc8b34
dirstate-v2: Store a hash of ignore patterns (.hgignore) Later, this will help extend `read_dir` caching to directories that contain ignored files (but no unknown files). Such a cache must be invalidated when ignore patterns change since a formerly-ignored file might become unknown. This helps the default configuration of `hg status` where unknown files must be listed, but ignored files are not. Differential Revision: https://phab.mercurial-scm.org/D10836
rust/hg-core/src/dirstate/status.rs
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
rust/hg-core/src/dirstate_tree/status.rs
rust/hg-core/src/filepatterns.rs
rust/hg-core/src/matchers.rs
tests/test-hgignore.t
--- a/rust/hg-core/src/dirstate/status.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate/status.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -351,7 +351,7 @@
 
         let (ignore_fn, warnings): (IgnoreFnType, _) =
             if options.list_ignored || options.list_unknown {
-                get_ignore_function(ignore_files, &root_dir)?
+                get_ignore_function(ignore_files, &root_dir, &mut |_| {})?
             } else {
                 (Box::new(|&_| true), vec![])
             };
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -37,6 +37,9 @@
     /// Number of nodes anywhere in the tree that have
     /// `.copy_source.is_some()`.
     pub(super) nodes_with_copy_source_count: u32,
+
+    /// See on_disk::Header
+    pub(super) ignore_patterns_hash: on_disk::IgnorePatternsHash,
 }
 
 /// Using a plain `HgPathBuf` of the full path from the repository root as a
@@ -385,6 +388,7 @@
             root: ChildNodes::default(),
             nodes_with_entry_count: 0,
             nodes_with_copy_source_count: 0,
+            ignore_patterns_hash: [0; on_disk::IGNORE_PATTERNS_HASH_LEN],
         }
     }
 
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -28,6 +28,9 @@
 /// `.hg/requires` already governs which format should be used.
 pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
 
+pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
+pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
+
 #[derive(BytesCast)]
 #[repr(C)]
 struct Header {
@@ -40,6 +43,27 @@
     root: ChildNodes,
     nodes_with_entry_count: Size,
     nodes_with_copy_source_count: Size,
+
+    /// If non-zero, a hash of ignore files that were used for some previous
+    /// run of the `status` algorithm.
+    ///
+    /// We define:
+    ///
+    /// * "Root" ignore files are `.hgignore` at the root of the repository if
+    ///   it exists, and files from `ui.ignore.*` config. This set of files is
+    ///   then sorted by the string representation of their path.
+    /// * The "expanded contents" of an ignore file is the byte string made
+    ///   by concatenating its contents with the "expanded contents" of other
+    ///   files included with `include:` or `subinclude:` files, in inclusion
+    ///   order. This definition is recursive, as included files can
+    ///   themselves include more files.
+    ///
+    /// This hash is defined as the SHA-1 of the concatenation (in sorted
+    /// order) of the "expanded contents" of each "root" ignore file.
+    /// (Note that computing this does not require actually concatenating byte
+    /// strings into contiguous memory, instead SHA-1 hashing can be done
+    /// incrementally.)
+    ignore_patterns_hash: IgnorePatternsHash,
 }
 
 #[derive(BytesCast)]
@@ -145,7 +169,7 @@
 
 /// Make sure that size-affecting changes are made knowingly
 fn _static_assert_size_of() {
-    let _ = std::mem::transmute::<Header, [u8; 72]>;
+    let _ = std::mem::transmute::<Header, [u8; 92]>;
     let _ = std::mem::transmute::<Node, [u8; 57]>;
 }
 
@@ -197,6 +221,7 @@
         nodes_with_copy_source_count: header
             .nodes_with_copy_source_count
             .get(),
+        ignore_patterns_hash: header.ignore_patterns_hash,
     };
     let parents = Some(header.parents.clone());
     Ok((dirstate_map, parents))
@@ -473,6 +498,7 @@
         nodes_with_copy_source_count: dirstate_map
             .nodes_with_copy_source_count
             .into(),
+        ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
     out[..header_len].copy_from_slice(header.as_bytes());
     Ok(out)
--- a/rust/hg-core/src/dirstate_tree/status.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/status.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -21,6 +21,7 @@
 use crate::StatusOptions;
 use micro_timer::timed;
 use rayon::prelude::*;
+use sha1::{Digest, Sha1};
 use std::borrow::Cow;
 use std::io;
 use std::path::Path;
@@ -45,11 +46,20 @@
     ignore_files: Vec<PathBuf>,
     options: StatusOptions,
 ) -> Result<(DirstateStatus<'on_disk>, Vec<PatternFileWarning>), StatusError> {
-    let (ignore_fn, warnings): (IgnoreFnType, _) =
+    let (ignore_fn, warnings, patterns_changed): (IgnoreFnType, _, _) =
         if options.list_ignored || options.list_unknown {
-            get_ignore_function(ignore_files, &root_dir)?
+            let mut hasher = Sha1::new();
+            let (ignore_fn, warnings) = get_ignore_function(
+                ignore_files,
+                &root_dir,
+                &mut |pattern_bytes| hasher.update(pattern_bytes),
+            )?;
+            let new_hash = *hasher.finalize().as_ref();
+            let changed = new_hash != dmap.ignore_patterns_hash;
+            dmap.ignore_patterns_hash = new_hash;
+            (ignore_fn, warnings, Some(changed))
         } else {
-            (Box::new(|&_| true), vec![])
+            (Box::new(|&_| true), vec![], None)
         };
 
     let common = StatusCommon {
@@ -58,7 +68,8 @@
         matcher,
         ignore_fn,
         outcome: Default::default(),
-        cached_directory_mtimes_to_add: Default::default(),
+        ignore_patterns_have_changed: patterns_changed,
+        new_cachable_directories: Default::default(),
         filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
     };
     let is_at_repo_root = true;
@@ -79,9 +90,12 @@
         is_at_repo_root,
     )?;
     let mut outcome = common.outcome.into_inner().unwrap();
-    let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
-    outcome.dirty = !to_add.is_empty();
-    for (path, mtime) in &to_add {
+    let new_cachable = common.new_cachable_directories.into_inner().unwrap();
+
+    outcome.dirty = common.ignore_patterns_have_changed == Some(true)
+        || !new_cachable.is_empty();
+
+    for (path, mtime) in &new_cachable {
         let node = DirstateMap::get_or_insert_node(
             dmap.on_disk,
             &mut dmap.root,
@@ -96,6 +110,7 @@
             }
         }
     }
+
     Ok((outcome, warnings))
 }
 
@@ -107,8 +122,13 @@
     matcher: &'a (dyn Matcher + Sync),
     ignore_fn: IgnoreFnType<'a>,
     outcome: Mutex<DirstateStatus<'on_disk>>,
-    cached_directory_mtimes_to_add:
-        Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+    new_cachable_directories: Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+
+    /// Whether ignore files like `.hgignore` have changed since the previous
+    /// time a `status()` call wrote their hash to the dirstate. `None` means
+    /// we don’t know as this run doesn’t list either ignored or unknown files
+    /// and therefore isn’t reading `.hgignore`.
+    ignore_patterns_have_changed: Option<bool>,
 
     /// The current time at the start of the `status()` algorithm, as measured
     /// and possibly truncated by the filesystem.
@@ -422,7 +442,7 @@
                         let hg_path = dirstate_node
                             .full_path_borrowed(self.dmap.on_disk)?
                             .detach_from_tree();
-                        self.cached_directory_mtimes_to_add
+                        self.new_cachable_directories
                             .lock()
                             .unwrap()
                             .push((hg_path, timestamp))
--- a/rust/hg-core/src/filepatterns.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/filepatterns.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -17,8 +17,6 @@
 };
 use lazy_static::lazy_static;
 use regex::bytes::{NoExpand, Regex};
-use std::fs::File;
-use std::io::Read;
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
 use std::vec::Vec;
@@ -410,24 +408,19 @@
 pub fn read_pattern_file(
     file_path: &Path,
     warn: bool,
+    inspect_pattern_bytes: &mut impl FnMut(&[u8]),
 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
-    let mut f = match File::open(file_path) {
-        Ok(f) => Ok(f),
-        Err(e) => match e.kind() {
-            std::io::ErrorKind::NotFound => {
-                return Ok((
-                    vec![],
-                    vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
-                ))
-            }
-            _ => Err(e),
-        },
-    }?;
-    let mut contents = Vec::new();
-
-    f.read_to_end(&mut contents)?;
-
-    Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
+    match std::fs::read(file_path) {
+        Ok(contents) => {
+            inspect_pattern_bytes(&contents);
+            parse_pattern_file_contents(&contents, file_path, warn)
+        }
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok((
+            vec![],
+            vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
+        )),
+        Err(e) => Err(e.into()),
+    }
 }
 
 /// Represents an entry in an "ignore" file.
@@ -458,8 +451,10 @@
 pub fn get_patterns_from_file(
     pattern_file: &Path,
     root_dir: &Path,
+    inspect_pattern_bytes: &mut impl FnMut(&[u8]),
 ) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> {
-    let (patterns, mut warnings) = read_pattern_file(pattern_file, true)?;
+    let (patterns, mut warnings) =
+        read_pattern_file(pattern_file, true, inspect_pattern_bytes)?;
     let patterns = patterns
         .into_iter()
         .flat_map(|entry| -> PatternResult<_> {
@@ -467,8 +462,11 @@
                 PatternSyntax::Include => {
                     let inner_include =
                         root_dir.join(get_path_from_bytes(&entry.pattern));
-                    let (inner_pats, inner_warnings) =
-                        get_patterns_from_file(&inner_include, root_dir)?;
+                    let (inner_pats, inner_warnings) = get_patterns_from_file(
+                        &inner_include,
+                        root_dir,
+                        inspect_pattern_bytes,
+                    )?;
                     warnings.extend(inner_warnings);
                     inner_pats
                 }
@@ -482,6 +480,7 @@
                         get_patterns_from_file(
                             &sub_include.path,
                             &sub_include.root,
+                            inspect_pattern_bytes,
                         )?;
                     sub_include.included_patterns = inner_patterns;
                     warnings.extend(inner_warnings);
--- a/rust/hg-core/src/matchers.rs	Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/matchers.rs	Wed Jun 02 11:25:18 2021 +0200
@@ -564,8 +564,9 @@
 /// function that checks whether a given file (in the general sense) should be
 /// ignored.
 pub fn get_ignore_function<'a>(
-    all_pattern_files: Vec<PathBuf>,
+    mut all_pattern_files: Vec<PathBuf>,
     root_dir: &Path,
+    inspect_pattern_bytes: &mut impl FnMut(&[u8]),
 ) -> PatternResult<(
     Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>,
     Vec<PatternFileWarning>,
@@ -573,9 +574,20 @@
     let mut all_patterns = vec![];
     let mut all_warnings = vec![];
 
+    // Sort to make the ordering of calls to `inspect_pattern_bytes`
+    // deterministic even if the ordering of `all_pattern_files` is not (such
+    // as when Python dict or Rust HashMap iteration order is involved).
+    // Sort by "string" representation instead of the default by component
+    // (with a Rust-specific definition of a component)
+    all_pattern_files
+        .sort_unstable_by(|a, b| a.as_os_str().cmp(b.as_os_str()));
+
     for pattern_file in &all_pattern_files {
-        let (patterns, warnings) =
-            get_patterns_from_file(pattern_file, root_dir)?;
+        let (patterns, warnings) = get_patterns_from_file(
+            pattern_file,
+            root_dir,
+            inspect_pattern_bytes,
+        )?;
 
         all_patterns.extend(patterns.to_owned());
         all_warnings.extend(warnings);
--- a/tests/test-hgignore.t	Mon Jun 07 17:29:32 2021 +0530
+++ b/tests/test-hgignore.t	Wed Jun 02 11:25:18 2021 +0200
@@ -402,3 +402,23 @@
   $ hg up -qC .
 
 #endif
+
+#if dirstate-v2
+
+Check the hash of ignore patterns written in the dirstate at offset
+12 + 20 + 20 + 8 + 4 + 4 + 4 = 72
+
+  $ hg status > /dev/null
+  $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1
+  sha1=6e315b60f15fb5dfa02be00f3e2c8f923051f5ff
+  >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode())
+  6e315b60f15fb5dfa02be00f3e2c8f923051f5ff
+
+  $ echo rel > .hg/testhgignorerel
+  $ hg status > /dev/null
+  $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1
+  sha1=dea19cc7119213f24b6b582a4bae7b0cb063e34e
+  >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode())
+  dea19cc7119213f24b6b582a4bae7b0cb063e34e
+
+#endif