dirstate-v2: skip evaluation of hgignore regex on cached directories
authorArseniy Alekseyev <aalekseyev@janestreet.com>
Mon, 10 Oct 2022 14:48:39 +0100
changeset 49520 eb02decdf0ab
parent 49519 943509a58d29
child 49522 f599a946181d
dirstate-v2: skip evaluation of hgignore regex on cached directories By making the computation of [has_ignored_ancestor] lazy we're eliding its computation in the common case when none of its descendants have changed on disk. On a ~400k files repo, with a cached status, we saw a ~64% reduction in CPU time, resulting in a speedup of ~10-15% (on ZFS), and a speedup of ~38% of XFS (XFS has faster stat operations for some reason).
rust/Cargo.lock
rust/hg-core/Cargo.toml
rust/hg-core/src/dirstate_tree/status.rs
--- a/rust/Cargo.lock	Fri Sep 30 09:05:48 2022 -0600
+++ b/rust/Cargo.lock	Mon Oct 10 14:48:39 2022 +0100
@@ -468,6 +468,7 @@
  "log",
  "memmap2",
  "micro-timer",
+ "once_cell",
  "ouroboros",
  "pretty_assertions",
  "rand 0.8.5",
@@ -687,6 +688,12 @@
 ]
 
 [[package]]
+name = "once_cell"
+version = "1.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2f7254b99e31cad77da24b08ebf628882739a608578bb1bcdfc1f9c21260d7c0"
+
+[[package]]
 name = "opaque-debug"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
--- a/rust/hg-core/Cargo.toml	Fri Sep 30 09:05:48 2022 -0600
+++ b/rust/hg-core/Cargo.toml	Mon Oct 10 14:48:39 2022 +0100
@@ -35,6 +35,9 @@
 memmap2 = { version = "0.5.3", features = ["stable_deref_trait"] }
 zstd = "0.5.3"
 format-bytes = "0.3.0"
+# once_cell 1.15 uses edition 2021, while the heptapod CI 
+# uses an old version of Cargo that doesn't support it.
+once_cell = "=1.14.0"
 
 # We don't use the `miniz-oxide` backend to not change rhg benchmarks and until
 # we have a clearer view of which backend is the fastest.
--- a/rust/hg-core/src/dirstate_tree/status.rs	Fri Sep 30 09:05:48 2022 -0600
+++ b/rust/hg-core/src/dirstate_tree/status.rs	Mon Oct 10 14:48:39 2022 +0100
@@ -20,6 +20,7 @@
 use crate::StatusError;
 use crate::StatusOptions;
 use micro_timer::timed;
+use once_cell::sync::OnceCell;
 use rayon::prelude::*;
 use sha1::{Digest, Sha1};
 use std::borrow::Cow;
@@ -126,14 +127,14 @@
     };
     let is_at_repo_root = true;
     let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
-    let has_ignored_ancestor = false;
+    let has_ignored_ancestor = HasIgnoredAncestor::create(None, hg_path);
     let root_cached_mtime = None;
     let root_dir_metadata = None;
     // If the path we have for the repository root is a symlink, do follow it.
     // (As opposed to symlinks within the working directory which are not
     // followed, using `std::fs::symlink_metadata`.)
     common.traverse_fs_directory_and_dirstate(
-        has_ignored_ancestor,
+        &has_ignored_ancestor,
         dmap.root.as_ref(),
         hg_path,
         &root_dir,
@@ -196,6 +197,40 @@
     Unsure,
 }
 
+/// Lazy computation of whether a given path has a hgignored
+/// ancestor.
+struct HasIgnoredAncestor<'a> {
+    /// `path` and `parent` constitute the inputs to the computation,
+    /// `cache` stores the outcome.
+    path: &'a HgPath,
+    parent: Option<&'a HasIgnoredAncestor<'a>>,
+    cache: OnceCell<bool>,
+}
+
+impl<'a> HasIgnoredAncestor<'a> {
+    fn create(
+        parent: Option<&'a HasIgnoredAncestor<'a>>,
+        path: &'a HgPath,
+    ) -> HasIgnoredAncestor<'a> {
+        Self {
+            path,
+            parent,
+            cache: OnceCell::new(),
+        }
+    }
+
+    fn force<'b>(&self, ignore_fn: &IgnoreFnType<'b>) -> bool {
+        match self.parent {
+            None => false,
+            Some(parent) => {
+                *(parent.cache.get_or_init(|| {
+                    parent.force(ignore_fn) || ignore_fn(&self.path)
+                }))
+            }
+        }
+    }
+}
+
 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
     fn push_outcome(
         &self,
@@ -318,9 +353,9 @@
 
     /// Returns whether all child entries of the filesystem directory have a
     /// corresponding dirstate node or are ignored.
-    fn traverse_fs_directory_and_dirstate(
+    fn traverse_fs_directory_and_dirstate<'ancestor>(
         &self,
-        has_ignored_ancestor: bool,
+        has_ignored_ancestor: &'ancestor HasIgnoredAncestor<'ancestor>,
         dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
         directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
         directory_fs_path: &Path,
@@ -418,7 +453,7 @@
                 }
                 Right(fs_entry) => {
                     has_dirstate_node_or_is_ignored = self.traverse_fs_only(
-                        has_ignored_ancestor,
+                        has_ignored_ancestor.force(&self.ignore_fn),
                         directory_hg_path,
                         fs_entry,
                     )
@@ -429,12 +464,12 @@
         .try_reduce(|| true, |a, b| Ok(a && b))
     }
 
-    fn traverse_fs_and_dirstate(
+    fn traverse_fs_and_dirstate<'ancestor>(
         &self,
         fs_path: &Path,
         fs_metadata: &std::fs::Metadata,
         dirstate_node: NodeRef<'tree, 'on_disk>,
-        has_ignored_ancestor: bool,
+        has_ignored_ancestor: &'ancestor HasIgnoredAncestor<'ancestor>,
     ) -> Result<(), DirstateV2ParseError> {
         self.check_for_outdated_directory_cache(&dirstate_node)?;
         let hg_path = &dirstate_node.full_path_borrowed(self.dmap.on_disk)?;
@@ -454,11 +489,14 @@
                     .traversed
                     .push(hg_path.detach_from_tree())
             }
-            let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
+            let is_ignored = HasIgnoredAncestor::create(
+                Some(&has_ignored_ancestor),
+                hg_path,
+            );
             let is_at_repo_root = false;
             let children_all_have_dirstate_node_or_are_ignored = self
                 .traverse_fs_directory_and_dirstate(
-                    is_ignored,
+                    &is_ignored,
                     dirstate_node.children(self.dmap.on_disk)?,
                     hg_path,
                     fs_path,
@@ -472,14 +510,14 @@
                 dirstate_node,
             )?
         } else {
-            if file_or_symlink && self.matcher.matches(hg_path) {
+            if file_or_symlink && self.matcher.matches(&hg_path) {
                 if let Some(entry) = dirstate_node.entry()? {
                     if !entry.any_tracked() {
                         // Forward-compat if we start tracking unknown/ignored
                         // files for caching reasons
                         self.mark_unknown_or_ignored(
-                            has_ignored_ancestor,
-                            hg_path,
+                            has_ignored_ancestor.force(&self.ignore_fn),
+                            &hg_path,
                         );
                     }
                     if entry.added() {
@@ -495,7 +533,7 @@
                     // `node.entry.is_none()` indicates a "directory"
                     // node, but the filesystem has a file
                     self.mark_unknown_or_ignored(
-                        has_ignored_ancestor,
+                        has_ignored_ancestor.force(&self.ignore_fn),
                         hg_path,
                     );
                 }