changeset 47355:7138c863d0a1

dirstate-v2: Skip readdir in status based on directory mtime When calling `read_dir` during `status` and the directory is found to be eligible for caching (see code comments), write the directory’s mtime to the dirstate. The presence of a directory mtime in the dirstate is meaningful and indicates eligibility. When an eligible directory mtime is found in the dirstate and `stat()` shows that the mtime has not changed, `status` can skip calling `read_dir` again and instead rely on the names of child nodes in the dirstate tree. The `tempfile` crate is used to create a temporary file in order to use its modification time as "current time" with the same truncation as other files and directories would have in their own modification time. Differential Revision: https://phab.mercurial-scm.org/D10826
author Simon Sapin <simon.sapin@octobus.net>
date Fri, 28 May 2021 11:48:59 +0200
parents a4de570e61fa
children 04d1f17f49e7
files rust/hg-core/Cargo.toml rust/hg-core/src/dirstate_tree/dirstate_map.rs rust/hg-core/src/dirstate_tree/on_disk.rs rust/hg-core/src/dirstate_tree/status.rs
diffstat 4 files changed, 242 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/Cargo.toml	Thu May 27 18:40:54 2021 +0200
+++ b/rust/hg-core/Cargo.toml	Fri May 28 11:48:59 2021 +0200
@@ -23,6 +23,7 @@
 regex = "1.3.9"
 twox-hash = "1.5.0"
 same-file = "1.0.6"
+tempfile = "3.1.0"
 crossbeam-channel = "0.4"
 micro-timer = "0.3.0"
 log = "0.4.8"
@@ -41,4 +42,3 @@
 [dev-dependencies]
 clap = "*"
 pretty_assertions = "0.6.1"
-tempfile = "3.1.0"
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Thu May 27 18:40:54 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Fri May 28 11:48:59 2021 +0200
@@ -317,6 +317,18 @@
         }
     }
 
+    pub(super) fn cached_directory_mtime(
+        &self,
+    ) -> Option<&on_disk::Timestamp> {
+        match self {
+            NodeRef::InMemory(_path, node) => match &node.data {
+                NodeData::CachedDirectory { mtime } => Some(mtime),
+                _ => None,
+            },
+            NodeRef::OnDisk(node) => node.cached_directory_mtime(),
+        }
+    }
+
     pub(super) fn tracked_descendants_count(&self) -> u32 {
         match self {
             NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
@@ -479,7 +491,7 @@
         }
     }
 
-    fn get_or_insert_node<'tree, 'path>(
+    pub(super) fn get_or_insert_node<'tree, 'path>(
         on_disk: &'on_disk [u8],
         root: &'tree mut ChildNodes<'on_disk>,
         path: &'path HgPath,
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Thu May 27 18:40:54 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Fri May 28 11:48:59 2021 +0200
@@ -56,13 +56,31 @@
 
     /// Dependending on the value of `state`:
     ///
-    /// * A null byte: `data` represents nothing
+    /// * A null byte: `data` is not used.
+    ///
     /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together
-    ///   represents a dirstate entry like in the v1 format.
+    ///   represent a dirstate entry like in the v1 format.
+    ///
     /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted
     ///   as the `Timestamp` for the mtime of a cached directory.
     ///
-    /// TODO: document directory caching
+    ///   The presence of this state means that at some point, this path in
+    ///   the working directory was observed:
+    ///
+    ///   - To be a directory
+    ///   - With the modification time as given by `Timestamp`
+    ///   - That timestamp was already strictly in the past when observed,
+    ///     meaning that later changes cannot happen in the same clock tick
+    ///     and must cause a different modification time (unless the system
+    ///     clock jumps back and we get unlucky, which is not impossible but
+    ///     but deemed unlikely enough).
+    ///   - The directory did not contain any child entry that did not have a
+    ///     corresponding dirstate node.
+    ///
+    ///   This means that if `std::fs::symlink_metadata` later reports the
+    ///   same modification time, we don’t need to call `std::fs::read_dir`
+    ///   again for this directory and can iterate child dirstate nodes
+    ///   instead.
     state: u8,
     data: Entry,
 }
@@ -76,7 +94,7 @@
 }
 
 /// Duration since the Unix epoch
-#[derive(BytesCast, Copy, Clone)]
+#[derive(BytesCast, Copy, Clone, PartialEq)]
 #[repr(C)]
 pub(super) struct Timestamp {
     seconds: I64Be,
@@ -258,6 +276,14 @@
         }
     }
 
+    pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> {
+        if self.state == b'd' {
+            Some(self.data.as_timestamp())
+        } else {
+            None
+        }
+    }
+
     pub(super) fn state(
         &self,
     ) -> Result<Option<EntryState>, DirstateV2ParseError> {
@@ -326,8 +352,8 @@
     }
 }
 
-impl From<&'_ SystemTime> for Timestamp {
-    fn from(system_time: &'_ SystemTime) -> Self {
+impl From<SystemTime> for Timestamp {
+    fn from(system_time: SystemTime) -> Self {
         let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) {
             Ok(duration) => {
                 (duration.as_secs() as i64, duration.subsec_nanos())
--- a/rust/hg-core/src/dirstate_tree/status.rs	Thu May 27 18:40:54 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/status.rs	Fri May 28 11:48:59 2021 +0200
@@ -2,8 +2,11 @@
 use crate::dirstate_tree::dirstate_map::BorrowedPath;
 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
 use crate::dirstate_tree::dirstate_map::DirstateMap;
+use crate::dirstate_tree::dirstate_map::NodeData;
 use crate::dirstate_tree::dirstate_map::NodeRef;
 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
+use crate::dirstate_tree::on_disk::Timestamp;
+use crate::dirstate_tree::path_with_basename::WithBasename;
 use crate::matchers::get_ignore_function;
 use crate::matchers::Matcher;
 use crate::utils::files::get_bytes_from_os_string;
@@ -18,10 +21,12 @@
 use crate::StatusOptions;
 use micro_timer::timed;
 use rayon::prelude::*;
+use std::borrow::Cow;
 use std::io;
 use std::path::Path;
 use std::path::PathBuf;
 use std::sync::Mutex;
+use std::time::SystemTime;
 
 /// Returns the status of the working directory compared to its parent
 /// changeset.
@@ -52,19 +57,45 @@
         options,
         matcher,
         ignore_fn,
-        outcome: Mutex::new(DirstateStatus::default()),
+        outcome: Default::default(),
+        cached_directory_mtimes_to_add: Default::default(),
+        filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
     };
     let is_at_repo_root = true;
     let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
     let has_ignored_ancestor = false;
+    let root_cached_mtime = None;
+    let root_dir_metadata = None;
+    // If the path we have for the repository root is a symlink, do follow it.
+    // (As opposed to symlinks within the working directory which are not
+    // followed, using `std::fs::symlink_metadata`.)
     common.traverse_fs_directory_and_dirstate(
         has_ignored_ancestor,
         dmap.root.as_ref(),
         hg_path,
         &root_dir,
+        root_dir_metadata,
+        root_cached_mtime,
         is_at_repo_root,
     )?;
-    Ok((common.outcome.into_inner().unwrap(), warnings))
+    let outcome = common.outcome.into_inner().unwrap();
+    let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
+    for (path, mtime) in &to_add {
+        let node = DirstateMap::get_or_insert_node(
+            dmap.on_disk,
+            &mut dmap.root,
+            path,
+            WithBasename::to_cow_owned,
+            |_| {},
+        )?;
+        match &node.data {
+            NodeData::Entry(_) => {} // Don’t overwrite an entry
+            NodeData::CachedDirectory { .. } | NodeData::None => {
+                node.data = NodeData::CachedDirectory { mtime: *mtime }
+            }
+        }
+    }
+    Ok((outcome, warnings))
 }
 
 /// Bag of random things needed by various parts of the algorithm. Reduces the
@@ -75,6 +106,12 @@
     matcher: &'a (dyn Matcher + Sync),
     ignore_fn: IgnoreFnType<'a>,
     outcome: Mutex<DirstateStatus<'on_disk>>,
+    cached_directory_mtimes_to_add:
+        Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+
+    /// The current time at the start of the `status()` algorithm, as measured
+    /// and possibly truncated by the filesystem.
+    filesystem_time_at_status_start: Option<SystemTime>,
 }
 
 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
@@ -97,18 +134,54 @@
             .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
     }
 
+    /// If this returns true, we can get accurate results by only using
+    /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
+    /// need to call `read_dir`.
+    fn can_skip_fs_readdir(
+        &self,
+        directory_metadata: Option<&std::fs::Metadata>,
+        cached_directory_mtime: Option<&Timestamp>,
+    ) -> bool {
+        if !self.options.list_unknown && !self.options.list_ignored {
+            // All states that we care about listing have corresponding
+            // dirstate entries.
+            // This happens for example with `hg status -mard`.
+            return true;
+        }
+        if let Some(cached_mtime) = cached_directory_mtime {
+            // The dirstate contains a cached mtime for this directory, set by
+            // a previous run of the `status` algorithm which found this
+            // directory eligible for `read_dir` caching.
+            if let Some(meta) = directory_metadata {
+                if let Ok(current_mtime) = meta.modified() {
+                    if current_mtime == cached_mtime.into() {
+                        // The mtime of that directory has not changed since
+                        // then, which means that the
+                        // results of `read_dir` should also
+                        // be unchanged.
+                        return true;
+                    }
+                }
+            }
+        }
+        false
+    }
+
+    /// Returns whether the filesystem directory was found to have any entry
+    /// that does not have a corresponding dirstate tree node.
     fn traverse_fs_directory_and_dirstate(
         &self,
         has_ignored_ancestor: bool,
         dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
         directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
         directory_fs_path: &Path,
+        directory_metadata: Option<&std::fs::Metadata>,
+        cached_directory_mtime: Option<&Timestamp>,
         is_at_repo_root: bool,
-    ) -> Result<(), DirstateV2ParseError> {
-        if !self.options.list_unknown && !self.options.list_ignored {
-            // We only care about files in the dirstate, so we can skip listing
-            // filesystem directories entirely.
-            return dirstate_nodes
+    ) -> Result<bool, DirstateV2ParseError> {
+        if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
+        {
+            dirstate_nodes
                 .par_iter()
                 .map(|dirstate_node| {
                     let fs_path = directory_fs_path.join(get_path_from_bytes(
@@ -131,7 +204,13 @@
                         }
                     }
                 })
-                .collect();
+                .collect::<Result<_, _>>()?;
+
+            // Conservatively don’t let the caller assume that there aren’t
+            // any, since we don’t know.
+            let directory_has_any_fs_only_entry = true;
+
+            return Ok(directory_has_any_fs_only_entry);
         }
 
         let mut fs_entries = if let Ok(entries) = self.read_dir(
@@ -174,6 +253,7 @@
         .par_bridge()
         .map(|pair| {
             use itertools::EitherOrBoth::*;
+            let is_fs_only = pair.is_right();
             match pair {
                 Both(dirstate_node, fs_entry) => self
                     .traverse_fs_and_dirstate(
@@ -181,18 +261,19 @@
                         &fs_entry.metadata,
                         dirstate_node,
                         has_ignored_ancestor,
-                    ),
+                    )?,
                 Left(dirstate_node) => {
-                    self.traverse_dirstate_only(dirstate_node)
+                    self.traverse_dirstate_only(dirstate_node)?
                 }
-                Right(fs_entry) => Ok(self.traverse_fs_only(
+                Right(fs_entry) => self.traverse_fs_only(
                     has_ignored_ancestor,
                     directory_hg_path,
                     fs_entry,
-                )),
+                ),
             }
+            Ok(is_fs_only)
         })
-        .collect()
+        .try_reduce(|| false, |a, b| Ok(a || b))
     }
 
     fn traverse_fs_and_dirstate(
@@ -224,12 +305,20 @@
             }
             let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
             let is_at_repo_root = false;
-            self.traverse_fs_directory_and_dirstate(
-                is_ignored,
-                dirstate_node.children(self.dmap.on_disk)?,
-                hg_path,
-                fs_path,
-                is_at_repo_root,
+            let directory_has_any_fs_only_entry = self
+                .traverse_fs_directory_and_dirstate(
+                    is_ignored,
+                    dirstate_node.children(self.dmap.on_disk)?,
+                    hg_path,
+                    fs_path,
+                    Some(fs_metadata),
+                    dirstate_node.cached_directory_mtime(),
+                    is_at_repo_root,
+                )?;
+            self.maybe_save_directory_mtime(
+                directory_has_any_fs_only_entry,
+                fs_metadata,
+                dirstate_node,
             )?
         } else {
             if file_or_symlink && self.matcher.matches(hg_path) {
@@ -274,6 +363,75 @@
         Ok(())
     }
 
+    fn maybe_save_directory_mtime(
+        &self,
+        directory_has_any_fs_only_entry: bool,
+        directory_metadata: &std::fs::Metadata,
+        dirstate_node: NodeRef<'tree, 'on_disk>,
+    ) -> Result<(), DirstateV2ParseError> {
+        if !directory_has_any_fs_only_entry {
+            // All filesystem directory entries from `read_dir` have a
+            // corresponding node in the dirstate, so we can reconstitute the
+            // names of those entries without calling `read_dir` again.
+            if let (Some(status_start), Ok(directory_mtime)) = (
+                &self.filesystem_time_at_status_start,
+                directory_metadata.modified(),
+            ) {
+                // Although the Rust standard library’s `SystemTime` type
+                // has nanosecond precision, the times reported for a
+                // directory’s (or file’s) modified time may have lower
+                // resolution based on the filesystem (for example ext3
+                // only stores integer seconds), kernel (see
+                // https://stackoverflow.com/a/14393315/1162888), etc.
+                if &directory_mtime >= status_start {
+                    // The directory was modified too recently, don’t cache its
+                    // `read_dir` results.
+                    //
+                    // A timeline like this is possible:
+                    //
+                    // 1. A change to this directory (direct child was
+                    //    added or removed) cause its mtime to be set
+                    //    (possibly truncated) to `directory_mtime`
+                    // 2. This `status` algorithm calls `read_dir`
+                    // 3. An other change is made to the same directory is
+                    //    made so that calling `read_dir` agin would give
+                    //    different results, but soon enough after 1. that
+                    //    the mtime stays the same
+                    //
+                    // On a system where the time resolution poor, this
+                    // scenario is not unlikely if all three steps are caused
+                    // by the same script.
+                } else {
+                    // We’ve observed (through `status_start`) that time has
+                    // “progressed” since `directory_mtime`, so any further
+                    // change to this directory is extremely likely to cause a
+                    // different mtime.
+                    //
+                    // Having the same mtime again is not entirely impossible
+                    // since the system clock is not monotonous. It could jump
+                    // backward to some point before `directory_mtime`, then a
+                    // directory change could potentially happen during exactly
+                    // the wrong tick.
+                    //
+                    // We deem this scenario (unlike the previous one) to be
+                    // unlikely enough in practice.
+                    let timestamp = directory_mtime.into();
+                    let cached = dirstate_node.cached_directory_mtime();
+                    if cached != Some(&timestamp) {
+                        let hg_path = dirstate_node
+                            .full_path_borrowed(self.dmap.on_disk)?
+                            .detach_from_tree();
+                        self.cached_directory_mtimes_to_add
+                            .lock()
+                            .unwrap()
+                            .push((hg_path, timestamp))
+                    }
+                }
+            }
+        }
+        Ok(())
+    }
+
     /// A file with `EntryState::Normal` in the dirstate was found in the
     /// filesystem
     fn handle_normal_file(
@@ -505,3 +663,22 @@
         Ok(results)
     }
 }
+
+/// Return the `mtime` of a temporary file newly-created in the `.hg` directory
+/// of the give repository.
+///
+/// This is similar to `SystemTime::now()`, with the result truncated to the
+/// same time resolution as other files’ modification times. Using `.hg`
+/// instead of the system’s default temporary directory (such as `/tmp`) makes
+/// it more likely the temporary file is in the same disk partition as contents
+/// of the working directory, which can matter since different filesystems may
+/// store timestamps with different resolutions.
+///
+/// This may fail, typically if we lack write permissions. In that case we
+/// should continue the `status()` algoritm anyway and consider the current
+/// date/time to be unknown.
+fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
+    tempfile::tempfile_in(repo_root.join(".hg"))?
+        .metadata()?
+        .modified()
+}