# HG changeset patch # User Simon Sapin # Date 1622195339 -7200 # Node ID 7138c863d0a1d1346705e52abda3adca9f505491 # Parent a4de570e61fae4d27144282edbb6463c1dc14304 dirstate-v2: Skip readdir in status based on directory mtime When calling `read_dir` during `status` and the directory is found to be eligible for caching (see code comments), write the directory’s mtime to the dirstate. The presence of a directory mtime in the dirstate is meaningful and indicates eligibility. When an eligible directory mtime is found in the dirstate and `stat()` shows that the mtime has not changed, `status` can skip calling `read_dir` again and instead rely on the names of child nodes in the dirstate tree. The `tempfile` crate is used to create a temporary file in order to use its modification time as "current time" with the same truncation as other files and directories would have in their own modification time. Differential Revision: https://phab.mercurial-scm.org/D10826 diff -r a4de570e61fa -r 7138c863d0a1 rust/hg-core/Cargo.toml --- a/rust/hg-core/Cargo.toml Thu May 27 18:40:54 2021 +0200 +++ b/rust/hg-core/Cargo.toml Fri May 28 11:48:59 2021 +0200 @@ -23,6 +23,7 @@ regex = "1.3.9" twox-hash = "1.5.0" same-file = "1.0.6" +tempfile = "3.1.0" crossbeam-channel = "0.4" micro-timer = "0.3.0" log = "0.4.8" @@ -41,4 +42,3 @@ [dev-dependencies] clap = "*" pretty_assertions = "0.6.1" -tempfile = "3.1.0" diff -r a4de570e61fa -r 7138c863d0a1 rust/hg-core/src/dirstate_tree/dirstate_map.rs --- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs Thu May 27 18:40:54 2021 +0200 +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs Fri May 28 11:48:59 2021 +0200 @@ -317,6 +317,18 @@ } } + pub(super) fn cached_directory_mtime( + &self, + ) -> Option<&on_disk::Timestamp> { + match self { + NodeRef::InMemory(_path, node) => match &node.data { + NodeData::CachedDirectory { mtime } => Some(mtime), + _ => None, + }, + NodeRef::OnDisk(node) => node.cached_directory_mtime(), + } + } + pub(super) fn tracked_descendants_count(&self) -> u32 { match self { NodeRef::InMemory(_path, node) => node.tracked_descendants_count, @@ -479,7 +491,7 @@ } } - fn get_or_insert_node<'tree, 'path>( + pub(super) fn get_or_insert_node<'tree, 'path>( on_disk: &'on_disk [u8], root: &'tree mut ChildNodes<'on_disk>, path: &'path HgPath, diff -r a4de570e61fa -r 7138c863d0a1 rust/hg-core/src/dirstate_tree/on_disk.rs --- a/rust/hg-core/src/dirstate_tree/on_disk.rs Thu May 27 18:40:54 2021 +0200 +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Fri May 28 11:48:59 2021 +0200 @@ -56,13 +56,31 @@ /// Dependending on the value of `state`: /// - /// * A null byte: `data` represents nothing + /// * A null byte: `data` is not used. + /// /// * A `n`, `a`, `r`, or `m` ASCII byte: `state` and `data` together - /// represents a dirstate entry like in the v1 format. + /// represent a dirstate entry like in the v1 format. + /// /// * A `d` ASCII byte: the bytes of `data` should instead be interpreted /// as the `Timestamp` for the mtime of a cached directory. /// - /// TODO: document directory caching + /// The presence of this state means that at some point, this path in + /// the working directory was observed: + /// + /// - To be a directory + /// - With the modification time as given by `Timestamp` + /// - That timestamp was already strictly in the past when observed, + /// meaning that later changes cannot happen in the same clock tick + /// and must cause a different modification time (unless the system + /// clock jumps back and we get unlucky, which is not impossible but + /// but deemed unlikely enough). + /// - The directory did not contain any child entry that did not have a + /// corresponding dirstate node. + /// + /// This means that if `std::fs::symlink_metadata` later reports the + /// same modification time, we don’t need to call `std::fs::read_dir` + /// again for this directory and can iterate child dirstate nodes + /// instead. state: u8, data: Entry, } @@ -76,7 +94,7 @@ } /// Duration since the Unix epoch -#[derive(BytesCast, Copy, Clone)] +#[derive(BytesCast, Copy, Clone, PartialEq)] #[repr(C)] pub(super) struct Timestamp { seconds: I64Be, @@ -258,6 +276,14 @@ } } + pub(super) fn cached_directory_mtime(&self) -> Option<&Timestamp> { + if self.state == b'd' { + Some(self.data.as_timestamp()) + } else { + None + } + } + pub(super) fn state( &self, ) -> Result, DirstateV2ParseError> { @@ -326,8 +352,8 @@ } } -impl From<&'_ SystemTime> for Timestamp { - fn from(system_time: &'_ SystemTime) -> Self { +impl From for Timestamp { + fn from(system_time: SystemTime) -> Self { let (secs, nanos) = match system_time.duration_since(UNIX_EPOCH) { Ok(duration) => { (duration.as_secs() as i64, duration.subsec_nanos()) diff -r a4de570e61fa -r 7138c863d0a1 rust/hg-core/src/dirstate_tree/status.rs --- a/rust/hg-core/src/dirstate_tree/status.rs Thu May 27 18:40:54 2021 +0200 +++ b/rust/hg-core/src/dirstate_tree/status.rs Fri May 28 11:48:59 2021 +0200 @@ -2,8 +2,11 @@ use crate::dirstate_tree::dirstate_map::BorrowedPath; use crate::dirstate_tree::dirstate_map::ChildNodesRef; use crate::dirstate_tree::dirstate_map::DirstateMap; +use crate::dirstate_tree::dirstate_map::NodeData; use crate::dirstate_tree::dirstate_map::NodeRef; use crate::dirstate_tree::on_disk::DirstateV2ParseError; +use crate::dirstate_tree::on_disk::Timestamp; +use crate::dirstate_tree::path_with_basename::WithBasename; use crate::matchers::get_ignore_function; use crate::matchers::Matcher; use crate::utils::files::get_bytes_from_os_string; @@ -18,10 +21,12 @@ use crate::StatusOptions; use micro_timer::timed; use rayon::prelude::*; +use std::borrow::Cow; use std::io; use std::path::Path; use std::path::PathBuf; use std::sync::Mutex; +use std::time::SystemTime; /// Returns the status of the working directory compared to its parent /// changeset. @@ -52,19 +57,45 @@ options, matcher, ignore_fn, - outcome: Mutex::new(DirstateStatus::default()), + outcome: Default::default(), + cached_directory_mtimes_to_add: Default::default(), + filesystem_time_at_status_start: filesystem_now(&root_dir).ok(), }; let is_at_repo_root = true; let hg_path = &BorrowedPath::OnDisk(HgPath::new("")); let has_ignored_ancestor = false; + let root_cached_mtime = None; + let root_dir_metadata = None; + // If the path we have for the repository root is a symlink, do follow it. + // (As opposed to symlinks within the working directory which are not + // followed, using `std::fs::symlink_metadata`.) common.traverse_fs_directory_and_dirstate( has_ignored_ancestor, dmap.root.as_ref(), hg_path, &root_dir, + root_dir_metadata, + root_cached_mtime, is_at_repo_root, )?; - Ok((common.outcome.into_inner().unwrap(), warnings)) + let outcome = common.outcome.into_inner().unwrap(); + let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap(); + for (path, mtime) in &to_add { + let node = DirstateMap::get_or_insert_node( + dmap.on_disk, + &mut dmap.root, + path, + WithBasename::to_cow_owned, + |_| {}, + )?; + match &node.data { + NodeData::Entry(_) => {} // Don’t overwrite an entry + NodeData::CachedDirectory { .. } | NodeData::None => { + node.data = NodeData::CachedDirectory { mtime: *mtime } + } + } + } + Ok((outcome, warnings)) } /// Bag of random things needed by various parts of the algorithm. Reduces the @@ -75,6 +106,12 @@ matcher: &'a (dyn Matcher + Sync), ignore_fn: IgnoreFnType<'a>, outcome: Mutex>, + cached_directory_mtimes_to_add: + Mutex, Timestamp)>>, + + /// The current time at the start of the `status()` algorithm, as measured + /// and possibly truncated by the filesystem. + filesystem_time_at_status_start: Option, } impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> { @@ -97,18 +134,54 @@ .push((hg_path.to_owned().into(), BadMatch::OsError(errno))) } + /// If this returns true, we can get accurate results by only using + /// `symlink_metadata` for child nodes that exist in the dirstate and don’t + /// need to call `read_dir`. + fn can_skip_fs_readdir( + &self, + directory_metadata: Option<&std::fs::Metadata>, + cached_directory_mtime: Option<&Timestamp>, + ) -> bool { + if !self.options.list_unknown && !self.options.list_ignored { + // All states that we care about listing have corresponding + // dirstate entries. + // This happens for example with `hg status -mard`. + return true; + } + if let Some(cached_mtime) = cached_directory_mtime { + // The dirstate contains a cached mtime for this directory, set by + // a previous run of the `status` algorithm which found this + // directory eligible for `read_dir` caching. + if let Some(meta) = directory_metadata { + if let Ok(current_mtime) = meta.modified() { + if current_mtime == cached_mtime.into() { + // The mtime of that directory has not changed since + // then, which means that the + // results of `read_dir` should also + // be unchanged. + return true; + } + } + } + } + false + } + + /// Returns whether the filesystem directory was found to have any entry + /// that does not have a corresponding dirstate tree node. fn traverse_fs_directory_and_dirstate( &self, has_ignored_ancestor: bool, dirstate_nodes: ChildNodesRef<'tree, 'on_disk>, directory_hg_path: &BorrowedPath<'tree, 'on_disk>, directory_fs_path: &Path, + directory_metadata: Option<&std::fs::Metadata>, + cached_directory_mtime: Option<&Timestamp>, is_at_repo_root: bool, - ) -> Result<(), DirstateV2ParseError> { - if !self.options.list_unknown && !self.options.list_ignored { - // We only care about files in the dirstate, so we can skip listing - // filesystem directories entirely. - return dirstate_nodes + ) -> Result { + if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime) + { + dirstate_nodes .par_iter() .map(|dirstate_node| { let fs_path = directory_fs_path.join(get_path_from_bytes( @@ -131,7 +204,13 @@ } } }) - .collect(); + .collect::>()?; + + // Conservatively don’t let the caller assume that there aren’t + // any, since we don’t know. + let directory_has_any_fs_only_entry = true; + + return Ok(directory_has_any_fs_only_entry); } let mut fs_entries = if let Ok(entries) = self.read_dir( @@ -174,6 +253,7 @@ .par_bridge() .map(|pair| { use itertools::EitherOrBoth::*; + let is_fs_only = pair.is_right(); match pair { Both(dirstate_node, fs_entry) => self .traverse_fs_and_dirstate( @@ -181,18 +261,19 @@ &fs_entry.metadata, dirstate_node, has_ignored_ancestor, - ), + )?, Left(dirstate_node) => { - self.traverse_dirstate_only(dirstate_node) + self.traverse_dirstate_only(dirstate_node)? } - Right(fs_entry) => Ok(self.traverse_fs_only( + Right(fs_entry) => self.traverse_fs_only( has_ignored_ancestor, directory_hg_path, fs_entry, - )), + ), } + Ok(is_fs_only) }) - .collect() + .try_reduce(|| false, |a, b| Ok(a || b)) } fn traverse_fs_and_dirstate( @@ -224,12 +305,20 @@ } let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path); let is_at_repo_root = false; - self.traverse_fs_directory_and_dirstate( - is_ignored, - dirstate_node.children(self.dmap.on_disk)?, - hg_path, - fs_path, - is_at_repo_root, + let directory_has_any_fs_only_entry = self + .traverse_fs_directory_and_dirstate( + is_ignored, + dirstate_node.children(self.dmap.on_disk)?, + hg_path, + fs_path, + Some(fs_metadata), + dirstate_node.cached_directory_mtime(), + is_at_repo_root, + )?; + self.maybe_save_directory_mtime( + directory_has_any_fs_only_entry, + fs_metadata, + dirstate_node, )? } else { if file_or_symlink && self.matcher.matches(hg_path) { @@ -274,6 +363,75 @@ Ok(()) } + fn maybe_save_directory_mtime( + &self, + directory_has_any_fs_only_entry: bool, + directory_metadata: &std::fs::Metadata, + dirstate_node: NodeRef<'tree, 'on_disk>, + ) -> Result<(), DirstateV2ParseError> { + if !directory_has_any_fs_only_entry { + // All filesystem directory entries from `read_dir` have a + // corresponding node in the dirstate, so we can reconstitute the + // names of those entries without calling `read_dir` again. + if let (Some(status_start), Ok(directory_mtime)) = ( + &self.filesystem_time_at_status_start, + directory_metadata.modified(), + ) { + // Although the Rust standard library’s `SystemTime` type + // has nanosecond precision, the times reported for a + // directory’s (or file’s) modified time may have lower + // resolution based on the filesystem (for example ext3 + // only stores integer seconds), kernel (see + // https://stackoverflow.com/a/14393315/1162888), etc. + if &directory_mtime >= status_start { + // The directory was modified too recently, don’t cache its + // `read_dir` results. + // + // A timeline like this is possible: + // + // 1. A change to this directory (direct child was + // added or removed) cause its mtime to be set + // (possibly truncated) to `directory_mtime` + // 2. This `status` algorithm calls `read_dir` + // 3. An other change is made to the same directory is + // made so that calling `read_dir` agin would give + // different results, but soon enough after 1. that + // the mtime stays the same + // + // On a system where the time resolution poor, this + // scenario is not unlikely if all three steps are caused + // by the same script. + } else { + // We’ve observed (through `status_start`) that time has + // “progressed” since `directory_mtime`, so any further + // change to this directory is extremely likely to cause a + // different mtime. + // + // Having the same mtime again is not entirely impossible + // since the system clock is not monotonous. It could jump + // backward to some point before `directory_mtime`, then a + // directory change could potentially happen during exactly + // the wrong tick. + // + // We deem this scenario (unlike the previous one) to be + // unlikely enough in practice. + let timestamp = directory_mtime.into(); + let cached = dirstate_node.cached_directory_mtime(); + if cached != Some(×tamp) { + let hg_path = dirstate_node + .full_path_borrowed(self.dmap.on_disk)? + .detach_from_tree(); + self.cached_directory_mtimes_to_add + .lock() + .unwrap() + .push((hg_path, timestamp)) + } + } + } + } + Ok(()) + } + /// A file with `EntryState::Normal` in the dirstate was found in the /// filesystem fn handle_normal_file( @@ -505,3 +663,22 @@ Ok(results) } } + +/// Return the `mtime` of a temporary file newly-created in the `.hg` directory +/// of the give repository. +/// +/// This is similar to `SystemTime::now()`, with the result truncated to the +/// same time resolution as other files’ modification times. Using `.hg` +/// instead of the system’s default temporary directory (such as `/tmp`) makes +/// it more likely the temporary file is in the same disk partition as contents +/// of the working directory, which can matter since different filesystems may +/// store timestamps with different resolutions. +/// +/// This may fail, typically if we lack write permissions. In that case we +/// should continue the `status()` algoritm anyway and consider the current +/// date/time to be unknown. +fn filesystem_now(repo_root: &Path) -> Result { + tempfile::tempfile_in(repo_root.join(".hg"))? + .metadata()? + .modified() +}