comparison rust/hg-core/src/dirstate_tree/status.rs @ 47349:7138c863d0a1

dirstate-v2: Skip readdir in status based on directory mtime When calling `read_dir` during `status` and the directory is found to be eligible for caching (see code comments), write the directory’s mtime to the dirstate. The presence of a directory mtime in the dirstate is meaningful and indicates eligibility. When an eligible directory mtime is found in the dirstate and `stat()` shows that the mtime has not changed, `status` can skip calling `read_dir` again and instead rely on the names of child nodes in the dirstate tree. The `tempfile` crate is used to create a temporary file in order to use its modification time as "current time" with the same truncation as other files and directories would have in their own modification time. Differential Revision: https://phab.mercurial-scm.org/D10826
author Simon Sapin <simon.sapin@octobus.net>
date Fri, 28 May 2021 11:48:59 +0200
parents 73ddcedeaadf
children 04d1f17f49e7
comparison
equal deleted inserted replaced
47348:a4de570e61fa 47349:7138c863d0a1
1 use crate::dirstate::status::IgnoreFnType; 1 use crate::dirstate::status::IgnoreFnType;
2 use crate::dirstate_tree::dirstate_map::BorrowedPath; 2 use crate::dirstate_tree::dirstate_map::BorrowedPath;
3 use crate::dirstate_tree::dirstate_map::ChildNodesRef; 3 use crate::dirstate_tree::dirstate_map::ChildNodesRef;
4 use crate::dirstate_tree::dirstate_map::DirstateMap; 4 use crate::dirstate_tree::dirstate_map::DirstateMap;
5 use crate::dirstate_tree::dirstate_map::NodeData;
5 use crate::dirstate_tree::dirstate_map::NodeRef; 6 use crate::dirstate_tree::dirstate_map::NodeRef;
6 use crate::dirstate_tree::on_disk::DirstateV2ParseError; 7 use crate::dirstate_tree::on_disk::DirstateV2ParseError;
8 use crate::dirstate_tree::on_disk::Timestamp;
9 use crate::dirstate_tree::path_with_basename::WithBasename;
7 use crate::matchers::get_ignore_function; 10 use crate::matchers::get_ignore_function;
8 use crate::matchers::Matcher; 11 use crate::matchers::Matcher;
9 use crate::utils::files::get_bytes_from_os_string; 12 use crate::utils::files::get_bytes_from_os_string;
10 use crate::utils::files::get_path_from_bytes; 13 use crate::utils::files::get_path_from_bytes;
11 use crate::utils::hg_path::HgPath; 14 use crate::utils::hg_path::HgPath;
16 use crate::PatternFileWarning; 19 use crate::PatternFileWarning;
17 use crate::StatusError; 20 use crate::StatusError;
18 use crate::StatusOptions; 21 use crate::StatusOptions;
19 use micro_timer::timed; 22 use micro_timer::timed;
20 use rayon::prelude::*; 23 use rayon::prelude::*;
24 use std::borrow::Cow;
21 use std::io; 25 use std::io;
22 use std::path::Path; 26 use std::path::Path;
23 use std::path::PathBuf; 27 use std::path::PathBuf;
24 use std::sync::Mutex; 28 use std::sync::Mutex;
29 use std::time::SystemTime;
25 30
26 /// Returns the status of the working directory compared to its parent 31 /// Returns the status of the working directory compared to its parent
27 /// changeset. 32 /// changeset.
28 /// 33 ///
29 /// This algorithm is based on traversing the filesystem tree (`fs` in function 34 /// This algorithm is based on traversing the filesystem tree (`fs` in function
50 let common = StatusCommon { 55 let common = StatusCommon {
51 dmap, 56 dmap,
52 options, 57 options,
53 matcher, 58 matcher,
54 ignore_fn, 59 ignore_fn,
55 outcome: Mutex::new(DirstateStatus::default()), 60 outcome: Default::default(),
61 cached_directory_mtimes_to_add: Default::default(),
62 filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
56 }; 63 };
57 let is_at_repo_root = true; 64 let is_at_repo_root = true;
58 let hg_path = &BorrowedPath::OnDisk(HgPath::new("")); 65 let hg_path = &BorrowedPath::OnDisk(HgPath::new(""));
59 let has_ignored_ancestor = false; 66 let has_ignored_ancestor = false;
67 let root_cached_mtime = None;
68 let root_dir_metadata = None;
69 // If the path we have for the repository root is a symlink, do follow it.
70 // (As opposed to symlinks within the working directory which are not
71 // followed, using `std::fs::symlink_metadata`.)
60 common.traverse_fs_directory_and_dirstate( 72 common.traverse_fs_directory_and_dirstate(
61 has_ignored_ancestor, 73 has_ignored_ancestor,
62 dmap.root.as_ref(), 74 dmap.root.as_ref(),
63 hg_path, 75 hg_path,
64 &root_dir, 76 &root_dir,
77 root_dir_metadata,
78 root_cached_mtime,
65 is_at_repo_root, 79 is_at_repo_root,
66 )?; 80 )?;
67 Ok((common.outcome.into_inner().unwrap(), warnings)) 81 let outcome = common.outcome.into_inner().unwrap();
82 let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
83 for (path, mtime) in &to_add {
84 let node = DirstateMap::get_or_insert_node(
85 dmap.on_disk,
86 &mut dmap.root,
87 path,
88 WithBasename::to_cow_owned,
89 |_| {},
90 )?;
91 match &node.data {
92 NodeData::Entry(_) => {} // Don’t overwrite an entry
93 NodeData::CachedDirectory { .. } | NodeData::None => {
94 node.data = NodeData::CachedDirectory { mtime: *mtime }
95 }
96 }
97 }
98 Ok((outcome, warnings))
68 } 99 }
69 100
70 /// Bag of random things needed by various parts of the algorithm. Reduces the 101 /// Bag of random things needed by various parts of the algorithm. Reduces the
71 /// number of parameters passed to functions. 102 /// number of parameters passed to functions.
72 struct StatusCommon<'a, 'tree, 'on_disk: 'tree> { 103 struct StatusCommon<'a, 'tree, 'on_disk: 'tree> {
73 dmap: &'tree DirstateMap<'on_disk>, 104 dmap: &'tree DirstateMap<'on_disk>,
74 options: StatusOptions, 105 options: StatusOptions,
75 matcher: &'a (dyn Matcher + Sync), 106 matcher: &'a (dyn Matcher + Sync),
76 ignore_fn: IgnoreFnType<'a>, 107 ignore_fn: IgnoreFnType<'a>,
77 outcome: Mutex<DirstateStatus<'on_disk>>, 108 outcome: Mutex<DirstateStatus<'on_disk>>,
109 cached_directory_mtimes_to_add:
110 Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
111
112 /// The current time at the start of the `status()` algorithm, as measured
113 /// and possibly truncated by the filesystem.
114 filesystem_time_at_status_start: Option<SystemTime>,
78 } 115 }
79 116
80 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> { 117 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> {
81 fn read_dir( 118 fn read_dir(
82 &self, 119 &self,
95 .unwrap() 132 .unwrap()
96 .bad 133 .bad
97 .push((hg_path.to_owned().into(), BadMatch::OsError(errno))) 134 .push((hg_path.to_owned().into(), BadMatch::OsError(errno)))
98 } 135 }
99 136
137 /// If this returns true, we can get accurate results by only using
138 /// `symlink_metadata` for child nodes that exist in the dirstate and don’t
139 /// need to call `read_dir`.
140 fn can_skip_fs_readdir(
141 &self,
142 directory_metadata: Option<&std::fs::Metadata>,
143 cached_directory_mtime: Option<&Timestamp>,
144 ) -> bool {
145 if !self.options.list_unknown && !self.options.list_ignored {
146 // All states that we care about listing have corresponding
147 // dirstate entries.
148 // This happens for example with `hg status -mard`.
149 return true;
150 }
151 if let Some(cached_mtime) = cached_directory_mtime {
152 // The dirstate contains a cached mtime for this directory, set by
153 // a previous run of the `status` algorithm which found this
154 // directory eligible for `read_dir` caching.
155 if let Some(meta) = directory_metadata {
156 if let Ok(current_mtime) = meta.modified() {
157 if current_mtime == cached_mtime.into() {
158 // The mtime of that directory has not changed since
159 // then, which means that the
160 // results of `read_dir` should also
161 // be unchanged.
162 return true;
163 }
164 }
165 }
166 }
167 false
168 }
169
170 /// Returns whether the filesystem directory was found to have any entry
171 /// that does not have a corresponding dirstate tree node.
100 fn traverse_fs_directory_and_dirstate( 172 fn traverse_fs_directory_and_dirstate(
101 &self, 173 &self,
102 has_ignored_ancestor: bool, 174 has_ignored_ancestor: bool,
103 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>, 175 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>,
104 directory_hg_path: &BorrowedPath<'tree, 'on_disk>, 176 directory_hg_path: &BorrowedPath<'tree, 'on_disk>,
105 directory_fs_path: &Path, 177 directory_fs_path: &Path,
178 directory_metadata: Option<&std::fs::Metadata>,
179 cached_directory_mtime: Option<&Timestamp>,
106 is_at_repo_root: bool, 180 is_at_repo_root: bool,
107 ) -> Result<(), DirstateV2ParseError> { 181 ) -> Result<bool, DirstateV2ParseError> {
108 if !self.options.list_unknown && !self.options.list_ignored { 182 if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime)
109 // We only care about files in the dirstate, so we can skip listing 183 {
110 // filesystem directories entirely. 184 dirstate_nodes
111 return dirstate_nodes
112 .par_iter() 185 .par_iter()
113 .map(|dirstate_node| { 186 .map(|dirstate_node| {
114 let fs_path = directory_fs_path.join(get_path_from_bytes( 187 let fs_path = directory_fs_path.join(get_path_from_bytes(
115 dirstate_node.base_name(self.dmap.on_disk)?.as_bytes(), 188 dirstate_node.base_name(self.dmap.on_disk)?.as_bytes(),
116 )); 189 ));
129 dirstate_node.full_path(self.dmap.on_disk)?; 202 dirstate_node.full_path(self.dmap.on_disk)?;
130 Ok(self.io_error(error, hg_path)) 203 Ok(self.io_error(error, hg_path))
131 } 204 }
132 } 205 }
133 }) 206 })
134 .collect(); 207 .collect::<Result<_, _>>()?;
208
209 // Conservatively don’t let the caller assume that there aren’t
210 // any, since we don’t know.
211 let directory_has_any_fs_only_entry = true;
212
213 return Ok(directory_has_any_fs_only_entry);
135 } 214 }
136 215
137 let mut fs_entries = if let Ok(entries) = self.read_dir( 216 let mut fs_entries = if let Ok(entries) = self.read_dir(
138 directory_hg_path, 217 directory_hg_path,
139 directory_fs_path, 218 directory_fs_path,
172 }, 251 },
173 ) 252 )
174 .par_bridge() 253 .par_bridge()
175 .map(|pair| { 254 .map(|pair| {
176 use itertools::EitherOrBoth::*; 255 use itertools::EitherOrBoth::*;
256 let is_fs_only = pair.is_right();
177 match pair { 257 match pair {
178 Both(dirstate_node, fs_entry) => self 258 Both(dirstate_node, fs_entry) => self
179 .traverse_fs_and_dirstate( 259 .traverse_fs_and_dirstate(
180 &fs_entry.full_path, 260 &fs_entry.full_path,
181 &fs_entry.metadata, 261 &fs_entry.metadata,
182 dirstate_node, 262 dirstate_node,
183 has_ignored_ancestor, 263 has_ignored_ancestor,
184 ), 264 )?,
185 Left(dirstate_node) => { 265 Left(dirstate_node) => {
186 self.traverse_dirstate_only(dirstate_node) 266 self.traverse_dirstate_only(dirstate_node)?
187 } 267 }
188 Right(fs_entry) => Ok(self.traverse_fs_only( 268 Right(fs_entry) => self.traverse_fs_only(
189 has_ignored_ancestor, 269 has_ignored_ancestor,
190 directory_hg_path, 270 directory_hg_path,
191 fs_entry, 271 fs_entry,
192 )), 272 ),
193 } 273 }
274 Ok(is_fs_only)
194 }) 275 })
195 .collect() 276 .try_reduce(|| false, |a, b| Ok(a || b))
196 } 277 }
197 278
198 fn traverse_fs_and_dirstate( 279 fn traverse_fs_and_dirstate(
199 &self, 280 &self,
200 fs_path: &Path, 281 fs_path: &Path,
222 .traversed 303 .traversed
223 .push(hg_path.detach_from_tree()) 304 .push(hg_path.detach_from_tree())
224 } 305 }
225 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path); 306 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path);
226 let is_at_repo_root = false; 307 let is_at_repo_root = false;
227 self.traverse_fs_directory_and_dirstate( 308 let directory_has_any_fs_only_entry = self
228 is_ignored, 309 .traverse_fs_directory_and_dirstate(
229 dirstate_node.children(self.dmap.on_disk)?, 310 is_ignored,
230 hg_path, 311 dirstate_node.children(self.dmap.on_disk)?,
231 fs_path, 312 hg_path,
232 is_at_repo_root, 313 fs_path,
314 Some(fs_metadata),
315 dirstate_node.cached_directory_mtime(),
316 is_at_repo_root,
317 )?;
318 self.maybe_save_directory_mtime(
319 directory_has_any_fs_only_entry,
320 fs_metadata,
321 dirstate_node,
233 )? 322 )?
234 } else { 323 } else {
235 if file_or_symlink && self.matcher.matches(hg_path) { 324 if file_or_symlink && self.matcher.matches(hg_path) {
236 if let Some(state) = dirstate_node.state()? { 325 if let Some(state) = dirstate_node.state()? {
237 match state { 326 match state {
267 } 356 }
268 357
269 for child_node in dirstate_node.children(self.dmap.on_disk)?.iter() 358 for child_node in dirstate_node.children(self.dmap.on_disk)?.iter()
270 { 359 {
271 self.traverse_dirstate_only(child_node)? 360 self.traverse_dirstate_only(child_node)?
361 }
362 }
363 Ok(())
364 }
365
366 fn maybe_save_directory_mtime(
367 &self,
368 directory_has_any_fs_only_entry: bool,
369 directory_metadata: &std::fs::Metadata,
370 dirstate_node: NodeRef<'tree, 'on_disk>,
371 ) -> Result<(), DirstateV2ParseError> {
372 if !directory_has_any_fs_only_entry {
373 // All filesystem directory entries from `read_dir` have a
374 // corresponding node in the dirstate, so we can reconstitute the
375 // names of those entries without calling `read_dir` again.
376 if let (Some(status_start), Ok(directory_mtime)) = (
377 &self.filesystem_time_at_status_start,
378 directory_metadata.modified(),
379 ) {
380 // Although the Rust standard library’s `SystemTime` type
381 // has nanosecond precision, the times reported for a
382 // directory’s (or file’s) modified time may have lower
383 // resolution based on the filesystem (for example ext3
384 // only stores integer seconds), kernel (see
385 // https://stackoverflow.com/a/14393315/1162888), etc.
386 if &directory_mtime >= status_start {
387 // The directory was modified too recently, don’t cache its
388 // `read_dir` results.
389 //
390 // A timeline like this is possible:
391 //
392 // 1. A change to this directory (direct child was
393 // added or removed) cause its mtime to be set
394 // (possibly truncated) to `directory_mtime`
395 // 2. This `status` algorithm calls `read_dir`
396 // 3. An other change is made to the same directory is
397 // made so that calling `read_dir` agin would give
398 // different results, but soon enough after 1. that
399 // the mtime stays the same
400 //
401 // On a system where the time resolution poor, this
402 // scenario is not unlikely if all three steps are caused
403 // by the same script.
404 } else {
405 // We’ve observed (through `status_start`) that time has
406 // “progressed” since `directory_mtime`, so any further
407 // change to this directory is extremely likely to cause a
408 // different mtime.
409 //
410 // Having the same mtime again is not entirely impossible
411 // since the system clock is not monotonous. It could jump
412 // backward to some point before `directory_mtime`, then a
413 // directory change could potentially happen during exactly
414 // the wrong tick.
415 //
416 // We deem this scenario (unlike the previous one) to be
417 // unlikely enough in practice.
418 let timestamp = directory_mtime.into();
419 let cached = dirstate_node.cached_directory_mtime();
420 if cached != Some(&timestamp) {
421 let hg_path = dirstate_node
422 .full_path_borrowed(self.dmap.on_disk)?
423 .detach_from_tree();
424 self.cached_directory_mtimes_to_add
425 .lock()
426 .unwrap()
427 .push((hg_path, timestamp))
428 }
429 }
272 } 430 }
273 } 431 }
274 Ok(()) 432 Ok(())
275 } 433 }
276 434
503 }) 661 })
504 } 662 }
505 Ok(results) 663 Ok(results)
506 } 664 }
507 } 665 }
666
667 /// Return the `mtime` of a temporary file newly-created in the `.hg` directory
668 /// of the give repository.
669 ///
670 /// This is similar to `SystemTime::now()`, with the result truncated to the
671 /// same time resolution as other files’ modification times. Using `.hg`
672 /// instead of the system’s default temporary directory (such as `/tmp`) makes
673 /// it more likely the temporary file is in the same disk partition as contents
674 /// of the working directory, which can matter since different filesystems may
675 /// store timestamps with different resolutions.
676 ///
677 /// This may fail, typically if we lack write permissions. In that case we
678 /// should continue the `status()` algoritm anyway and consider the current
679 /// date/time to be unknown.
680 fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> {
681 tempfile::tempfile_in(repo_root.join(".hg"))?
682 .metadata()?
683 .modified()
684 }