Mercurial > hg
comparison rust/hg-core/src/dirstate_tree/status.rs @ 47349:7138c863d0a1
dirstate-v2: Skip readdir in status based on directory mtime
When calling `read_dir` during `status` and the directory is found to be
eligible for caching (see code comments), write the directory’s mtime to the
dirstate. The presence of a directory mtime in the dirstate is meaningful
and indicates eligibility.
When an eligible directory mtime is found in the dirstate and `stat()` shows
that the mtime has not changed, `status` can skip calling `read_dir` again
and instead rely on the names of child nodes in the dirstate tree.
The `tempfile` crate is used to create a temporary file in order to use its
modification time as "current time" with the same truncation as other files
and directories would have in their own modification time.
Differential Revision: https://phab.mercurial-scm.org/D10826
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Fri, 28 May 2021 11:48:59 +0200 |
parents | 73ddcedeaadf |
children | 04d1f17f49e7 |
comparison
equal
deleted
inserted
replaced
47348:a4de570e61fa | 47349:7138c863d0a1 |
---|---|
1 use crate::dirstate::status::IgnoreFnType; | 1 use crate::dirstate::status::IgnoreFnType; |
2 use crate::dirstate_tree::dirstate_map::BorrowedPath; | 2 use crate::dirstate_tree::dirstate_map::BorrowedPath; |
3 use crate::dirstate_tree::dirstate_map::ChildNodesRef; | 3 use crate::dirstate_tree::dirstate_map::ChildNodesRef; |
4 use crate::dirstate_tree::dirstate_map::DirstateMap; | 4 use crate::dirstate_tree::dirstate_map::DirstateMap; |
5 use crate::dirstate_tree::dirstate_map::NodeData; | |
5 use crate::dirstate_tree::dirstate_map::NodeRef; | 6 use crate::dirstate_tree::dirstate_map::NodeRef; |
6 use crate::dirstate_tree::on_disk::DirstateV2ParseError; | 7 use crate::dirstate_tree::on_disk::DirstateV2ParseError; |
8 use crate::dirstate_tree::on_disk::Timestamp; | |
9 use crate::dirstate_tree::path_with_basename::WithBasename; | |
7 use crate::matchers::get_ignore_function; | 10 use crate::matchers::get_ignore_function; |
8 use crate::matchers::Matcher; | 11 use crate::matchers::Matcher; |
9 use crate::utils::files::get_bytes_from_os_string; | 12 use crate::utils::files::get_bytes_from_os_string; |
10 use crate::utils::files::get_path_from_bytes; | 13 use crate::utils::files::get_path_from_bytes; |
11 use crate::utils::hg_path::HgPath; | 14 use crate::utils::hg_path::HgPath; |
16 use crate::PatternFileWarning; | 19 use crate::PatternFileWarning; |
17 use crate::StatusError; | 20 use crate::StatusError; |
18 use crate::StatusOptions; | 21 use crate::StatusOptions; |
19 use micro_timer::timed; | 22 use micro_timer::timed; |
20 use rayon::prelude::*; | 23 use rayon::prelude::*; |
24 use std::borrow::Cow; | |
21 use std::io; | 25 use std::io; |
22 use std::path::Path; | 26 use std::path::Path; |
23 use std::path::PathBuf; | 27 use std::path::PathBuf; |
24 use std::sync::Mutex; | 28 use std::sync::Mutex; |
29 use std::time::SystemTime; | |
25 | 30 |
26 /// Returns the status of the working directory compared to its parent | 31 /// Returns the status of the working directory compared to its parent |
27 /// changeset. | 32 /// changeset. |
28 /// | 33 /// |
29 /// This algorithm is based on traversing the filesystem tree (`fs` in function | 34 /// This algorithm is based on traversing the filesystem tree (`fs` in function |
50 let common = StatusCommon { | 55 let common = StatusCommon { |
51 dmap, | 56 dmap, |
52 options, | 57 options, |
53 matcher, | 58 matcher, |
54 ignore_fn, | 59 ignore_fn, |
55 outcome: Mutex::new(DirstateStatus::default()), | 60 outcome: Default::default(), |
61 cached_directory_mtimes_to_add: Default::default(), | |
62 filesystem_time_at_status_start: filesystem_now(&root_dir).ok(), | |
56 }; | 63 }; |
57 let is_at_repo_root = true; | 64 let is_at_repo_root = true; |
58 let hg_path = &BorrowedPath::OnDisk(HgPath::new("")); | 65 let hg_path = &BorrowedPath::OnDisk(HgPath::new("")); |
59 let has_ignored_ancestor = false; | 66 let has_ignored_ancestor = false; |
67 let root_cached_mtime = None; | |
68 let root_dir_metadata = None; | |
69 // If the path we have for the repository root is a symlink, do follow it. | |
70 // (As opposed to symlinks within the working directory which are not | |
71 // followed, using `std::fs::symlink_metadata`.) | |
60 common.traverse_fs_directory_and_dirstate( | 72 common.traverse_fs_directory_and_dirstate( |
61 has_ignored_ancestor, | 73 has_ignored_ancestor, |
62 dmap.root.as_ref(), | 74 dmap.root.as_ref(), |
63 hg_path, | 75 hg_path, |
64 &root_dir, | 76 &root_dir, |
77 root_dir_metadata, | |
78 root_cached_mtime, | |
65 is_at_repo_root, | 79 is_at_repo_root, |
66 )?; | 80 )?; |
67 Ok((common.outcome.into_inner().unwrap(), warnings)) | 81 let outcome = common.outcome.into_inner().unwrap(); |
82 let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap(); | |
83 for (path, mtime) in &to_add { | |
84 let node = DirstateMap::get_or_insert_node( | |
85 dmap.on_disk, | |
86 &mut dmap.root, | |
87 path, | |
88 WithBasename::to_cow_owned, | |
89 |_| {}, | |
90 )?; | |
91 match &node.data { | |
92 NodeData::Entry(_) => {} // Don’t overwrite an entry | |
93 NodeData::CachedDirectory { .. } | NodeData::None => { | |
94 node.data = NodeData::CachedDirectory { mtime: *mtime } | |
95 } | |
96 } | |
97 } | |
98 Ok((outcome, warnings)) | |
68 } | 99 } |
69 | 100 |
70 /// Bag of random things needed by various parts of the algorithm. Reduces the | 101 /// Bag of random things needed by various parts of the algorithm. Reduces the |
71 /// number of parameters passed to functions. | 102 /// number of parameters passed to functions. |
72 struct StatusCommon<'a, 'tree, 'on_disk: 'tree> { | 103 struct StatusCommon<'a, 'tree, 'on_disk: 'tree> { |
73 dmap: &'tree DirstateMap<'on_disk>, | 104 dmap: &'tree DirstateMap<'on_disk>, |
74 options: StatusOptions, | 105 options: StatusOptions, |
75 matcher: &'a (dyn Matcher + Sync), | 106 matcher: &'a (dyn Matcher + Sync), |
76 ignore_fn: IgnoreFnType<'a>, | 107 ignore_fn: IgnoreFnType<'a>, |
77 outcome: Mutex<DirstateStatus<'on_disk>>, | 108 outcome: Mutex<DirstateStatus<'on_disk>>, |
109 cached_directory_mtimes_to_add: | |
110 Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>, | |
111 | |
112 /// The current time at the start of the `status()` algorithm, as measured | |
113 /// and possibly truncated by the filesystem. | |
114 filesystem_time_at_status_start: Option<SystemTime>, | |
78 } | 115 } |
79 | 116 |
80 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> { | 117 impl<'a, 'tree, 'on_disk> StatusCommon<'a, 'tree, 'on_disk> { |
81 fn read_dir( | 118 fn read_dir( |
82 &self, | 119 &self, |
95 .unwrap() | 132 .unwrap() |
96 .bad | 133 .bad |
97 .push((hg_path.to_owned().into(), BadMatch::OsError(errno))) | 134 .push((hg_path.to_owned().into(), BadMatch::OsError(errno))) |
98 } | 135 } |
99 | 136 |
137 /// If this returns true, we can get accurate results by only using | |
138 /// `symlink_metadata` for child nodes that exist in the dirstate and don’t | |
139 /// need to call `read_dir`. | |
140 fn can_skip_fs_readdir( | |
141 &self, | |
142 directory_metadata: Option<&std::fs::Metadata>, | |
143 cached_directory_mtime: Option<&Timestamp>, | |
144 ) -> bool { | |
145 if !self.options.list_unknown && !self.options.list_ignored { | |
146 // All states that we care about listing have corresponding | |
147 // dirstate entries. | |
148 // This happens for example with `hg status -mard`. | |
149 return true; | |
150 } | |
151 if let Some(cached_mtime) = cached_directory_mtime { | |
152 // The dirstate contains a cached mtime for this directory, set by | |
153 // a previous run of the `status` algorithm which found this | |
154 // directory eligible for `read_dir` caching. | |
155 if let Some(meta) = directory_metadata { | |
156 if let Ok(current_mtime) = meta.modified() { | |
157 if current_mtime == cached_mtime.into() { | |
158 // The mtime of that directory has not changed since | |
159 // then, which means that the | |
160 // results of `read_dir` should also | |
161 // be unchanged. | |
162 return true; | |
163 } | |
164 } | |
165 } | |
166 } | |
167 false | |
168 } | |
169 | |
170 /// Returns whether the filesystem directory was found to have any entry | |
171 /// that does not have a corresponding dirstate tree node. | |
100 fn traverse_fs_directory_and_dirstate( | 172 fn traverse_fs_directory_and_dirstate( |
101 &self, | 173 &self, |
102 has_ignored_ancestor: bool, | 174 has_ignored_ancestor: bool, |
103 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>, | 175 dirstate_nodes: ChildNodesRef<'tree, 'on_disk>, |
104 directory_hg_path: &BorrowedPath<'tree, 'on_disk>, | 176 directory_hg_path: &BorrowedPath<'tree, 'on_disk>, |
105 directory_fs_path: &Path, | 177 directory_fs_path: &Path, |
178 directory_metadata: Option<&std::fs::Metadata>, | |
179 cached_directory_mtime: Option<&Timestamp>, | |
106 is_at_repo_root: bool, | 180 is_at_repo_root: bool, |
107 ) -> Result<(), DirstateV2ParseError> { | 181 ) -> Result<bool, DirstateV2ParseError> { |
108 if !self.options.list_unknown && !self.options.list_ignored { | 182 if self.can_skip_fs_readdir(directory_metadata, cached_directory_mtime) |
109 // We only care about files in the dirstate, so we can skip listing | 183 { |
110 // filesystem directories entirely. | 184 dirstate_nodes |
111 return dirstate_nodes | |
112 .par_iter() | 185 .par_iter() |
113 .map(|dirstate_node| { | 186 .map(|dirstate_node| { |
114 let fs_path = directory_fs_path.join(get_path_from_bytes( | 187 let fs_path = directory_fs_path.join(get_path_from_bytes( |
115 dirstate_node.base_name(self.dmap.on_disk)?.as_bytes(), | 188 dirstate_node.base_name(self.dmap.on_disk)?.as_bytes(), |
116 )); | 189 )); |
129 dirstate_node.full_path(self.dmap.on_disk)?; | 202 dirstate_node.full_path(self.dmap.on_disk)?; |
130 Ok(self.io_error(error, hg_path)) | 203 Ok(self.io_error(error, hg_path)) |
131 } | 204 } |
132 } | 205 } |
133 }) | 206 }) |
134 .collect(); | 207 .collect::<Result<_, _>>()?; |
208 | |
209 // Conservatively don’t let the caller assume that there aren’t | |
210 // any, since we don’t know. | |
211 let directory_has_any_fs_only_entry = true; | |
212 | |
213 return Ok(directory_has_any_fs_only_entry); | |
135 } | 214 } |
136 | 215 |
137 let mut fs_entries = if let Ok(entries) = self.read_dir( | 216 let mut fs_entries = if let Ok(entries) = self.read_dir( |
138 directory_hg_path, | 217 directory_hg_path, |
139 directory_fs_path, | 218 directory_fs_path, |
172 }, | 251 }, |
173 ) | 252 ) |
174 .par_bridge() | 253 .par_bridge() |
175 .map(|pair| { | 254 .map(|pair| { |
176 use itertools::EitherOrBoth::*; | 255 use itertools::EitherOrBoth::*; |
256 let is_fs_only = pair.is_right(); | |
177 match pair { | 257 match pair { |
178 Both(dirstate_node, fs_entry) => self | 258 Both(dirstate_node, fs_entry) => self |
179 .traverse_fs_and_dirstate( | 259 .traverse_fs_and_dirstate( |
180 &fs_entry.full_path, | 260 &fs_entry.full_path, |
181 &fs_entry.metadata, | 261 &fs_entry.metadata, |
182 dirstate_node, | 262 dirstate_node, |
183 has_ignored_ancestor, | 263 has_ignored_ancestor, |
184 ), | 264 )?, |
185 Left(dirstate_node) => { | 265 Left(dirstate_node) => { |
186 self.traverse_dirstate_only(dirstate_node) | 266 self.traverse_dirstate_only(dirstate_node)? |
187 } | 267 } |
188 Right(fs_entry) => Ok(self.traverse_fs_only( | 268 Right(fs_entry) => self.traverse_fs_only( |
189 has_ignored_ancestor, | 269 has_ignored_ancestor, |
190 directory_hg_path, | 270 directory_hg_path, |
191 fs_entry, | 271 fs_entry, |
192 )), | 272 ), |
193 } | 273 } |
274 Ok(is_fs_only) | |
194 }) | 275 }) |
195 .collect() | 276 .try_reduce(|| false, |a, b| Ok(a || b)) |
196 } | 277 } |
197 | 278 |
198 fn traverse_fs_and_dirstate( | 279 fn traverse_fs_and_dirstate( |
199 &self, | 280 &self, |
200 fs_path: &Path, | 281 fs_path: &Path, |
222 .traversed | 303 .traversed |
223 .push(hg_path.detach_from_tree()) | 304 .push(hg_path.detach_from_tree()) |
224 } | 305 } |
225 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path); | 306 let is_ignored = has_ignored_ancestor || (self.ignore_fn)(hg_path); |
226 let is_at_repo_root = false; | 307 let is_at_repo_root = false; |
227 self.traverse_fs_directory_and_dirstate( | 308 let directory_has_any_fs_only_entry = self |
228 is_ignored, | 309 .traverse_fs_directory_and_dirstate( |
229 dirstate_node.children(self.dmap.on_disk)?, | 310 is_ignored, |
230 hg_path, | 311 dirstate_node.children(self.dmap.on_disk)?, |
231 fs_path, | 312 hg_path, |
232 is_at_repo_root, | 313 fs_path, |
314 Some(fs_metadata), | |
315 dirstate_node.cached_directory_mtime(), | |
316 is_at_repo_root, | |
317 )?; | |
318 self.maybe_save_directory_mtime( | |
319 directory_has_any_fs_only_entry, | |
320 fs_metadata, | |
321 dirstate_node, | |
233 )? | 322 )? |
234 } else { | 323 } else { |
235 if file_or_symlink && self.matcher.matches(hg_path) { | 324 if file_or_symlink && self.matcher.matches(hg_path) { |
236 if let Some(state) = dirstate_node.state()? { | 325 if let Some(state) = dirstate_node.state()? { |
237 match state { | 326 match state { |
267 } | 356 } |
268 | 357 |
269 for child_node in dirstate_node.children(self.dmap.on_disk)?.iter() | 358 for child_node in dirstate_node.children(self.dmap.on_disk)?.iter() |
270 { | 359 { |
271 self.traverse_dirstate_only(child_node)? | 360 self.traverse_dirstate_only(child_node)? |
361 } | |
362 } | |
363 Ok(()) | |
364 } | |
365 | |
366 fn maybe_save_directory_mtime( | |
367 &self, | |
368 directory_has_any_fs_only_entry: bool, | |
369 directory_metadata: &std::fs::Metadata, | |
370 dirstate_node: NodeRef<'tree, 'on_disk>, | |
371 ) -> Result<(), DirstateV2ParseError> { | |
372 if !directory_has_any_fs_only_entry { | |
373 // All filesystem directory entries from `read_dir` have a | |
374 // corresponding node in the dirstate, so we can reconstitute the | |
375 // names of those entries without calling `read_dir` again. | |
376 if let (Some(status_start), Ok(directory_mtime)) = ( | |
377 &self.filesystem_time_at_status_start, | |
378 directory_metadata.modified(), | |
379 ) { | |
380 // Although the Rust standard library’s `SystemTime` type | |
381 // has nanosecond precision, the times reported for a | |
382 // directory’s (or file’s) modified time may have lower | |
383 // resolution based on the filesystem (for example ext3 | |
384 // only stores integer seconds), kernel (see | |
385 // https://stackoverflow.com/a/14393315/1162888), etc. | |
386 if &directory_mtime >= status_start { | |
387 // The directory was modified too recently, don’t cache its | |
388 // `read_dir` results. | |
389 // | |
390 // A timeline like this is possible: | |
391 // | |
392 // 1. A change to this directory (direct child was | |
393 // added or removed) cause its mtime to be set | |
394 // (possibly truncated) to `directory_mtime` | |
395 // 2. This `status` algorithm calls `read_dir` | |
396 // 3. An other change is made to the same directory is | |
397 // made so that calling `read_dir` agin would give | |
398 // different results, but soon enough after 1. that | |
399 // the mtime stays the same | |
400 // | |
401 // On a system where the time resolution poor, this | |
402 // scenario is not unlikely if all three steps are caused | |
403 // by the same script. | |
404 } else { | |
405 // We’ve observed (through `status_start`) that time has | |
406 // “progressed” since `directory_mtime`, so any further | |
407 // change to this directory is extremely likely to cause a | |
408 // different mtime. | |
409 // | |
410 // Having the same mtime again is not entirely impossible | |
411 // since the system clock is not monotonous. It could jump | |
412 // backward to some point before `directory_mtime`, then a | |
413 // directory change could potentially happen during exactly | |
414 // the wrong tick. | |
415 // | |
416 // We deem this scenario (unlike the previous one) to be | |
417 // unlikely enough in practice. | |
418 let timestamp = directory_mtime.into(); | |
419 let cached = dirstate_node.cached_directory_mtime(); | |
420 if cached != Some(×tamp) { | |
421 let hg_path = dirstate_node | |
422 .full_path_borrowed(self.dmap.on_disk)? | |
423 .detach_from_tree(); | |
424 self.cached_directory_mtimes_to_add | |
425 .lock() | |
426 .unwrap() | |
427 .push((hg_path, timestamp)) | |
428 } | |
429 } | |
272 } | 430 } |
273 } | 431 } |
274 Ok(()) | 432 Ok(()) |
275 } | 433 } |
276 | 434 |
503 }) | 661 }) |
504 } | 662 } |
505 Ok(results) | 663 Ok(results) |
506 } | 664 } |
507 } | 665 } |
666 | |
667 /// Return the `mtime` of a temporary file newly-created in the `.hg` directory | |
668 /// of the give repository. | |
669 /// | |
670 /// This is similar to `SystemTime::now()`, with the result truncated to the | |
671 /// same time resolution as other files’ modification times. Using `.hg` | |
672 /// instead of the system’s default temporary directory (such as `/tmp`) makes | |
673 /// it more likely the temporary file is in the same disk partition as contents | |
674 /// of the working directory, which can matter since different filesystems may | |
675 /// store timestamps with different resolutions. | |
676 /// | |
677 /// This may fail, typically if we lack write permissions. In that case we | |
678 /// should continue the `status()` algoritm anyway and consider the current | |
679 /// date/time to be unknown. | |
680 fn filesystem_now(repo_root: &Path) -> Result<SystemTime, io::Error> { | |
681 tempfile::tempfile_in(repo_root.join(".hg"))? | |
682 .metadata()? | |
683 .modified() | |
684 } |