comparison rust/hg-core/src/dirstate/dirstate_tree/iter.rs @ 45562:b51167d70f5a

rust: add `dirstate_tree` module Mercurial needs to represent the filesystem hierarchy on which it operates, for example in the dirstate. Its current on-disk representation is an unsorted, flat structure that gets transformed in the current Rust code into a `HashMap`. This loses the hierarchical information of the dirstate, leading to some unfortunate performance and algorithmic compromises. This module adds an implementation of a radix tree that is specialized for representing the dirstate: its unit is the path component. I have made no efforts to optimize either its memory footprint or its insertion speed: they're pretty bad for now. Following will be a few patches that modify the dirstate.status logic to use that new hierarchical information, fixing issue 6335 in the same swing. Differential Revision: https://phab.mercurial-scm.org/D9085
author Raphaël Gomès <rgomes@octobus.net>
date Fri, 25 Sep 2020 17:51:34 +0200
parents
children c35db907363d
comparison
equal deleted inserted replaced
45558:80bf7b1ada15 45562:b51167d70f5a
1 // iter.rs
2 //
3 // Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
4 //
5 // This software may be used and distributed according to the terms of the
6 // GNU General Public License version 2 or any later version.
7
8 use super::node::{Node, NodeKind};
9 use super::tree::Tree;
10 use crate::dirstate::dirstate_tree::node::Directory;
11 use crate::dirstate::status::Dispatch;
12 use crate::utils::hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf};
13 use crate::DirstateEntry;
14 use std::borrow::Cow;
15 use std::collections::VecDeque;
16 use std::iter::{FromIterator, FusedIterator};
17 use std::path::PathBuf;
18
19 impl FromIterator<(HgPathBuf, DirstateEntry)> for Tree {
20 fn from_iter<T: IntoIterator<Item = (HgPathBuf, DirstateEntry)>>(iter: T) -> Self {
21 let mut tree = Self::new();
22 for (path, entry) in iter {
23 tree.insert(path, entry);
24 }
25 tree
26 }
27 }
28
29 /// Iterator of all entries in the dirstate tree.
30 ///
31 /// It has no particular ordering.
32 pub struct Iter<'a> {
33 to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
34 }
35
36 impl<'a> Iter<'a> {
37 pub fn new(node: &'a Node) -> Iter<'a> {
38 let mut to_visit = VecDeque::new();
39 to_visit.push_back((Cow::Borrowed(&b""[..]), node));
40 Self { to_visit }
41 }
42 }
43
44 impl<'a> Iterator for Iter<'a> {
45 type Item = (HgPathBuf, DirstateEntry);
46
47 fn next(&mut self) -> Option<Self::Item> {
48 while let Some((base_path, node)) = self.to_visit.pop_front() {
49 match &node.kind {
50 NodeKind::Directory(dir) => {
51 add_children_to_visit(&mut self.to_visit, &base_path, &dir);
52 if let Some(file) = &dir.was_file {
53 return Some((HgPathBuf::from_bytes(&base_path), file.entry));
54 }
55 }
56 NodeKind::File(file) => {
57 if let Some(dir) = &file.was_directory {
58 add_children_to_visit(&mut self.to_visit, &base_path, &dir);
59 }
60 return Some((HgPathBuf::from_bytes(&base_path), file.entry));
61 }
62 }
63 }
64 None
65 }
66 }
67
68 impl<'a> FusedIterator for Iter<'a> {}
69
70 /// Iterator of all entries in the dirstate tree, with a special filesystem
71 /// handling for the directories containing said entries.
72 ///
73 /// It checks every directory on-disk to see if it has become a symlink, to
74 /// prevent a potential security issue.
75 /// Using this information, it may dispatch `status` information early: it
76 /// returns canonical paths along with `Shortcut`s, which are either a
77 /// `DirstateEntry` or a `Dispatch`, if the fate of said path has already been
78 /// determined.
79 ///
80 /// Like `Iter`, it has no particular ordering.
81 pub struct FsIter<'a> {
82 root_dir: PathBuf,
83 to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
84 shortcuts: VecDeque<(HgPathBuf, StatusShortcut)>,
85 }
86
87 impl<'a> FsIter<'a> {
88 pub fn new(node: &'a Node, root_dir: PathBuf) -> FsIter<'a> {
89 let mut to_visit = VecDeque::new();
90 to_visit.push_back((Cow::Borrowed(&b""[..]), node));
91 Self {
92 root_dir,
93 to_visit,
94 shortcuts: Default::default(),
95 }
96 }
97
98 /// Mercurial tracks symlinks but *not* what they point to.
99 /// If a directory is moved and symlinked:
100 ///
101 /// ```bash
102 /// $ mkdir foo
103 /// $ touch foo/a
104 /// $ # commit...
105 /// $ mv foo bar
106 /// $ ln -s bar foo
107 /// ```
108 /// We need to dispatch the new symlink as `Unknown` and all the
109 /// descendents of the directory it replace as `Deleted`.
110 fn dispatch_symlinked_directory(&mut self, path: impl AsRef<HgPath>, node: &Node) {
111 let path = path.as_ref();
112 self.shortcuts
113 .push_back((path.to_owned(), StatusShortcut::Dispatch(Dispatch::Unknown)));
114 for (file, _) in node.iter() {
115 self.shortcuts.push_back((
116 path.join(&file),
117 StatusShortcut::Dispatch(Dispatch::Deleted),
118 ));
119 }
120 }
121
122 /// Returns `true` if the canonical `path` of a directory corresponds to a
123 /// symlink on disk. It means it was moved and symlinked after the last
124 /// dirstate update.
125 ///
126 /// # Special cases
127 ///
128 /// Returns `false` for the repository root.
129 /// Returns `false` on io error, error handling is outside of the iterator.
130 fn directory_became_symlink(&mut self, path: &HgPath) -> bool {
131 if path.is_empty() {
132 return false;
133 }
134 let filename_as_path = match hg_path_to_path_buf(&path) {
135 Ok(p) => p,
136 _ => return false,
137 };
138 let meta = self.root_dir.join(filename_as_path).symlink_metadata();
139 match meta {
140 Ok(ref m) if m.file_type().is_symlink() => true,
141 _ => return false,
142 }
143 }
144 }
145
146 /// Returned by `FsIter`, since the `Dispatch` of any given entry may already
147 /// be determined during the iteration. This is necessary for performance
148 /// reasons, since hierarchical information is needed to `Dispatch` an entire
149 /// subtree efficiently.
150 #[derive(Debug, Copy, Clone)]
151 pub enum StatusShortcut {
152 /// A entry in the dirstate for further inspection
153 Entry(DirstateEntry),
154 /// The result of the status of the corresponding file
155 Dispatch(Dispatch),
156 }
157
158 impl<'a> Iterator for FsIter<'a> {
159 type Item = (HgPathBuf, StatusShortcut);
160
161 fn next(&mut self) -> Option<Self::Item> {
162 // If any paths have already been `Dispatch`-ed, return them
163 while let Some(res) = self.shortcuts.pop_front() {
164 return Some(res);
165 }
166
167 while let Some((base_path, node)) = self.to_visit.pop_front() {
168 match &node.kind {
169 NodeKind::Directory(dir) => {
170 let canonical_path = HgPath::new(&base_path);
171 if self.directory_became_symlink(canonical_path) {
172 // Potential security issue, don't do a normal
173 // traversal, force the results.
174 self.dispatch_symlinked_directory(canonical_path, &node);
175 continue;
176 }
177 add_children_to_visit(&mut self.to_visit, &base_path, &dir);
178 if let Some(file) = &dir.was_file {
179 return Some((
180 HgPathBuf::from_bytes(&base_path),
181 StatusShortcut::Entry(file.entry),
182 ));
183 }
184 }
185 NodeKind::File(file) => {
186 if let Some(dir) = &file.was_directory {
187 add_children_to_visit(&mut self.to_visit, &base_path, &dir);
188 }
189 return Some((
190 HgPathBuf::from_bytes(&base_path),
191 StatusShortcut::Entry(file.entry),
192 ));
193 }
194 }
195 }
196
197 None
198 }
199 }
200
201 impl<'a> FusedIterator for FsIter<'a> {}
202
203 fn join_path<'a, 'b>(path: &'a [u8], other: &'b [u8]) -> Cow<'b, [u8]> {
204 if path.is_empty() {
205 other.into()
206 } else {
207 [path, &b"/"[..], other].concat().into()
208 }
209 }
210
211 /// Adds all children of a given directory `dir` to the visit queue `to_visit`
212 /// prefixed by a `base_path`.
213 fn add_children_to_visit<'a>(
214 to_visit: &mut VecDeque<(Cow<'a, [u8]>, &'a Node)>,
215 base_path: &[u8],
216 dir: &'a Directory,
217 ) {
218 to_visit.extend(dir.children.iter().map(|(path, child)| {
219 let full_path = join_path(&base_path, &path);
220 (Cow::from(full_path), child)
221 }));
222 }
223
224 #[cfg(test)]
225 mod tests {
226 use super::*;
227 use crate::utils::hg_path::HgPath;
228 use crate::{EntryState, FastHashMap};
229 use std::collections::HashSet;
230
231 #[test]
232 fn test_iteration() {
233 let mut tree = Tree::new();
234
235 assert_eq!(
236 tree.insert(
237 HgPathBuf::from_bytes(b"foo/bar"),
238 DirstateEntry {
239 state: EntryState::Merged,
240 mode: 41,
241 mtime: 42,
242 size: 43,
243 }
244 ),
245 None
246 );
247
248 assert_eq!(
249 tree.insert(
250 HgPathBuf::from_bytes(b"foo2"),
251 DirstateEntry {
252 state: EntryState::Merged,
253 mode: 40,
254 mtime: 41,
255 size: 42,
256 }
257 ),
258 None
259 );
260
261 assert_eq!(
262 tree.insert(
263 HgPathBuf::from_bytes(b"foo/baz"),
264 DirstateEntry {
265 state: EntryState::Normal,
266 mode: 0,
267 mtime: 0,
268 size: 0,
269 }
270 ),
271 None
272 );
273
274 assert_eq!(
275 tree.insert(
276 HgPathBuf::from_bytes(b"foo/bap/nested"),
277 DirstateEntry {
278 state: EntryState::Normal,
279 mode: 0,
280 mtime: 0,
281 size: 0,
282 }
283 ),
284 None
285 );
286
287 assert_eq!(tree.len(), 4);
288
289 let results: HashSet<_> = tree.iter().map(|(c, _)| c.to_owned()).collect();
290 dbg!(&results);
291 assert!(results.contains(HgPath::new(b"foo2")));
292 assert!(results.contains(HgPath::new(b"foo/bar")));
293 assert!(results.contains(HgPath::new(b"foo/baz")));
294 assert!(results.contains(HgPath::new(b"foo/bap/nested")));
295
296 let mut iter = tree.iter();
297 assert!(iter.next().is_some());
298 assert!(iter.next().is_some());
299 assert!(iter.next().is_some());
300 assert!(iter.next().is_some());
301 assert_eq!(None, iter.next());
302 assert_eq!(None, iter.next());
303 drop(iter);
304
305 assert_eq!(
306 tree.insert(
307 HgPathBuf::from_bytes(b"foo/bap/nested/a"),
308 DirstateEntry {
309 state: EntryState::Normal,
310 mode: 0,
311 mtime: 0,
312 size: 0,
313 }
314 ),
315 None
316 );
317
318 let results: FastHashMap<_, _> = tree.iter().collect();
319 assert!(results.contains_key(HgPath::new(b"foo2")));
320 assert!(results.contains_key(HgPath::new(b"foo/bar")));
321 assert!(results.contains_key(HgPath::new(b"foo/baz")));
322 // Is a dir but `was_file`, so it's listed as a removed file
323 assert!(results.contains_key(HgPath::new(b"foo/bap/nested")));
324 assert!(results.contains_key(HgPath::new(b"foo/bap/nested/a")));
325
326 // insert removed file (now directory) after nested file
327 assert_eq!(
328 tree.insert(
329 HgPathBuf::from_bytes(b"a/a"),
330 DirstateEntry {
331 state: EntryState::Normal,
332 mode: 0,
333 mtime: 0,
334 size: 0,
335 }
336 ),
337 None
338 );
339
340 // `insert` returns `None` for a directory
341 assert_eq!(
342 tree.insert(
343 HgPathBuf::from_bytes(b"a"),
344 DirstateEntry {
345 state: EntryState::Removed,
346 mode: 0,
347 mtime: 0,
348 size: 0,
349 }
350 ),
351 None
352 );
353
354 let results: FastHashMap<_, _> = tree.iter().collect();
355 assert!(results.contains_key(HgPath::new(b"a")));
356 assert!(results.contains_key(HgPath::new(b"a/a")));
357 }
358 }