Mercurial > hg
changeset 45562:b51167d70f5a
rust: add `dirstate_tree` module
Mercurial needs to represent the filesystem hierarchy on which it operates, for
example in the dirstate. Its current on-disk representation is an unsorted, flat
structure that gets transformed in the current Rust code into a `HashMap`.
This loses the hierarchical information of the dirstate, leading to some
unfortunate performance and algorithmic compromises.
This module adds an implementation of a radix tree that is specialized for
representing the dirstate: its unit is the path component. I have made no
efforts to optimize either its memory footprint or its insertion speed: they're
pretty bad for now.
Following will be a few patches that modify the dirstate.status logic to use
that new hierarchical information, fixing issue 6335 in the same swing.
Differential Revision: https://phab.mercurial-scm.org/D9085
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Fri, 25 Sep 2020 17:51:34 +0200 |
parents | 80bf7b1ada15 |
children | 142f0dcf90d0 |
files | rust/hg-core/src/dirstate.rs rust/hg-core/src/dirstate/dirstate_tree.rs rust/hg-core/src/dirstate/dirstate_tree/iter.rs rust/hg-core/src/dirstate/dirstate_tree/node.rs rust/hg-core/src/dirstate/dirstate_tree/tree.rs |
diffstat | 5 files changed, 1411 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/dirstate.rs Fri Jul 24 16:35:02 2020 +0200 +++ b/rust/hg-core/src/dirstate.rs Fri Sep 25 17:51:34 2020 +0200 @@ -11,6 +11,7 @@ pub mod dirs_multiset; pub mod dirstate_map; +pub mod dirstate_tree; pub mod parsers; pub mod status;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/dirstate/dirstate_tree.rs Fri Sep 25 17:51:34 2020 +0200 @@ -0,0 +1,14 @@ +// dirstate_tree.rs +// +// Copyright 2020, Raphaël Gomès <rgomes@octobus.net> +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +//! Special-case radix tree that matches a filesystem hierarchy for use in the +//! dirstate. +//! It has not been optimized at all yet. + +pub mod iter; +pub mod node; +pub mod tree;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/dirstate/dirstate_tree/iter.rs Fri Sep 25 17:51:34 2020 +0200 @@ -0,0 +1,358 @@ +// iter.rs +// +// Copyright 2020, Raphaël Gomès <rgomes@octobus.net> +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +use super::node::{Node, NodeKind}; +use super::tree::Tree; +use crate::dirstate::dirstate_tree::node::Directory; +use crate::dirstate::status::Dispatch; +use crate::utils::hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf}; +use crate::DirstateEntry; +use std::borrow::Cow; +use std::collections::VecDeque; +use std::iter::{FromIterator, FusedIterator}; +use std::path::PathBuf; + +impl FromIterator<(HgPathBuf, DirstateEntry)> for Tree { + fn from_iter<T: IntoIterator<Item = (HgPathBuf, DirstateEntry)>>(iter: T) -> Self { + let mut tree = Self::new(); + for (path, entry) in iter { + tree.insert(path, entry); + } + tree + } +} + +/// Iterator of all entries in the dirstate tree. +/// +/// It has no particular ordering. +pub struct Iter<'a> { + to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>, +} + +impl<'a> Iter<'a> { + pub fn new(node: &'a Node) -> Iter<'a> { + let mut to_visit = VecDeque::new(); + to_visit.push_back((Cow::Borrowed(&b""[..]), node)); + Self { to_visit } + } +} + +impl<'a> Iterator for Iter<'a> { + type Item = (HgPathBuf, DirstateEntry); + + fn next(&mut self) -> Option<Self::Item> { + while let Some((base_path, node)) = self.to_visit.pop_front() { + match &node.kind { + NodeKind::Directory(dir) => { + add_children_to_visit(&mut self.to_visit, &base_path, &dir); + if let Some(file) = &dir.was_file { + return Some((HgPathBuf::from_bytes(&base_path), file.entry)); + } + } + NodeKind::File(file) => { + if let Some(dir) = &file.was_directory { + add_children_to_visit(&mut self.to_visit, &base_path, &dir); + } + return Some((HgPathBuf::from_bytes(&base_path), file.entry)); + } + } + } + None + } +} + +impl<'a> FusedIterator for Iter<'a> {} + +/// Iterator of all entries in the dirstate tree, with a special filesystem +/// handling for the directories containing said entries. +/// +/// It checks every directory on-disk to see if it has become a symlink, to +/// prevent a potential security issue. +/// Using this information, it may dispatch `status` information early: it +/// returns canonical paths along with `Shortcut`s, which are either a +/// `DirstateEntry` or a `Dispatch`, if the fate of said path has already been +/// determined. +/// +/// Like `Iter`, it has no particular ordering. +pub struct FsIter<'a> { + root_dir: PathBuf, + to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>, + shortcuts: VecDeque<(HgPathBuf, StatusShortcut)>, +} + +impl<'a> FsIter<'a> { + pub fn new(node: &'a Node, root_dir: PathBuf) -> FsIter<'a> { + let mut to_visit = VecDeque::new(); + to_visit.push_back((Cow::Borrowed(&b""[..]), node)); + Self { + root_dir, + to_visit, + shortcuts: Default::default(), + } + } + + /// Mercurial tracks symlinks but *not* what they point to. + /// If a directory is moved and symlinked: + /// + /// ```bash + /// $ mkdir foo + /// $ touch foo/a + /// $ # commit... + /// $ mv foo bar + /// $ ln -s bar foo + /// ``` + /// We need to dispatch the new symlink as `Unknown` and all the + /// descendents of the directory it replace as `Deleted`. + fn dispatch_symlinked_directory(&mut self, path: impl AsRef<HgPath>, node: &Node) { + let path = path.as_ref(); + self.shortcuts + .push_back((path.to_owned(), StatusShortcut::Dispatch(Dispatch::Unknown))); + for (file, _) in node.iter() { + self.shortcuts.push_back(( + path.join(&file), + StatusShortcut::Dispatch(Dispatch::Deleted), + )); + } + } + + /// Returns `true` if the canonical `path` of a directory corresponds to a + /// symlink on disk. It means it was moved and symlinked after the last + /// dirstate update. + /// + /// # Special cases + /// + /// Returns `false` for the repository root. + /// Returns `false` on io error, error handling is outside of the iterator. + fn directory_became_symlink(&mut self, path: &HgPath) -> bool { + if path.is_empty() { + return false; + } + let filename_as_path = match hg_path_to_path_buf(&path) { + Ok(p) => p, + _ => return false, + }; + let meta = self.root_dir.join(filename_as_path).symlink_metadata(); + match meta { + Ok(ref m) if m.file_type().is_symlink() => true, + _ => return false, + } + } +} + +/// Returned by `FsIter`, since the `Dispatch` of any given entry may already +/// be determined during the iteration. This is necessary for performance +/// reasons, since hierarchical information is needed to `Dispatch` an entire +/// subtree efficiently. +#[derive(Debug, Copy, Clone)] +pub enum StatusShortcut { + /// A entry in the dirstate for further inspection + Entry(DirstateEntry), + /// The result of the status of the corresponding file + Dispatch(Dispatch), +} + +impl<'a> Iterator for FsIter<'a> { + type Item = (HgPathBuf, StatusShortcut); + + fn next(&mut self) -> Option<Self::Item> { + // If any paths have already been `Dispatch`-ed, return them + while let Some(res) = self.shortcuts.pop_front() { + return Some(res); + } + + while let Some((base_path, node)) = self.to_visit.pop_front() { + match &node.kind { + NodeKind::Directory(dir) => { + let canonical_path = HgPath::new(&base_path); + if self.directory_became_symlink(canonical_path) { + // Potential security issue, don't do a normal + // traversal, force the results. + self.dispatch_symlinked_directory(canonical_path, &node); + continue; + } + add_children_to_visit(&mut self.to_visit, &base_path, &dir); + if let Some(file) = &dir.was_file { + return Some(( + HgPathBuf::from_bytes(&base_path), + StatusShortcut::Entry(file.entry), + )); + } + } + NodeKind::File(file) => { + if let Some(dir) = &file.was_directory { + add_children_to_visit(&mut self.to_visit, &base_path, &dir); + } + return Some(( + HgPathBuf::from_bytes(&base_path), + StatusShortcut::Entry(file.entry), + )); + } + } + } + + None + } +} + +impl<'a> FusedIterator for FsIter<'a> {} + +fn join_path<'a, 'b>(path: &'a [u8], other: &'b [u8]) -> Cow<'b, [u8]> { + if path.is_empty() { + other.into() + } else { + [path, &b"/"[..], other].concat().into() + } +} + +/// Adds all children of a given directory `dir` to the visit queue `to_visit` +/// prefixed by a `base_path`. +fn add_children_to_visit<'a>( + to_visit: &mut VecDeque<(Cow<'a, [u8]>, &'a Node)>, + base_path: &[u8], + dir: &'a Directory, +) { + to_visit.extend(dir.children.iter().map(|(path, child)| { + let full_path = join_path(&base_path, &path); + (Cow::from(full_path), child) + })); +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::utils::hg_path::HgPath; + use crate::{EntryState, FastHashMap}; + use std::collections::HashSet; + + #[test] + fn test_iteration() { + let mut tree = Tree::new(); + + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"foo/bar"), + DirstateEntry { + state: EntryState::Merged, + mode: 41, + mtime: 42, + size: 43, + } + ), + None + ); + + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"foo2"), + DirstateEntry { + state: EntryState::Merged, + mode: 40, + mtime: 41, + size: 42, + } + ), + None + ); + + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"foo/baz"), + DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0, + } + ), + None + ); + + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"foo/bap/nested"), + DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0, + } + ), + None + ); + + assert_eq!(tree.len(), 4); + + let results: HashSet<_> = tree.iter().map(|(c, _)| c.to_owned()).collect(); + dbg!(&results); + assert!(results.contains(HgPath::new(b"foo2"))); + assert!(results.contains(HgPath::new(b"foo/bar"))); + assert!(results.contains(HgPath::new(b"foo/baz"))); + assert!(results.contains(HgPath::new(b"foo/bap/nested"))); + + let mut iter = tree.iter(); + assert!(iter.next().is_some()); + assert!(iter.next().is_some()); + assert!(iter.next().is_some()); + assert!(iter.next().is_some()); + assert_eq!(None, iter.next()); + assert_eq!(None, iter.next()); + drop(iter); + + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"foo/bap/nested/a"), + DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0, + } + ), + None + ); + + let results: FastHashMap<_, _> = tree.iter().collect(); + assert!(results.contains_key(HgPath::new(b"foo2"))); + assert!(results.contains_key(HgPath::new(b"foo/bar"))); + assert!(results.contains_key(HgPath::new(b"foo/baz"))); + // Is a dir but `was_file`, so it's listed as a removed file + assert!(results.contains_key(HgPath::new(b"foo/bap/nested"))); + assert!(results.contains_key(HgPath::new(b"foo/bap/nested/a"))); + + // insert removed file (now directory) after nested file + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"a/a"), + DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0, + } + ), + None + ); + + // `insert` returns `None` for a directory + assert_eq!( + tree.insert( + HgPathBuf::from_bytes(b"a"), + DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 0, + size: 0, + } + ), + None + ); + + let results: FastHashMap<_, _> = tree.iter().collect(); + assert!(results.contains_key(HgPath::new(b"a"))); + assert!(results.contains_key(HgPath::new(b"a/a"))); + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/dirstate/dirstate_tree/node.rs Fri Sep 25 17:51:34 2020 +0200 @@ -0,0 +1,377 @@ +// node.rs +// +// Copyright 2020, Raphaël Gomès <rgomes@octobus.net> +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +use super::iter::Iter; +use crate::utils::hg_path::HgPathBuf; +use crate::{DirstateEntry, EntryState, FastHashMap}; + +/// Represents a filesystem directory in the dirstate tree +#[derive(Debug, Default, Clone, PartialEq)] +pub struct Directory { + /// Contains the old file information if it existed between changesets. + /// Happens if a file `foo` is marked as removed, removed from the + /// filesystem then a directory `foo` is created and at least one of its + /// descendents is added to Mercurial. + pub(super) was_file: Option<Box<File>>, + pub(super) children: FastHashMap<Vec<u8>, Node>, +} + +/// Represents a filesystem file (or symlink) in the dirstate tree +#[derive(Debug, Clone, PartialEq)] +pub struct File { + /// Contains the old structure if it existed between changesets. + /// Happens all descendents of `foo` marked as removed and removed from + /// the filesystem, then a file `foo` is created and added to Mercurial. + pub(super) was_directory: Option<Box<Directory>>, + pub(super) entry: DirstateEntry, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum NodeKind { + Directory(Directory), + File(File), +} + +#[derive(Debug, Default, Clone, PartialEq)] +pub struct Node { + pub kind: NodeKind, +} + +impl Default for NodeKind { + fn default() -> Self { + NodeKind::Directory(Default::default()) + } +} + +impl Node { + pub fn insert(&mut self, path: &[u8], new_entry: DirstateEntry) -> InsertResult { + let mut split = path.splitn(2, |&c| c == b'/'); + let head = split.next().unwrap_or(b""); + let tail = split.next().unwrap_or(b""); + + if let NodeKind::File(file) = &mut self.kind { + if tail.is_empty() && head.is_empty() { + // We're modifying the current file + let new = Self { + kind: NodeKind::File(File { + entry: new_entry, + ..file.clone() + }), + }; + return InsertResult { + did_insert: false, + old_entry: Some(std::mem::replace(self, new)), + }; + } else { + match file.entry.state { + // Only replace the current file with a directory if it's + // marked as `Removed` + EntryState::Removed => { + self.kind = NodeKind::Directory(Directory { + was_file: Some(Box::from(file.clone())), + children: Default::default(), + }) + } + _ => return Node::insert_in_file(file, new_entry, head, tail), + } + } + } + + match &mut self.kind { + NodeKind::Directory(directory) => { + return Node::insert_in_directory(directory, new_entry, head, tail); + } + NodeKind::File(_) => unreachable!("The file case has already been handled"), + } + } + + /// The current file still exists and is not marked as `Removed`. + /// Insert the entry in its `was_directory`. + fn insert_in_file( + file: &mut File, + new_entry: DirstateEntry, + head: &[u8], + tail: &[u8], + ) -> InsertResult { + if let Some(d) = &mut file.was_directory { + Node::insert_in_directory(d, new_entry, head, tail) + } else { + let mut dir = Directory { + was_file: None, + children: FastHashMap::default(), + }; + let res = Node::insert_in_directory(&mut dir, new_entry, head, tail); + file.was_directory = Some(Box::new(dir)); + res + } + } + + /// Insert an entry in the subtree of `directory` + fn insert_in_directory( + directory: &mut Directory, + new_entry: DirstateEntry, + head: &[u8], + tail: &[u8], + ) -> InsertResult { + let mut res = InsertResult::default(); + + if let Some(node) = directory.children.get_mut(head) { + // Node exists + match &mut node.kind { + NodeKind::Directory(subdir) => { + if tail.is_empty() { + let becomes_file = Self { + kind: NodeKind::File(File { + was_directory: Some(Box::from(subdir.clone())), + entry: new_entry, + }), + }; + let old_entry = directory.children.insert(head.to_owned(), becomes_file); + return InsertResult { + did_insert: true, + old_entry, + }; + } else { + res = node.insert(tail, new_entry); + } + } + NodeKind::File(_) => { + res = node.insert(tail, new_entry); + } + } + } else if tail.is_empty() { + // File does not already exist + directory.children.insert( + head.to_owned(), + Self { + kind: NodeKind::File(File { + was_directory: None, + entry: new_entry, + }), + }, + ); + res.did_insert = true; + } else { + // Directory does not already exist + let mut nested = Self { + kind: NodeKind::Directory(Directory { + was_file: None, + children: Default::default(), + }), + }; + res = nested.insert(tail, new_entry); + directory.children.insert(head.to_owned(), nested); + } + res + } + + /// Removes an entry from the tree, returns a `RemoveResult`. + pub fn remove(&mut self, path: &[u8]) -> RemoveResult { + let empty_result = RemoveResult::default(); + if path.is_empty() { + return empty_result; + } + let mut split = path.splitn(2, |&c| c == b'/'); + let head = split.next(); + let tail = split.next().unwrap_or(b""); + + let head = match head { + None => { + return empty_result; + } + Some(h) => h, + }; + if head == path { + match &mut self.kind { + NodeKind::Directory(d) => { + return Node::remove_from_directory(head, d); + } + NodeKind::File(f) => { + if let Some(d) = &mut f.was_directory { + let RemoveResult { old_entry, .. } = Node::remove_from_directory(head, d); + return RemoveResult { + cleanup: false, + old_entry, + }; + } + } + } + empty_result + } else { + // Look into the dirs + match &mut self.kind { + NodeKind::Directory(d) => { + if let Some(child) = d.children.get_mut(head) { + let mut res = child.remove(tail); + if res.cleanup { + d.children.remove(head); + } + res.cleanup = d.children.len() == 0 && d.was_file.is_none(); + res + } else { + empty_result + } + } + NodeKind::File(f) => { + if let Some(d) = &mut f.was_directory { + if let Some(child) = d.children.get_mut(head) { + let RemoveResult { cleanup, old_entry } = child.remove(tail); + if cleanup { + d.children.remove(head); + } + if d.children.len() == 0 && d.was_file.is_none() { + f.was_directory = None; + } + + return RemoveResult { + cleanup: false, + old_entry, + }; + } + } + empty_result + } + } + } + } + + fn remove_from_directory(head: &[u8], d: &mut Directory) -> RemoveResult { + if let Some(node) = d.children.get_mut(head) { + return match &mut node.kind { + NodeKind::Directory(d) => { + if let Some(f) = &mut d.was_file { + let entry = f.entry; + d.was_file = None; + RemoveResult { + cleanup: false, + old_entry: Some(entry), + } + } else { + RemoveResult::default() + } + } + NodeKind::File(f) => { + let entry = f.entry; + let mut cleanup = false; + match &f.was_directory { + None => { + if d.children.len() == 1 { + cleanup = true; + } + d.children.remove(head); + } + Some(dir) => { + node.kind = NodeKind::Directory(*dir.clone()); + } + } + + RemoveResult { + cleanup: cleanup, + old_entry: Some(entry), + } + } + }; + } + RemoveResult::default() + } + + pub fn get(&self, path: &[u8]) -> Option<&Node> { + if path.is_empty() { + return Some(&self); + } + let mut split = path.splitn(2, |&c| c == b'/'); + let head = split.next(); + let tail = split.next().unwrap_or(b""); + + let head = match head { + None => { + return Some(&self); + } + Some(h) => h, + }; + match &self.kind { + NodeKind::Directory(d) => { + if let Some(child) = d.children.get(head) { + return child.get(tail); + } + } + NodeKind::File(f) => { + if let Some(d) = &f.was_directory { + if let Some(child) = d.children.get(head) { + return child.get(tail); + } + } + } + } + + None + } + + pub fn get_mut(&mut self, path: &[u8]) -> Option<&mut NodeKind> { + if path.is_empty() { + return Some(&mut self.kind); + } + let mut split = path.splitn(2, |&c| c == b'/'); + let head = split.next(); + let tail = split.next().unwrap_or(b""); + + let head = match head { + None => { + return Some(&mut self.kind); + } + Some(h) => h, + }; + match &mut self.kind { + NodeKind::Directory(d) => { + if let Some(child) = d.children.get_mut(head) { + return child.get_mut(tail); + } + } + NodeKind::File(f) => { + if let Some(d) = &mut f.was_directory { + if let Some(child) = d.children.get_mut(head) { + return child.get_mut(tail); + } + } + } + } + + None + } + + pub fn iter(&self) -> Iter { + Iter::new(self) + } +} + +/// Information returned to the caller of an `insert` operation for integrity. +#[derive(Debug, Default)] +pub struct InsertResult { + /// Whether the insertion resulted in an actual insertion and not an + /// update + pub(super) did_insert: bool, + /// The entry that was replaced, if it exists + pub(super) old_entry: Option<Node>, +} + +/// Information returned to the caller of a `remove` operation integrity. +#[derive(Debug, Default)] +pub struct RemoveResult { + /// If the caller needs to remove the current node + pub(super) cleanup: bool, + /// The entry that was replaced, if it exists + pub(super) old_entry: Option<DirstateEntry>, +} + +impl<'a> IntoIterator for &'a Node { + type Item = (HgPathBuf, DirstateEntry); + type IntoIter = Iter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/dirstate/dirstate_tree/tree.rs Fri Sep 25 17:51:34 2020 +0200 @@ -0,0 +1,661 @@ +// tree.rs +// +// Copyright 2020, Raphaël Gomès <rgomes@octobus.net> +// +// This software may be used and distributed according to the terms of the +// GNU General Public License version 2 or any later version. + +use super::iter::Iter; +use super::node::{Directory, Node, NodeKind}; +use crate::dirstate::dirstate_tree::iter::FsIter; +use crate::dirstate::dirstate_tree::node::{InsertResult, RemoveResult}; +use crate::utils::hg_path::{HgPath, HgPathBuf}; +use crate::DirstateEntry; +use std::path::PathBuf; + +/// A specialized tree to represent the Mercurial dirstate. +/// +/// # Advantages over a flat structure +/// +/// The dirstate is inherently hierarchical, since it's a representation of the +/// file structure of the project. The current dirstate format is flat, and +/// while that affords us potentially great (unordered) iteration speeds, the +/// need to retrieve a given path is great enough that you need some kind of +/// hashmap or tree in a lot of cases anyway. +/// +/// Going with a tree allows us to be smarter: +/// - Skipping an ignored directory means we don't visit its entire subtree +/// - Security auditing does not need to reconstruct paths backwards to check +/// for symlinked directories, this can be done during the iteration in a +/// very efficient fashion +/// - We don't need to build the directory information in another struct, +/// simplifying the code a lot, reducing the memory footprint and +/// potentially going faster depending on the implementation. +/// - We can use it to store a (platform-dependent) caching mechanism [1] +/// - And probably other types of optimizations. +/// +/// Only the first two items in this list are implemented as of this commit. +/// +/// [1]: https://www.mercurial-scm.org/wiki/DirsCachePlan +/// +/// +/// # Structure +/// +/// It's a prefix (radix) tree with no fixed arity, with a granularity of a +/// folder, allowing it to mimic a filesystem hierarchy: +/// +/// ```text +/// foo/bar +/// foo/baz +/// test +/// ``` +/// Will be represented (simplified) by: +/// +/// ```text +/// Directory(root): +/// - File("test") +/// - Directory("foo"): +/// - File("bar") +/// - File("baz") +/// ``` +/// +/// Moreover, it is special-cased for storing the dirstate and as such handles +/// cases that a simple `HashMap` would handle, but while preserving the +/// hierarchy. +/// For example: +/// +/// ```shell +/// $ touch foo +/// $ hg add foo +/// $ hg commit -m "foo" +/// $ hg remove foo +/// $ rm foo +/// $ mkdir foo +/// $ touch foo/a +/// $ hg add foo/a +/// $ hg status +/// R foo +/// A foo/a +/// ``` +/// To represent this in a tree, one needs to keep track of whether any given +/// file was a directory and whether any given directory was a file at the last +/// dirstate update. This tree stores that information, but only in the right +/// circumstances by respecting the high-level rules that prevent nonsensical +/// structures to exist: +/// - a file can only be added as a child of another file if the latter is +/// marked as `Removed` +/// - a file cannot replace a folder unless all its descendents are removed +/// +/// This second rule is not checked by the tree for performance reasons, and +/// because high-level logic already prevents that state from happening. +/// +/// # Ordering +/// +/// It makes no guarantee of ordering for now. +#[derive(Debug, Default, Clone, PartialEq)] +pub struct Tree { + pub root: Node, + files_count: usize, +} + +impl Tree { + pub fn new() -> Self { + Self { + root: Node { + kind: NodeKind::Directory(Directory { + was_file: None, + children: Default::default(), + }), + }, + files_count: 0, + } + } + + /// How many files (not directories) are stored in the tree, including ones + /// marked as `Removed`. + pub fn len(&self) -> usize { + self.files_count + } + + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Inserts a file in the tree and returns the previous entry if any. + pub fn insert( + &mut self, + path: impl AsRef<HgPath>, + kind: DirstateEntry, + ) -> Option<DirstateEntry> { + let old = self.insert_node(path, kind); + match old?.kind { + NodeKind::Directory(_) => None, + NodeKind::File(f) => Some(f.entry), + } + } + + /// Low-level insertion method that returns the previous node (directories + /// included). + fn insert_node(&mut self, path: impl AsRef<HgPath>, kind: DirstateEntry) -> Option<Node> { + let InsertResult { + did_insert, + old_entry, + } = self.root.insert(path.as_ref().as_bytes(), kind); + self.files_count += if did_insert { 1 } else { 0 }; + old_entry + } + + /// Returns a reference to a node if it exists. + pub fn get_node(&self, path: impl AsRef<HgPath>) -> Option<&Node> { + self.root.get(path.as_ref().as_bytes()) + } + + /// Returns a reference to the entry corresponding to `path` if it exists. + pub fn get(&self, path: impl AsRef<HgPath>) -> Option<&DirstateEntry> { + if let Some(node) = self.get_node(&path) { + return match &node.kind { + NodeKind::Directory(d) => d.was_file.as_ref().map(|f| &f.entry), + NodeKind::File(f) => Some(&f.entry), + }; + } + None + } + + /// Returns `true` if an entry is found for the given `path`. + pub fn contains_key(&self, path: impl AsRef<HgPath>) -> bool { + self.get(path).is_some() + } + + /// Returns a mutable reference to the entry corresponding to `path` if it + /// exists. + pub fn get_mut(&mut self, path: impl AsRef<HgPath>) -> Option<&mut DirstateEntry> { + if let Some(kind) = self.root.get_mut(path.as_ref().as_bytes()) { + return match kind { + NodeKind::Directory(d) => d.was_file.as_mut().map(|f| &mut f.entry), + NodeKind::File(f) => Some(&mut f.entry), + }; + } + None + } + + /// Returns an iterator over the paths and corresponding entries in the + /// tree. + pub fn iter(&self) -> Iter { + Iter::new(&self.root) + } + + /// Returns an iterator of all entries in the tree, with a special + /// filesystem handling for the directories containing said entries. See + /// the documentation of `FsIter` for more. + pub fn fs_iter(&self, root_dir: PathBuf) -> FsIter { + FsIter::new(&self.root, root_dir) + } + + /// Remove the entry at `path` and returns it, if it exists. + pub fn remove(&mut self, path: impl AsRef<HgPath>) -> Option<DirstateEntry> { + let RemoveResult { old_entry, .. } = self.root.remove(path.as_ref().as_bytes()); + self.files_count = self + .files_count + .checked_sub(if old_entry.is_some() { 1 } else { 0 }) + .expect("removed too many files"); + old_entry + } +} + +impl<P: AsRef<HgPath>> Extend<(P, DirstateEntry)> for Tree { + fn extend<T: IntoIterator<Item = (P, DirstateEntry)>>(&mut self, iter: T) { + for (path, entry) in iter { + self.insert(path, entry); + } + } +} + +impl<'a> IntoIterator for &'a Tree { + type Item = (HgPathBuf, DirstateEntry); + type IntoIter = Iter<'a>; + + fn into_iter(self) -> Self::IntoIter { + self.iter() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dirstate::dirstate_tree::node::File; + use crate::{EntryState, FastHashMap}; + use pretty_assertions::assert_eq; + + impl Node { + /// Shortcut for getting children of a node in tests. + fn children(&self) -> Option<&FastHashMap<Vec<u8>, Node>> { + match &self.kind { + NodeKind::Directory(d) => Some(&d.children), + NodeKind::File(_) => None, + } + } + } + + #[test] + fn test_dirstate_tree() { + let mut tree = Tree::new(); + + assert_eq!( + tree.insert_node( + HgPath::new(b"we/p"), + DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0 + } + ), + None + ); + dbg!(&tree); + assert!(tree.get_node(HgPath::new(b"we")).is_some()); + let entry = DirstateEntry { + state: EntryState::Merged, + mode: 41, + mtime: 42, + size: 43, + }; + assert_eq!(tree.insert_node(HgPath::new(b"foo/bar"), entry), None); + assert_eq!( + tree.get_node(HgPath::new(b"foo/bar")), + Some(&Node { + kind: NodeKind::File(File { + was_directory: None, + entry + }) + }) + ); + // We didn't override the first entry we made + assert!(tree.get_node(HgPath::new(b"we")).is_some(),); + // Inserting the same key again + assert_eq!( + tree.insert_node(HgPath::new(b"foo/bar"), entry), + Some(Node { + kind: NodeKind::File(File { + was_directory: None, + entry + }), + }) + ); + // Inserting the two levels deep + assert_eq!(tree.insert_node(HgPath::new(b"foo/bar/baz"), entry), None); + // Getting a file "inside a file" should return `None` + assert_eq!(tree.get_node(HgPath::new(b"foo/bar/baz/bap"),), None); + + assert_eq!( + tree.insert_node(HgPath::new(b"wasdir/subfile"), entry), + None, + ); + let removed_entry = DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 0, + size: 0, + }; + assert!(tree + .insert_node(HgPath::new(b"wasdir"), removed_entry) + .is_some()); + + assert_eq!( + tree.get_node(HgPath::new(b"wasdir")), + Some(&Node { + kind: NodeKind::File(File { + was_directory: Some(Box::new(Directory { + was_file: None, + children: [( + b"subfile".to_vec(), + Node { + kind: NodeKind::File(File { + was_directory: None, + entry, + }) + } + )] + .to_vec() + .into_iter() + .collect() + })), + entry: removed_entry + }) + }) + ); + + assert!(tree.get(HgPath::new(b"wasdir/subfile")).is_some()) + } + + #[test] + fn test_insert_removed() { + let mut tree = Tree::new(); + let entry = DirstateEntry { + state: EntryState::Merged, + mode: 1, + mtime: 2, + size: 3, + }; + let removed_entry = DirstateEntry { + state: EntryState::Removed, + mode: 10, + mtime: 20, + size: 30, + }; + assert_eq!(tree.insert_node(HgPath::new(b"foo"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), removed_entry), None); + // The insert should not turn `foo` into a directory as `foo` is not + // `Removed`. + match tree.get_node(HgPath::new(b"foo")).unwrap().kind { + NodeKind::Directory(_) => panic!("should be a file"), + NodeKind::File(_) => {} + } + + let mut tree = Tree::new(); + let entry = DirstateEntry { + state: EntryState::Merged, + mode: 1, + mtime: 2, + size: 3, + }; + let removed_entry = DirstateEntry { + state: EntryState::Removed, + mode: 10, + mtime: 20, + size: 30, + }; + // The insert *should* turn `foo` into a directory as it is `Removed`. + assert_eq!(tree.insert_node(HgPath::new(b"foo"), removed_entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), entry), None); + match tree.get_node(HgPath::new(b"foo")).unwrap().kind { + NodeKind::Directory(_) => {} + NodeKind::File(_) => panic!("should be a directory"), + } + } + + #[test] + fn test_get() { + let mut tree = Tree::new(); + let entry = DirstateEntry { + state: EntryState::Merged, + mode: 1, + mtime: 2, + size: 3, + }; + assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry)); + assert_eq!(tree.get(HgPath::new(b"a/b")), None); + assert_eq!(tree.get(HgPath::new(b"a")), None); + assert_eq!(tree.get(HgPath::new(b"a/b/c/d")), None); + let entry2 = DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 5, + size: 1, + }; + // was_directory + assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None); + assert_eq!(tree.files_count, 2); + assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2)); + assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry)); + + let mut tree = Tree::new(); + + // was_file + assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None); + assert_eq!(tree.files_count, 2); + assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2)); + } + + #[test] + fn test_get_mut() { + let mut tree = Tree::new(); + let mut entry = DirstateEntry { + state: EntryState::Merged, + mode: 1, + mtime: 2, + size: 3, + }; + assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry)); + assert_eq!(tree.get_mut(HgPath::new(b"a/b")), None); + assert_eq!(tree.get_mut(HgPath::new(b"a")), None); + assert_eq!(tree.get_mut(HgPath::new(b"a/b/c/d")), None); + let mut entry2 = DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 5, + size: 1, + }; + // was_directory + assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None); + assert_eq!(tree.files_count, 2); + assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2)); + assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry)); + + let mut tree = Tree::new(); + + // was_file + assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None); + assert_eq!(tree.files_count, 2); + assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2)); + } + + #[test] + fn test_remove() { + let mut tree = Tree::new(); + assert_eq!(tree.files_count, 0); + assert_eq!(tree.remove(HgPath::new(b"foo")), None); + assert_eq!(tree.files_count, 0); + + let entry = DirstateEntry { + state: EntryState::Normal, + mode: 0, + mtime: 0, + size: 0, + }; + assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None); + assert_eq!(tree.files_count, 1); + + assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry)); + assert_eq!(tree.files_count, 0); + + assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"a/b/y"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"a/b/z"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"x"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"y"), entry), None); + assert_eq!(tree.files_count, 5); + + assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry)); + assert_eq!(tree.files_count, 4); + assert_eq!(tree.remove(HgPath::new(b"a/b/x")), None); + assert_eq!(tree.files_count, 4); + assert_eq!(tree.remove(HgPath::new(b"a/b/y")), Some(entry)); + assert_eq!(tree.files_count, 3); + assert_eq!(tree.remove(HgPath::new(b"a/b/z")), Some(entry)); + assert_eq!(tree.files_count, 2); + + assert_eq!(tree.remove(HgPath::new(b"x")), Some(entry)); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.remove(HgPath::new(b"y")), Some(entry)); + assert_eq!(tree.files_count, 0); + + // `a` should have been cleaned up, no more files anywhere in its + // descendents + assert_eq!(tree.get_node(HgPath::new(b"a")), None); + assert_eq!(tree.root.children().unwrap().len(), 0); + + let removed_entry = DirstateEntry { + state: EntryState::Removed, + ..entry + }; + assert_eq!(tree.insert(HgPath::new(b"a"), removed_entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None); + assert_eq!(tree.files_count, 2); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a")), Some(removed_entry)); + assert_eq!(tree.files_count, 1); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry)); + assert_eq!(tree.files_count, 0); + + // The entire tree should have been cleaned up, no more files anywhere + // in its descendents + assert_eq!(tree.root.children().unwrap().len(), 0); + + let removed_entry = DirstateEntry { + state: EntryState::Removed, + ..entry + }; + assert_eq!(tree.insert(HgPath::new(b"a"), entry), None); + assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), removed_entry), None); + assert_eq!(tree.files_count, 2); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a")), Some(entry)); + assert_eq!(tree.files_count, 1); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(removed_entry)); + assert_eq!(tree.files_count, 0); + + dbg!(&tree); + // The entire tree should have been cleaned up, no more files anywhere + // in its descendents + assert_eq!(tree.root.children().unwrap().len(), 0); + + assert_eq!(tree.insert(HgPath::new(b"d"), entry), None); + assert_eq!(tree.insert(HgPath::new(b"d/d/d"), entry), None); + assert_eq!(tree.files_count, 2); + + // Deleting the nested file should not delete the top directory as it + // used to be a file + assert_eq!(tree.remove(HgPath::new(b"d/d/d")), Some(entry)); + assert_eq!(tree.files_count, 1); + assert!(tree.get_node(HgPath::new(b"d")).is_some()); + assert!(tree.remove(HgPath::new(b"d")).is_some()); + assert_eq!(tree.files_count, 0); + + // Deleting the nested file should not delete the top file (other way + // around from the last case) + assert_eq!(tree.insert(HgPath::new(b"a/a"), entry), None); + assert_eq!(tree.files_count, 1); + assert_eq!(tree.insert(HgPath::new(b"a"), entry), None); + assert_eq!(tree.files_count, 2); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a/a")), Some(entry)); + assert_eq!(tree.files_count, 1); + dbg!(&tree); + assert!(tree.get_node(HgPath::new(b"a")).is_some()); + assert!(tree.get_node(HgPath::new(b"a/a")).is_none()); + } + + #[test] + fn test_was_directory() { + let mut tree = Tree::new(); + + let entry = DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 0, + size: 0, + }; + assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None); + assert_eq!(tree.files_count, 1); + + assert!(tree.insert_node(HgPath::new(b"a"), entry).is_some()); + let new_a = tree.root.children().unwrap().get(&b"a".to_vec()).unwrap(); + + match &new_a.kind { + NodeKind::Directory(_) => panic!(), + NodeKind::File(f) => { + let dir = f.was_directory.clone().unwrap(); + let c = dir + .children + .get(&b"b".to_vec()) + .unwrap() + .children() + .unwrap() + .get(&b"c".to_vec()) + .unwrap(); + + assert_eq!( + match &c.kind { + NodeKind::Directory(_) => panic!(), + NodeKind::File(f) => f.entry, + }, + entry + ); + } + } + assert_eq!(tree.files_count, 2); + dbg!(&tree); + assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry)); + assert_eq!(tree.files_count, 1); + dbg!(&tree); + let a = tree.get_node(HgPath::new(b"a")).unwrap(); + match &a.kind { + NodeKind::Directory(_) => panic!(), + NodeKind::File(f) => { + // Directory in `was_directory` was emptied, should be removed + assert_eq!(f.was_directory, None); + } + } + } + #[test] + fn test_extend() { + let insertions = [ + ( + HgPathBuf::from_bytes(b"d"), + DirstateEntry { + state: EntryState::Added, + mode: 0, + mtime: -1, + size: -1, + }, + ), + ( + HgPathBuf::from_bytes(b"b"), + DirstateEntry { + state: EntryState::Normal, + mode: 33188, + mtime: 1599647984, + size: 2, + }, + ), + ( + HgPathBuf::from_bytes(b"a/a"), + DirstateEntry { + state: EntryState::Normal, + mode: 33188, + mtime: 1599647984, + size: 2, + }, + ), + ( + HgPathBuf::from_bytes(b"d/d/d"), + DirstateEntry { + state: EntryState::Removed, + mode: 0, + mtime: 0, + size: 0, + }, + ), + ] + .to_vec(); + let mut tree = Tree::new(); + + tree.extend(insertions.clone().into_iter()); + + for (path, _) in &insertions { + assert!(tree.contains_key(path), true); + } + assert_eq!(tree.files_count, 4); + } +}