rust: add `dirstate_tree` module
Mercurial needs to represent the filesystem hierarchy on which it operates, for
example in the dirstate. Its current on-disk representation is an unsorted, flat
structure that gets transformed in the current Rust code into a `HashMap`.
This loses the hierarchical information of the dirstate, leading to some
unfortunate performance and algorithmic compromises.
This module adds an implementation of a radix tree that is specialized for
representing the dirstate: its unit is the path component. I have made no
efforts to optimize either its memory footprint or its insertion speed: they're
pretty bad for now.
Following will be a few patches that modify the dirstate.status logic to use
that new hierarchical information, fixing issue 6335 in the same swing.
Differential Revision: https://phab.mercurial-scm.org/D9085
--- a/rust/hg-core/src/dirstate.rs Fri Jul 24 16:35:02 2020 +0200
+++ b/rust/hg-core/src/dirstate.rs Fri Sep 25 17:51:34 2020 +0200
@@ -11,6 +11,7 @@
pub mod dirs_multiset;
pub mod dirstate_map;
+pub mod dirstate_tree;
pub mod parsers;
pub mod status;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/dirstate/dirstate_tree.rs Fri Sep 25 17:51:34 2020 +0200
@@ -0,0 +1,14 @@
+// dirstate_tree.rs
+//
+// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+//! Special-case radix tree that matches a filesystem hierarchy for use in the
+//! dirstate.
+//! It has not been optimized at all yet.
+
+pub mod iter;
+pub mod node;
+pub mod tree;
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/dirstate/dirstate_tree/iter.rs Fri Sep 25 17:51:34 2020 +0200
@@ -0,0 +1,358 @@
+// iter.rs
+//
+// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use super::node::{Node, NodeKind};
+use super::tree::Tree;
+use crate::dirstate::dirstate_tree::node::Directory;
+use crate::dirstate::status::Dispatch;
+use crate::utils::hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf};
+use crate::DirstateEntry;
+use std::borrow::Cow;
+use std::collections::VecDeque;
+use std::iter::{FromIterator, FusedIterator};
+use std::path::PathBuf;
+
+impl FromIterator<(HgPathBuf, DirstateEntry)> for Tree {
+ fn from_iter<T: IntoIterator<Item = (HgPathBuf, DirstateEntry)>>(iter: T) -> Self {
+ let mut tree = Self::new();
+ for (path, entry) in iter {
+ tree.insert(path, entry);
+ }
+ tree
+ }
+}
+
+/// Iterator of all entries in the dirstate tree.
+///
+/// It has no particular ordering.
+pub struct Iter<'a> {
+ to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
+}
+
+impl<'a> Iter<'a> {
+ pub fn new(node: &'a Node) -> Iter<'a> {
+ let mut to_visit = VecDeque::new();
+ to_visit.push_back((Cow::Borrowed(&b""[..]), node));
+ Self { to_visit }
+ }
+}
+
+impl<'a> Iterator for Iter<'a> {
+ type Item = (HgPathBuf, DirstateEntry);
+
+ fn next(&mut self) -> Option<Self::Item> {
+ while let Some((base_path, node)) = self.to_visit.pop_front() {
+ match &node.kind {
+ NodeKind::Directory(dir) => {
+ add_children_to_visit(&mut self.to_visit, &base_path, &dir);
+ if let Some(file) = &dir.was_file {
+ return Some((HgPathBuf::from_bytes(&base_path), file.entry));
+ }
+ }
+ NodeKind::File(file) => {
+ if let Some(dir) = &file.was_directory {
+ add_children_to_visit(&mut self.to_visit, &base_path, &dir);
+ }
+ return Some((HgPathBuf::from_bytes(&base_path), file.entry));
+ }
+ }
+ }
+ None
+ }
+}
+
+impl<'a> FusedIterator for Iter<'a> {}
+
+/// Iterator of all entries in the dirstate tree, with a special filesystem
+/// handling for the directories containing said entries.
+///
+/// It checks every directory on-disk to see if it has become a symlink, to
+/// prevent a potential security issue.
+/// Using this information, it may dispatch `status` information early: it
+/// returns canonical paths along with `Shortcut`s, which are either a
+/// `DirstateEntry` or a `Dispatch`, if the fate of said path has already been
+/// determined.
+///
+/// Like `Iter`, it has no particular ordering.
+pub struct FsIter<'a> {
+ root_dir: PathBuf,
+ to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
+ shortcuts: VecDeque<(HgPathBuf, StatusShortcut)>,
+}
+
+impl<'a> FsIter<'a> {
+ pub fn new(node: &'a Node, root_dir: PathBuf) -> FsIter<'a> {
+ let mut to_visit = VecDeque::new();
+ to_visit.push_back((Cow::Borrowed(&b""[..]), node));
+ Self {
+ root_dir,
+ to_visit,
+ shortcuts: Default::default(),
+ }
+ }
+
+ /// Mercurial tracks symlinks but *not* what they point to.
+ /// If a directory is moved and symlinked:
+ ///
+ /// ```bash
+ /// $ mkdir foo
+ /// $ touch foo/a
+ /// $ # commit...
+ /// $ mv foo bar
+ /// $ ln -s bar foo
+ /// ```
+ /// We need to dispatch the new symlink as `Unknown` and all the
+ /// descendents of the directory it replace as `Deleted`.
+ fn dispatch_symlinked_directory(&mut self, path: impl AsRef<HgPath>, node: &Node) {
+ let path = path.as_ref();
+ self.shortcuts
+ .push_back((path.to_owned(), StatusShortcut::Dispatch(Dispatch::Unknown)));
+ for (file, _) in node.iter() {
+ self.shortcuts.push_back((
+ path.join(&file),
+ StatusShortcut::Dispatch(Dispatch::Deleted),
+ ));
+ }
+ }
+
+ /// Returns `true` if the canonical `path` of a directory corresponds to a
+ /// symlink on disk. It means it was moved and symlinked after the last
+ /// dirstate update.
+ ///
+ /// # Special cases
+ ///
+ /// Returns `false` for the repository root.
+ /// Returns `false` on io error, error handling is outside of the iterator.
+ fn directory_became_symlink(&mut self, path: &HgPath) -> bool {
+ if path.is_empty() {
+ return false;
+ }
+ let filename_as_path = match hg_path_to_path_buf(&path) {
+ Ok(p) => p,
+ _ => return false,
+ };
+ let meta = self.root_dir.join(filename_as_path).symlink_metadata();
+ match meta {
+ Ok(ref m) if m.file_type().is_symlink() => true,
+ _ => return false,
+ }
+ }
+}
+
+/// Returned by `FsIter`, since the `Dispatch` of any given entry may already
+/// be determined during the iteration. This is necessary for performance
+/// reasons, since hierarchical information is needed to `Dispatch` an entire
+/// subtree efficiently.
+#[derive(Debug, Copy, Clone)]
+pub enum StatusShortcut {
+ /// A entry in the dirstate for further inspection
+ Entry(DirstateEntry),
+ /// The result of the status of the corresponding file
+ Dispatch(Dispatch),
+}
+
+impl<'a> Iterator for FsIter<'a> {
+ type Item = (HgPathBuf, StatusShortcut);
+
+ fn next(&mut self) -> Option<Self::Item> {
+ // If any paths have already been `Dispatch`-ed, return them
+ while let Some(res) = self.shortcuts.pop_front() {
+ return Some(res);
+ }
+
+ while let Some((base_path, node)) = self.to_visit.pop_front() {
+ match &node.kind {
+ NodeKind::Directory(dir) => {
+ let canonical_path = HgPath::new(&base_path);
+ if self.directory_became_symlink(canonical_path) {
+ // Potential security issue, don't do a normal
+ // traversal, force the results.
+ self.dispatch_symlinked_directory(canonical_path, &node);
+ continue;
+ }
+ add_children_to_visit(&mut self.to_visit, &base_path, &dir);
+ if let Some(file) = &dir.was_file {
+ return Some((
+ HgPathBuf::from_bytes(&base_path),
+ StatusShortcut::Entry(file.entry),
+ ));
+ }
+ }
+ NodeKind::File(file) => {
+ if let Some(dir) = &file.was_directory {
+ add_children_to_visit(&mut self.to_visit, &base_path, &dir);
+ }
+ return Some((
+ HgPathBuf::from_bytes(&base_path),
+ StatusShortcut::Entry(file.entry),
+ ));
+ }
+ }
+ }
+
+ None
+ }
+}
+
+impl<'a> FusedIterator for FsIter<'a> {}
+
+fn join_path<'a, 'b>(path: &'a [u8], other: &'b [u8]) -> Cow<'b, [u8]> {
+ if path.is_empty() {
+ other.into()
+ } else {
+ [path, &b"/"[..], other].concat().into()
+ }
+}
+
+/// Adds all children of a given directory `dir` to the visit queue `to_visit`
+/// prefixed by a `base_path`.
+fn add_children_to_visit<'a>(
+ to_visit: &mut VecDeque<(Cow<'a, [u8]>, &'a Node)>,
+ base_path: &[u8],
+ dir: &'a Directory,
+) {
+ to_visit.extend(dir.children.iter().map(|(path, child)| {
+ let full_path = join_path(&base_path, &path);
+ (Cow::from(full_path), child)
+ }));
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::utils::hg_path::HgPath;
+ use crate::{EntryState, FastHashMap};
+ use std::collections::HashSet;
+
+ #[test]
+ fn test_iteration() {
+ let mut tree = Tree::new();
+
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"foo/bar"),
+ DirstateEntry {
+ state: EntryState::Merged,
+ mode: 41,
+ mtime: 42,
+ size: 43,
+ }
+ ),
+ None
+ );
+
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"foo2"),
+ DirstateEntry {
+ state: EntryState::Merged,
+ mode: 40,
+ mtime: 41,
+ size: 42,
+ }
+ ),
+ None
+ );
+
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"foo/baz"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ }
+ ),
+ None
+ );
+
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"foo/bap/nested"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ }
+ ),
+ None
+ );
+
+ assert_eq!(tree.len(), 4);
+
+ let results: HashSet<_> = tree.iter().map(|(c, _)| c.to_owned()).collect();
+ dbg!(&results);
+ assert!(results.contains(HgPath::new(b"foo2")));
+ assert!(results.contains(HgPath::new(b"foo/bar")));
+ assert!(results.contains(HgPath::new(b"foo/baz")));
+ assert!(results.contains(HgPath::new(b"foo/bap/nested")));
+
+ let mut iter = tree.iter();
+ assert!(iter.next().is_some());
+ assert!(iter.next().is_some());
+ assert!(iter.next().is_some());
+ assert!(iter.next().is_some());
+ assert_eq!(None, iter.next());
+ assert_eq!(None, iter.next());
+ drop(iter);
+
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"foo/bap/nested/a"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ }
+ ),
+ None
+ );
+
+ let results: FastHashMap<_, _> = tree.iter().collect();
+ assert!(results.contains_key(HgPath::new(b"foo2")));
+ assert!(results.contains_key(HgPath::new(b"foo/bar")));
+ assert!(results.contains_key(HgPath::new(b"foo/baz")));
+ // Is a dir but `was_file`, so it's listed as a removed file
+ assert!(results.contains_key(HgPath::new(b"foo/bap/nested")));
+ assert!(results.contains_key(HgPath::new(b"foo/bap/nested/a")));
+
+ // insert removed file (now directory) after nested file
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"a/a"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ }
+ ),
+ None
+ );
+
+ // `insert` returns `None` for a directory
+ assert_eq!(
+ tree.insert(
+ HgPathBuf::from_bytes(b"a"),
+ DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ }
+ ),
+ None
+ );
+
+ let results: FastHashMap<_, _> = tree.iter().collect();
+ assert!(results.contains_key(HgPath::new(b"a")));
+ assert!(results.contains_key(HgPath::new(b"a/a")));
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/dirstate/dirstate_tree/node.rs Fri Sep 25 17:51:34 2020 +0200
@@ -0,0 +1,377 @@
+// node.rs
+//
+// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use super::iter::Iter;
+use crate::utils::hg_path::HgPathBuf;
+use crate::{DirstateEntry, EntryState, FastHashMap};
+
+/// Represents a filesystem directory in the dirstate tree
+#[derive(Debug, Default, Clone, PartialEq)]
+pub struct Directory {
+ /// Contains the old file information if it existed between changesets.
+ /// Happens if a file `foo` is marked as removed, removed from the
+ /// filesystem then a directory `foo` is created and at least one of its
+ /// descendents is added to Mercurial.
+ pub(super) was_file: Option<Box<File>>,
+ pub(super) children: FastHashMap<Vec<u8>, Node>,
+}
+
+/// Represents a filesystem file (or symlink) in the dirstate tree
+#[derive(Debug, Clone, PartialEq)]
+pub struct File {
+ /// Contains the old structure if it existed between changesets.
+ /// Happens all descendents of `foo` marked as removed and removed from
+ /// the filesystem, then a file `foo` is created and added to Mercurial.
+ pub(super) was_directory: Option<Box<Directory>>,
+ pub(super) entry: DirstateEntry,
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub enum NodeKind {
+ Directory(Directory),
+ File(File),
+}
+
+#[derive(Debug, Default, Clone, PartialEq)]
+pub struct Node {
+ pub kind: NodeKind,
+}
+
+impl Default for NodeKind {
+ fn default() -> Self {
+ NodeKind::Directory(Default::default())
+ }
+}
+
+impl Node {
+ pub fn insert(&mut self, path: &[u8], new_entry: DirstateEntry) -> InsertResult {
+ let mut split = path.splitn(2, |&c| c == b'/');
+ let head = split.next().unwrap_or(b"");
+ let tail = split.next().unwrap_or(b"");
+
+ if let NodeKind::File(file) = &mut self.kind {
+ if tail.is_empty() && head.is_empty() {
+ // We're modifying the current file
+ let new = Self {
+ kind: NodeKind::File(File {
+ entry: new_entry,
+ ..file.clone()
+ }),
+ };
+ return InsertResult {
+ did_insert: false,
+ old_entry: Some(std::mem::replace(self, new)),
+ };
+ } else {
+ match file.entry.state {
+ // Only replace the current file with a directory if it's
+ // marked as `Removed`
+ EntryState::Removed => {
+ self.kind = NodeKind::Directory(Directory {
+ was_file: Some(Box::from(file.clone())),
+ children: Default::default(),
+ })
+ }
+ _ => return Node::insert_in_file(file, new_entry, head, tail),
+ }
+ }
+ }
+
+ match &mut self.kind {
+ NodeKind::Directory(directory) => {
+ return Node::insert_in_directory(directory, new_entry, head, tail);
+ }
+ NodeKind::File(_) => unreachable!("The file case has already been handled"),
+ }
+ }
+
+ /// The current file still exists and is not marked as `Removed`.
+ /// Insert the entry in its `was_directory`.
+ fn insert_in_file(
+ file: &mut File,
+ new_entry: DirstateEntry,
+ head: &[u8],
+ tail: &[u8],
+ ) -> InsertResult {
+ if let Some(d) = &mut file.was_directory {
+ Node::insert_in_directory(d, new_entry, head, tail)
+ } else {
+ let mut dir = Directory {
+ was_file: None,
+ children: FastHashMap::default(),
+ };
+ let res = Node::insert_in_directory(&mut dir, new_entry, head, tail);
+ file.was_directory = Some(Box::new(dir));
+ res
+ }
+ }
+
+ /// Insert an entry in the subtree of `directory`
+ fn insert_in_directory(
+ directory: &mut Directory,
+ new_entry: DirstateEntry,
+ head: &[u8],
+ tail: &[u8],
+ ) -> InsertResult {
+ let mut res = InsertResult::default();
+
+ if let Some(node) = directory.children.get_mut(head) {
+ // Node exists
+ match &mut node.kind {
+ NodeKind::Directory(subdir) => {
+ if tail.is_empty() {
+ let becomes_file = Self {
+ kind: NodeKind::File(File {
+ was_directory: Some(Box::from(subdir.clone())),
+ entry: new_entry,
+ }),
+ };
+ let old_entry = directory.children.insert(head.to_owned(), becomes_file);
+ return InsertResult {
+ did_insert: true,
+ old_entry,
+ };
+ } else {
+ res = node.insert(tail, new_entry);
+ }
+ }
+ NodeKind::File(_) => {
+ res = node.insert(tail, new_entry);
+ }
+ }
+ } else if tail.is_empty() {
+ // File does not already exist
+ directory.children.insert(
+ head.to_owned(),
+ Self {
+ kind: NodeKind::File(File {
+ was_directory: None,
+ entry: new_entry,
+ }),
+ },
+ );
+ res.did_insert = true;
+ } else {
+ // Directory does not already exist
+ let mut nested = Self {
+ kind: NodeKind::Directory(Directory {
+ was_file: None,
+ children: Default::default(),
+ }),
+ };
+ res = nested.insert(tail, new_entry);
+ directory.children.insert(head.to_owned(), nested);
+ }
+ res
+ }
+
+ /// Removes an entry from the tree, returns a `RemoveResult`.
+ pub fn remove(&mut self, path: &[u8]) -> RemoveResult {
+ let empty_result = RemoveResult::default();
+ if path.is_empty() {
+ return empty_result;
+ }
+ let mut split = path.splitn(2, |&c| c == b'/');
+ let head = split.next();
+ let tail = split.next().unwrap_or(b"");
+
+ let head = match head {
+ None => {
+ return empty_result;
+ }
+ Some(h) => h,
+ };
+ if head == path {
+ match &mut self.kind {
+ NodeKind::Directory(d) => {
+ return Node::remove_from_directory(head, d);
+ }
+ NodeKind::File(f) => {
+ if let Some(d) = &mut f.was_directory {
+ let RemoveResult { old_entry, .. } = Node::remove_from_directory(head, d);
+ return RemoveResult {
+ cleanup: false,
+ old_entry,
+ };
+ }
+ }
+ }
+ empty_result
+ } else {
+ // Look into the dirs
+ match &mut self.kind {
+ NodeKind::Directory(d) => {
+ if let Some(child) = d.children.get_mut(head) {
+ let mut res = child.remove(tail);
+ if res.cleanup {
+ d.children.remove(head);
+ }
+ res.cleanup = d.children.len() == 0 && d.was_file.is_none();
+ res
+ } else {
+ empty_result
+ }
+ }
+ NodeKind::File(f) => {
+ if let Some(d) = &mut f.was_directory {
+ if let Some(child) = d.children.get_mut(head) {
+ let RemoveResult { cleanup, old_entry } = child.remove(tail);
+ if cleanup {
+ d.children.remove(head);
+ }
+ if d.children.len() == 0 && d.was_file.is_none() {
+ f.was_directory = None;
+ }
+
+ return RemoveResult {
+ cleanup: false,
+ old_entry,
+ };
+ }
+ }
+ empty_result
+ }
+ }
+ }
+ }
+
+ fn remove_from_directory(head: &[u8], d: &mut Directory) -> RemoveResult {
+ if let Some(node) = d.children.get_mut(head) {
+ return match &mut node.kind {
+ NodeKind::Directory(d) => {
+ if let Some(f) = &mut d.was_file {
+ let entry = f.entry;
+ d.was_file = None;
+ RemoveResult {
+ cleanup: false,
+ old_entry: Some(entry),
+ }
+ } else {
+ RemoveResult::default()
+ }
+ }
+ NodeKind::File(f) => {
+ let entry = f.entry;
+ let mut cleanup = false;
+ match &f.was_directory {
+ None => {
+ if d.children.len() == 1 {
+ cleanup = true;
+ }
+ d.children.remove(head);
+ }
+ Some(dir) => {
+ node.kind = NodeKind::Directory(*dir.clone());
+ }
+ }
+
+ RemoveResult {
+ cleanup: cleanup,
+ old_entry: Some(entry),
+ }
+ }
+ };
+ }
+ RemoveResult::default()
+ }
+
+ pub fn get(&self, path: &[u8]) -> Option<&Node> {
+ if path.is_empty() {
+ return Some(&self);
+ }
+ let mut split = path.splitn(2, |&c| c == b'/');
+ let head = split.next();
+ let tail = split.next().unwrap_or(b"");
+
+ let head = match head {
+ None => {
+ return Some(&self);
+ }
+ Some(h) => h,
+ };
+ match &self.kind {
+ NodeKind::Directory(d) => {
+ if let Some(child) = d.children.get(head) {
+ return child.get(tail);
+ }
+ }
+ NodeKind::File(f) => {
+ if let Some(d) = &f.was_directory {
+ if let Some(child) = d.children.get(head) {
+ return child.get(tail);
+ }
+ }
+ }
+ }
+
+ None
+ }
+
+ pub fn get_mut(&mut self, path: &[u8]) -> Option<&mut NodeKind> {
+ if path.is_empty() {
+ return Some(&mut self.kind);
+ }
+ let mut split = path.splitn(2, |&c| c == b'/');
+ let head = split.next();
+ let tail = split.next().unwrap_or(b"");
+
+ let head = match head {
+ None => {
+ return Some(&mut self.kind);
+ }
+ Some(h) => h,
+ };
+ match &mut self.kind {
+ NodeKind::Directory(d) => {
+ if let Some(child) = d.children.get_mut(head) {
+ return child.get_mut(tail);
+ }
+ }
+ NodeKind::File(f) => {
+ if let Some(d) = &mut f.was_directory {
+ if let Some(child) = d.children.get_mut(head) {
+ return child.get_mut(tail);
+ }
+ }
+ }
+ }
+
+ None
+ }
+
+ pub fn iter(&self) -> Iter {
+ Iter::new(self)
+ }
+}
+
+/// Information returned to the caller of an `insert` operation for integrity.
+#[derive(Debug, Default)]
+pub struct InsertResult {
+ /// Whether the insertion resulted in an actual insertion and not an
+ /// update
+ pub(super) did_insert: bool,
+ /// The entry that was replaced, if it exists
+ pub(super) old_entry: Option<Node>,
+}
+
+/// Information returned to the caller of a `remove` operation integrity.
+#[derive(Debug, Default)]
+pub struct RemoveResult {
+ /// If the caller needs to remove the current node
+ pub(super) cleanup: bool,
+ /// The entry that was replaced, if it exists
+ pub(super) old_entry: Option<DirstateEntry>,
+}
+
+impl<'a> IntoIterator for &'a Node {
+ type Item = (HgPathBuf, DirstateEntry);
+ type IntoIter = Iter<'a>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/dirstate/dirstate_tree/tree.rs Fri Sep 25 17:51:34 2020 +0200
@@ -0,0 +1,661 @@
+// tree.rs
+//
+// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use super::iter::Iter;
+use super::node::{Directory, Node, NodeKind};
+use crate::dirstate::dirstate_tree::iter::FsIter;
+use crate::dirstate::dirstate_tree::node::{InsertResult, RemoveResult};
+use crate::utils::hg_path::{HgPath, HgPathBuf};
+use crate::DirstateEntry;
+use std::path::PathBuf;
+
+/// A specialized tree to represent the Mercurial dirstate.
+///
+/// # Advantages over a flat structure
+///
+/// The dirstate is inherently hierarchical, since it's a representation of the
+/// file structure of the project. The current dirstate format is flat, and
+/// while that affords us potentially great (unordered) iteration speeds, the
+/// need to retrieve a given path is great enough that you need some kind of
+/// hashmap or tree in a lot of cases anyway.
+///
+/// Going with a tree allows us to be smarter:
+/// - Skipping an ignored directory means we don't visit its entire subtree
+/// - Security auditing does not need to reconstruct paths backwards to check
+/// for symlinked directories, this can be done during the iteration in a
+/// very efficient fashion
+/// - We don't need to build the directory information in another struct,
+/// simplifying the code a lot, reducing the memory footprint and
+/// potentially going faster depending on the implementation.
+/// - We can use it to store a (platform-dependent) caching mechanism [1]
+/// - And probably other types of optimizations.
+///
+/// Only the first two items in this list are implemented as of this commit.
+///
+/// [1]: https://www.mercurial-scm.org/wiki/DirsCachePlan
+///
+///
+/// # Structure
+///
+/// It's a prefix (radix) tree with no fixed arity, with a granularity of a
+/// folder, allowing it to mimic a filesystem hierarchy:
+///
+/// ```text
+/// foo/bar
+/// foo/baz
+/// test
+/// ```
+/// Will be represented (simplified) by:
+///
+/// ```text
+/// Directory(root):
+/// - File("test")
+/// - Directory("foo"):
+/// - File("bar")
+/// - File("baz")
+/// ```
+///
+/// Moreover, it is special-cased for storing the dirstate and as such handles
+/// cases that a simple `HashMap` would handle, but while preserving the
+/// hierarchy.
+/// For example:
+///
+/// ```shell
+/// $ touch foo
+/// $ hg add foo
+/// $ hg commit -m "foo"
+/// $ hg remove foo
+/// $ rm foo
+/// $ mkdir foo
+/// $ touch foo/a
+/// $ hg add foo/a
+/// $ hg status
+/// R foo
+/// A foo/a
+/// ```
+/// To represent this in a tree, one needs to keep track of whether any given
+/// file was a directory and whether any given directory was a file at the last
+/// dirstate update. This tree stores that information, but only in the right
+/// circumstances by respecting the high-level rules that prevent nonsensical
+/// structures to exist:
+/// - a file can only be added as a child of another file if the latter is
+/// marked as `Removed`
+/// - a file cannot replace a folder unless all its descendents are removed
+///
+/// This second rule is not checked by the tree for performance reasons, and
+/// because high-level logic already prevents that state from happening.
+///
+/// # Ordering
+///
+/// It makes no guarantee of ordering for now.
+#[derive(Debug, Default, Clone, PartialEq)]
+pub struct Tree {
+ pub root: Node,
+ files_count: usize,
+}
+
+impl Tree {
+ pub fn new() -> Self {
+ Self {
+ root: Node {
+ kind: NodeKind::Directory(Directory {
+ was_file: None,
+ children: Default::default(),
+ }),
+ },
+ files_count: 0,
+ }
+ }
+
+ /// How many files (not directories) are stored in the tree, including ones
+ /// marked as `Removed`.
+ pub fn len(&self) -> usize {
+ self.files_count
+ }
+
+ pub fn is_empty(&self) -> bool {
+ self.len() == 0
+ }
+
+ /// Inserts a file in the tree and returns the previous entry if any.
+ pub fn insert(
+ &mut self,
+ path: impl AsRef<HgPath>,
+ kind: DirstateEntry,
+ ) -> Option<DirstateEntry> {
+ let old = self.insert_node(path, kind);
+ match old?.kind {
+ NodeKind::Directory(_) => None,
+ NodeKind::File(f) => Some(f.entry),
+ }
+ }
+
+ /// Low-level insertion method that returns the previous node (directories
+ /// included).
+ fn insert_node(&mut self, path: impl AsRef<HgPath>, kind: DirstateEntry) -> Option<Node> {
+ let InsertResult {
+ did_insert,
+ old_entry,
+ } = self.root.insert(path.as_ref().as_bytes(), kind);
+ self.files_count += if did_insert { 1 } else { 0 };
+ old_entry
+ }
+
+ /// Returns a reference to a node if it exists.
+ pub fn get_node(&self, path: impl AsRef<HgPath>) -> Option<&Node> {
+ self.root.get(path.as_ref().as_bytes())
+ }
+
+ /// Returns a reference to the entry corresponding to `path` if it exists.
+ pub fn get(&self, path: impl AsRef<HgPath>) -> Option<&DirstateEntry> {
+ if let Some(node) = self.get_node(&path) {
+ return match &node.kind {
+ NodeKind::Directory(d) => d.was_file.as_ref().map(|f| &f.entry),
+ NodeKind::File(f) => Some(&f.entry),
+ };
+ }
+ None
+ }
+
+ /// Returns `true` if an entry is found for the given `path`.
+ pub fn contains_key(&self, path: impl AsRef<HgPath>) -> bool {
+ self.get(path).is_some()
+ }
+
+ /// Returns a mutable reference to the entry corresponding to `path` if it
+ /// exists.
+ pub fn get_mut(&mut self, path: impl AsRef<HgPath>) -> Option<&mut DirstateEntry> {
+ if let Some(kind) = self.root.get_mut(path.as_ref().as_bytes()) {
+ return match kind {
+ NodeKind::Directory(d) => d.was_file.as_mut().map(|f| &mut f.entry),
+ NodeKind::File(f) => Some(&mut f.entry),
+ };
+ }
+ None
+ }
+
+ /// Returns an iterator over the paths and corresponding entries in the
+ /// tree.
+ pub fn iter(&self) -> Iter {
+ Iter::new(&self.root)
+ }
+
+ /// Returns an iterator of all entries in the tree, with a special
+ /// filesystem handling for the directories containing said entries. See
+ /// the documentation of `FsIter` for more.
+ pub fn fs_iter(&self, root_dir: PathBuf) -> FsIter {
+ FsIter::new(&self.root, root_dir)
+ }
+
+ /// Remove the entry at `path` and returns it, if it exists.
+ pub fn remove(&mut self, path: impl AsRef<HgPath>) -> Option<DirstateEntry> {
+ let RemoveResult { old_entry, .. } = self.root.remove(path.as_ref().as_bytes());
+ self.files_count = self
+ .files_count
+ .checked_sub(if old_entry.is_some() { 1 } else { 0 })
+ .expect("removed too many files");
+ old_entry
+ }
+}
+
+impl<P: AsRef<HgPath>> Extend<(P, DirstateEntry)> for Tree {
+ fn extend<T: IntoIterator<Item = (P, DirstateEntry)>>(&mut self, iter: T) {
+ for (path, entry) in iter {
+ self.insert(path, entry);
+ }
+ }
+}
+
+impl<'a> IntoIterator for &'a Tree {
+ type Item = (HgPathBuf, DirstateEntry);
+ type IntoIter = Iter<'a>;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use crate::dirstate::dirstate_tree::node::File;
+ use crate::{EntryState, FastHashMap};
+ use pretty_assertions::assert_eq;
+
+ impl Node {
+ /// Shortcut for getting children of a node in tests.
+ fn children(&self) -> Option<&FastHashMap<Vec<u8>, Node>> {
+ match &self.kind {
+ NodeKind::Directory(d) => Some(&d.children),
+ NodeKind::File(_) => None,
+ }
+ }
+ }
+
+ #[test]
+ fn test_dirstate_tree() {
+ let mut tree = Tree::new();
+
+ assert_eq!(
+ tree.insert_node(
+ HgPath::new(b"we/p"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0
+ }
+ ),
+ None
+ );
+ dbg!(&tree);
+ assert!(tree.get_node(HgPath::new(b"we")).is_some());
+ let entry = DirstateEntry {
+ state: EntryState::Merged,
+ mode: 41,
+ mtime: 42,
+ size: 43,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"foo/bar"), entry), None);
+ assert_eq!(
+ tree.get_node(HgPath::new(b"foo/bar")),
+ Some(&Node {
+ kind: NodeKind::File(File {
+ was_directory: None,
+ entry
+ })
+ })
+ );
+ // We didn't override the first entry we made
+ assert!(tree.get_node(HgPath::new(b"we")).is_some(),);
+ // Inserting the same key again
+ assert_eq!(
+ tree.insert_node(HgPath::new(b"foo/bar"), entry),
+ Some(Node {
+ kind: NodeKind::File(File {
+ was_directory: None,
+ entry
+ }),
+ })
+ );
+ // Inserting the two levels deep
+ assert_eq!(tree.insert_node(HgPath::new(b"foo/bar/baz"), entry), None);
+ // Getting a file "inside a file" should return `None`
+ assert_eq!(tree.get_node(HgPath::new(b"foo/bar/baz/bap"),), None);
+
+ assert_eq!(
+ tree.insert_node(HgPath::new(b"wasdir/subfile"), entry),
+ None,
+ );
+ let removed_entry = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ };
+ assert!(tree
+ .insert_node(HgPath::new(b"wasdir"), removed_entry)
+ .is_some());
+
+ assert_eq!(
+ tree.get_node(HgPath::new(b"wasdir")),
+ Some(&Node {
+ kind: NodeKind::File(File {
+ was_directory: Some(Box::new(Directory {
+ was_file: None,
+ children: [(
+ b"subfile".to_vec(),
+ Node {
+ kind: NodeKind::File(File {
+ was_directory: None,
+ entry,
+ })
+ }
+ )]
+ .to_vec()
+ .into_iter()
+ .collect()
+ })),
+ entry: removed_entry
+ })
+ })
+ );
+
+ assert!(tree.get(HgPath::new(b"wasdir/subfile")).is_some())
+ }
+
+ #[test]
+ fn test_insert_removed() {
+ let mut tree = Tree::new();
+ let entry = DirstateEntry {
+ state: EntryState::Merged,
+ mode: 1,
+ mtime: 2,
+ size: 3,
+ };
+ let removed_entry = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 10,
+ mtime: 20,
+ size: 30,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"foo"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), removed_entry), None);
+ // The insert should not turn `foo` into a directory as `foo` is not
+ // `Removed`.
+ match tree.get_node(HgPath::new(b"foo")).unwrap().kind {
+ NodeKind::Directory(_) => panic!("should be a file"),
+ NodeKind::File(_) => {}
+ }
+
+ let mut tree = Tree::new();
+ let entry = DirstateEntry {
+ state: EntryState::Merged,
+ mode: 1,
+ mtime: 2,
+ size: 3,
+ };
+ let removed_entry = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 10,
+ mtime: 20,
+ size: 30,
+ };
+ // The insert *should* turn `foo` into a directory as it is `Removed`.
+ assert_eq!(tree.insert_node(HgPath::new(b"foo"), removed_entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"foo/a"), entry), None);
+ match tree.get_node(HgPath::new(b"foo")).unwrap().kind {
+ NodeKind::Directory(_) => {}
+ NodeKind::File(_) => panic!("should be a directory"),
+ }
+ }
+
+ #[test]
+ fn test_get() {
+ let mut tree = Tree::new();
+ let entry = DirstateEntry {
+ state: EntryState::Merged,
+ mode: 1,
+ mtime: 2,
+ size: 3,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry));
+ assert_eq!(tree.get(HgPath::new(b"a/b")), None);
+ assert_eq!(tree.get(HgPath::new(b"a")), None);
+ assert_eq!(tree.get(HgPath::new(b"a/b/c/d")), None);
+ let entry2 = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 5,
+ size: 1,
+ };
+ // was_directory
+ assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None);
+ assert_eq!(tree.files_count, 2);
+ assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2));
+ assert_eq!(tree.get(HgPath::new(b"a/b/c")), Some(&entry));
+
+ let mut tree = Tree::new();
+
+ // was_file
+ assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None);
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None);
+ assert_eq!(tree.files_count, 2);
+ assert_eq!(tree.get(HgPath::new(b"a/b")), Some(&entry2));
+ }
+
+ #[test]
+ fn test_get_mut() {
+ let mut tree = Tree::new();
+ let mut entry = DirstateEntry {
+ state: EntryState::Merged,
+ mode: 1,
+ mtime: 2,
+ size: 3,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry));
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b")), None);
+ assert_eq!(tree.get_mut(HgPath::new(b"a")), None);
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b/c/d")), None);
+ let mut entry2 = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 5,
+ size: 1,
+ };
+ // was_directory
+ assert_eq!(tree.insert(HgPath::new(b"a/b"), entry2), None);
+ assert_eq!(tree.files_count, 2);
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2));
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b/c")), Some(&mut entry));
+
+ let mut tree = Tree::new();
+
+ // was_file
+ assert_eq!(tree.insert_node(HgPath::new(b"a"), entry), None);
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b"), entry2), None);
+ assert_eq!(tree.files_count, 2);
+ assert_eq!(tree.get_mut(HgPath::new(b"a/b")), Some(&mut entry2));
+ }
+
+ #[test]
+ fn test_remove() {
+ let mut tree = Tree::new();
+ assert_eq!(tree.files_count, 0);
+ assert_eq!(tree.remove(HgPath::new(b"foo")), None);
+ assert_eq!(tree.files_count, 0);
+
+ let entry = DirstateEntry {
+ state: EntryState::Normal,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
+ assert_eq!(tree.files_count, 1);
+
+ assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry));
+ assert_eq!(tree.files_count, 0);
+
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/y"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/z"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"x"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"y"), entry), None);
+ assert_eq!(tree.files_count, 5);
+
+ assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry));
+ assert_eq!(tree.files_count, 4);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/x")), None);
+ assert_eq!(tree.files_count, 4);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/y")), Some(entry));
+ assert_eq!(tree.files_count, 3);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/z")), Some(entry));
+ assert_eq!(tree.files_count, 2);
+
+ assert_eq!(tree.remove(HgPath::new(b"x")), Some(entry));
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.remove(HgPath::new(b"y")), Some(entry));
+ assert_eq!(tree.files_count, 0);
+
+ // `a` should have been cleaned up, no more files anywhere in its
+ // descendents
+ assert_eq!(tree.get_node(HgPath::new(b"a")), None);
+ assert_eq!(tree.root.children().unwrap().len(), 0);
+
+ let removed_entry = DirstateEntry {
+ state: EntryState::Removed,
+ ..entry
+ };
+ assert_eq!(tree.insert(HgPath::new(b"a"), removed_entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), entry), None);
+ assert_eq!(tree.files_count, 2);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a")), Some(removed_entry));
+ assert_eq!(tree.files_count, 1);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(entry));
+ assert_eq!(tree.files_count, 0);
+
+ // The entire tree should have been cleaned up, no more files anywhere
+ // in its descendents
+ assert_eq!(tree.root.children().unwrap().len(), 0);
+
+ let removed_entry = DirstateEntry {
+ state: EntryState::Removed,
+ ..entry
+ };
+ assert_eq!(tree.insert(HgPath::new(b"a"), entry), None);
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/x"), removed_entry), None);
+ assert_eq!(tree.files_count, 2);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a")), Some(entry));
+ assert_eq!(tree.files_count, 1);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/x")), Some(removed_entry));
+ assert_eq!(tree.files_count, 0);
+
+ dbg!(&tree);
+ // The entire tree should have been cleaned up, no more files anywhere
+ // in its descendents
+ assert_eq!(tree.root.children().unwrap().len(), 0);
+
+ assert_eq!(tree.insert(HgPath::new(b"d"), entry), None);
+ assert_eq!(tree.insert(HgPath::new(b"d/d/d"), entry), None);
+ assert_eq!(tree.files_count, 2);
+
+ // Deleting the nested file should not delete the top directory as it
+ // used to be a file
+ assert_eq!(tree.remove(HgPath::new(b"d/d/d")), Some(entry));
+ assert_eq!(tree.files_count, 1);
+ assert!(tree.get_node(HgPath::new(b"d")).is_some());
+ assert!(tree.remove(HgPath::new(b"d")).is_some());
+ assert_eq!(tree.files_count, 0);
+
+ // Deleting the nested file should not delete the top file (other way
+ // around from the last case)
+ assert_eq!(tree.insert(HgPath::new(b"a/a"), entry), None);
+ assert_eq!(tree.files_count, 1);
+ assert_eq!(tree.insert(HgPath::new(b"a"), entry), None);
+ assert_eq!(tree.files_count, 2);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a/a")), Some(entry));
+ assert_eq!(tree.files_count, 1);
+ dbg!(&tree);
+ assert!(tree.get_node(HgPath::new(b"a")).is_some());
+ assert!(tree.get_node(HgPath::new(b"a/a")).is_none());
+ }
+
+ #[test]
+ fn test_was_directory() {
+ let mut tree = Tree::new();
+
+ let entry = DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ };
+ assert_eq!(tree.insert_node(HgPath::new(b"a/b/c"), entry), None);
+ assert_eq!(tree.files_count, 1);
+
+ assert!(tree.insert_node(HgPath::new(b"a"), entry).is_some());
+ let new_a = tree.root.children().unwrap().get(&b"a".to_vec()).unwrap();
+
+ match &new_a.kind {
+ NodeKind::Directory(_) => panic!(),
+ NodeKind::File(f) => {
+ let dir = f.was_directory.clone().unwrap();
+ let c = dir
+ .children
+ .get(&b"b".to_vec())
+ .unwrap()
+ .children()
+ .unwrap()
+ .get(&b"c".to_vec())
+ .unwrap();
+
+ assert_eq!(
+ match &c.kind {
+ NodeKind::Directory(_) => panic!(),
+ NodeKind::File(f) => f.entry,
+ },
+ entry
+ );
+ }
+ }
+ assert_eq!(tree.files_count, 2);
+ dbg!(&tree);
+ assert_eq!(tree.remove(HgPath::new(b"a/b/c")), Some(entry));
+ assert_eq!(tree.files_count, 1);
+ dbg!(&tree);
+ let a = tree.get_node(HgPath::new(b"a")).unwrap();
+ match &a.kind {
+ NodeKind::Directory(_) => panic!(),
+ NodeKind::File(f) => {
+ // Directory in `was_directory` was emptied, should be removed
+ assert_eq!(f.was_directory, None);
+ }
+ }
+ }
+ #[test]
+ fn test_extend() {
+ let insertions = [
+ (
+ HgPathBuf::from_bytes(b"d"),
+ DirstateEntry {
+ state: EntryState::Added,
+ mode: 0,
+ mtime: -1,
+ size: -1,
+ },
+ ),
+ (
+ HgPathBuf::from_bytes(b"b"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 33188,
+ mtime: 1599647984,
+ size: 2,
+ },
+ ),
+ (
+ HgPathBuf::from_bytes(b"a/a"),
+ DirstateEntry {
+ state: EntryState::Normal,
+ mode: 33188,
+ mtime: 1599647984,
+ size: 2,
+ },
+ ),
+ (
+ HgPathBuf::from_bytes(b"d/d/d"),
+ DirstateEntry {
+ state: EntryState::Removed,
+ mode: 0,
+ mtime: 0,
+ size: 0,
+ },
+ ),
+ ]
+ .to_vec();
+ let mut tree = Tree::new();
+
+ tree.extend(insertions.clone().into_iter());
+
+ for (path, _) in &insertions {
+ assert!(tree.contains_key(path), true);
+ }
+ assert_eq!(tree.files_count, 4);
+ }
+}