view rust/hg-core/src/dirstate/dirstate_tree/iter.rs @ 45562:b51167d70f5a

rust: add `dirstate_tree` module Mercurial needs to represent the filesystem hierarchy on which it operates, for example in the dirstate. Its current on-disk representation is an unsorted, flat structure that gets transformed in the current Rust code into a `HashMap`. This loses the hierarchical information of the dirstate, leading to some unfortunate performance and algorithmic compromises. This module adds an implementation of a radix tree that is specialized for representing the dirstate: its unit is the path component. I have made no efforts to optimize either its memory footprint or its insertion speed: they're pretty bad for now. Following will be a few patches that modify the dirstate.status logic to use that new hierarchical information, fixing issue 6335 in the same swing. Differential Revision: https://phab.mercurial-scm.org/D9085
author Raphaël Gomès <rgomes@octobus.net>
date Fri, 25 Sep 2020 17:51:34 +0200
parents
children c35db907363d
line wrap: on
line source

// iter.rs
//
// Copyright 2020, Raphaël Gomès <rgomes@octobus.net>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.

use super::node::{Node, NodeKind};
use super::tree::Tree;
use crate::dirstate::dirstate_tree::node::Directory;
use crate::dirstate::status::Dispatch;
use crate::utils::hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf};
use crate::DirstateEntry;
use std::borrow::Cow;
use std::collections::VecDeque;
use std::iter::{FromIterator, FusedIterator};
use std::path::PathBuf;

impl FromIterator<(HgPathBuf, DirstateEntry)> for Tree {
    fn from_iter<T: IntoIterator<Item = (HgPathBuf, DirstateEntry)>>(iter: T) -> Self {
        let mut tree = Self::new();
        for (path, entry) in iter {
            tree.insert(path, entry);
        }
        tree
    }
}

/// Iterator of all entries in the dirstate tree.
///
/// It has no particular ordering.
pub struct Iter<'a> {
    to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
}

impl<'a> Iter<'a> {
    pub fn new(node: &'a Node) -> Iter<'a> {
        let mut to_visit = VecDeque::new();
        to_visit.push_back((Cow::Borrowed(&b""[..]), node));
        Self { to_visit }
    }
}

impl<'a> Iterator for Iter<'a> {
    type Item = (HgPathBuf, DirstateEntry);

    fn next(&mut self) -> Option<Self::Item> {
        while let Some((base_path, node)) = self.to_visit.pop_front() {
            match &node.kind {
                NodeKind::Directory(dir) => {
                    add_children_to_visit(&mut self.to_visit, &base_path, &dir);
                    if let Some(file) = &dir.was_file {
                        return Some((HgPathBuf::from_bytes(&base_path), file.entry));
                    }
                }
                NodeKind::File(file) => {
                    if let Some(dir) = &file.was_directory {
                        add_children_to_visit(&mut self.to_visit, &base_path, &dir);
                    }
                    return Some((HgPathBuf::from_bytes(&base_path), file.entry));
                }
            }
        }
        None
    }
}

impl<'a> FusedIterator for Iter<'a> {}

/// Iterator of all entries in the dirstate tree, with a special filesystem
/// handling for the directories containing said entries.
///
/// It checks every directory on-disk to see if it has become a symlink, to
/// prevent a potential security issue.
/// Using this information, it may dispatch `status` information early: it
/// returns canonical paths along with `Shortcut`s, which are either a
/// `DirstateEntry` or a `Dispatch`, if the fate of said path has already been
/// determined.
///
/// Like `Iter`, it has no particular ordering.
pub struct FsIter<'a> {
    root_dir: PathBuf,
    to_visit: VecDeque<(Cow<'a, [u8]>, &'a Node)>,
    shortcuts: VecDeque<(HgPathBuf, StatusShortcut)>,
}

impl<'a> FsIter<'a> {
    pub fn new(node: &'a Node, root_dir: PathBuf) -> FsIter<'a> {
        let mut to_visit = VecDeque::new();
        to_visit.push_back((Cow::Borrowed(&b""[..]), node));
        Self {
            root_dir,
            to_visit,
            shortcuts: Default::default(),
        }
    }

    /// Mercurial tracks symlinks but *not* what they point to.
    /// If a directory is moved and symlinked:
    ///
    /// ```bash
    /// $ mkdir foo
    /// $ touch foo/a
    /// $ # commit...
    /// $ mv foo bar
    /// $ ln -s bar foo
    /// ```
    /// We need to dispatch the new symlink as `Unknown` and all the
    /// descendents of the directory it replace as `Deleted`.
    fn dispatch_symlinked_directory(&mut self, path: impl AsRef<HgPath>, node: &Node) {
        let path = path.as_ref();
        self.shortcuts
            .push_back((path.to_owned(), StatusShortcut::Dispatch(Dispatch::Unknown)));
        for (file, _) in node.iter() {
            self.shortcuts.push_back((
                path.join(&file),
                StatusShortcut::Dispatch(Dispatch::Deleted),
            ));
        }
    }

    /// Returns `true` if the canonical `path` of a directory corresponds to a
    /// symlink on disk. It means it was moved and symlinked after the last
    /// dirstate update.
    ///
    /// # Special cases
    ///
    /// Returns `false` for the repository root.
    /// Returns `false` on io error, error handling is outside of the iterator.
    fn directory_became_symlink(&mut self, path: &HgPath) -> bool {
        if path.is_empty() {
            return false;
        }
        let filename_as_path = match hg_path_to_path_buf(&path) {
            Ok(p) => p,
            _ => return false,
        };
        let meta = self.root_dir.join(filename_as_path).symlink_metadata();
        match meta {
            Ok(ref m) if m.file_type().is_symlink() => true,
            _ => return false,
        }
    }
}

/// Returned by `FsIter`, since the `Dispatch` of any given entry may already
/// be determined during the iteration. This is necessary for performance
/// reasons, since hierarchical information is needed to `Dispatch` an entire
/// subtree efficiently.
#[derive(Debug, Copy, Clone)]
pub enum StatusShortcut {
    /// A entry in the dirstate for further inspection
    Entry(DirstateEntry),
    /// The result of the status of the corresponding file
    Dispatch(Dispatch),
}

impl<'a> Iterator for FsIter<'a> {
    type Item = (HgPathBuf, StatusShortcut);

    fn next(&mut self) -> Option<Self::Item> {
        // If any paths have already been `Dispatch`-ed, return them
        while let Some(res) = self.shortcuts.pop_front() {
            return Some(res);
        }

        while let Some((base_path, node)) = self.to_visit.pop_front() {
            match &node.kind {
                NodeKind::Directory(dir) => {
                    let canonical_path = HgPath::new(&base_path);
                    if self.directory_became_symlink(canonical_path) {
                        // Potential security issue, don't do a normal
                        // traversal, force the results.
                        self.dispatch_symlinked_directory(canonical_path, &node);
                        continue;
                    }
                    add_children_to_visit(&mut self.to_visit, &base_path, &dir);
                    if let Some(file) = &dir.was_file {
                        return Some((
                            HgPathBuf::from_bytes(&base_path),
                            StatusShortcut::Entry(file.entry),
                        ));
                    }
                }
                NodeKind::File(file) => {
                    if let Some(dir) = &file.was_directory {
                        add_children_to_visit(&mut self.to_visit, &base_path, &dir);
                    }
                    return Some((
                        HgPathBuf::from_bytes(&base_path),
                        StatusShortcut::Entry(file.entry),
                    ));
                }
            }
        }

        None
    }
}

impl<'a> FusedIterator for FsIter<'a> {}

fn join_path<'a, 'b>(path: &'a [u8], other: &'b [u8]) -> Cow<'b, [u8]> {
    if path.is_empty() {
        other.into()
    } else {
        [path, &b"/"[..], other].concat().into()
    }
}

/// Adds all children of a given directory `dir` to the visit queue `to_visit`
/// prefixed by a `base_path`.
fn add_children_to_visit<'a>(
    to_visit: &mut VecDeque<(Cow<'a, [u8]>, &'a Node)>,
    base_path: &[u8],
    dir: &'a Directory,
) {
    to_visit.extend(dir.children.iter().map(|(path, child)| {
        let full_path = join_path(&base_path, &path);
        (Cow::from(full_path), child)
    }));
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::hg_path::HgPath;
    use crate::{EntryState, FastHashMap};
    use std::collections::HashSet;

    #[test]
    fn test_iteration() {
        let mut tree = Tree::new();

        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"foo/bar"),
                DirstateEntry {
                    state: EntryState::Merged,
                    mode: 41,
                    mtime: 42,
                    size: 43,
                }
            ),
            None
        );

        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"foo2"),
                DirstateEntry {
                    state: EntryState::Merged,
                    mode: 40,
                    mtime: 41,
                    size: 42,
                }
            ),
            None
        );

        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"foo/baz"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                }
            ),
            None
        );

        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"foo/bap/nested"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                }
            ),
            None
        );

        assert_eq!(tree.len(), 4);

        let results: HashSet<_> = tree.iter().map(|(c, _)| c.to_owned()).collect();
        dbg!(&results);
        assert!(results.contains(HgPath::new(b"foo2")));
        assert!(results.contains(HgPath::new(b"foo/bar")));
        assert!(results.contains(HgPath::new(b"foo/baz")));
        assert!(results.contains(HgPath::new(b"foo/bap/nested")));

        let mut iter = tree.iter();
        assert!(iter.next().is_some());
        assert!(iter.next().is_some());
        assert!(iter.next().is_some());
        assert!(iter.next().is_some());
        assert_eq!(None, iter.next());
        assert_eq!(None, iter.next());
        drop(iter);

        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"foo/bap/nested/a"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                }
            ),
            None
        );

        let results: FastHashMap<_, _> = tree.iter().collect();
        assert!(results.contains_key(HgPath::new(b"foo2")));
        assert!(results.contains_key(HgPath::new(b"foo/bar")));
        assert!(results.contains_key(HgPath::new(b"foo/baz")));
        // Is a dir but `was_file`, so it's listed as a removed file
        assert!(results.contains_key(HgPath::new(b"foo/bap/nested")));
        assert!(results.contains_key(HgPath::new(b"foo/bap/nested/a")));

        // insert removed file (now directory) after nested file
        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"a/a"),
                DirstateEntry {
                    state: EntryState::Normal,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                }
            ),
            None
        );

        // `insert` returns `None` for a directory
        assert_eq!(
            tree.insert(
                HgPathBuf::from_bytes(b"a"),
                DirstateEntry {
                    state: EntryState::Removed,
                    mode: 0,
                    mtime: 0,
                    size: 0,
                }
            ),
            None
        );

        let results: FastHashMap<_, _> = tree.iter().collect();
        assert!(results.contains_key(HgPath::new(b"a")));
        assert!(results.contains_key(HgPath::new(b"a/a")));
    }
}