view rust/hg-core/src/revlog/nodemap.rs @ 44186:796d05f3fa84

rust-nodemap: generic NodeTreeVisitor This iterator will help avoid code duplication when we'll implement `insert()`, in which we will need to traverse the node tree, and to remember the visited blocks. The structured iterator item will allow different usages from `lookup()` and the upcoming `insert()`. Differential Revision: https://phab.mercurial-scm.org/D7794
author Georges Racinet <georges.racinet@octobus.net>
date Fri, 27 Dec 2019 16:06:54 +0100
parents a19331456d48
children d2da8667125b
line wrap: on
line source

// Copyright 2018-2020 Georges Racinet <georges.racinet@octobus.net>
//           and Mercurial contributors
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
//! Indexing facilities for fast retrieval of `Revision` from `Node`
//!
//! This provides a variation on the 16-ary radix tree that is
//! provided as "nodetree" in revlog.c, ready for append-only persistence
//! on disk.
//!
//! Following existing implicit conventions, the "nodemap" terminology
//! is used in a more abstract context.

use super::{
    Node, NodeError, NodePrefix, NodePrefixRef, Revision, RevlogIndex,
};
use std::fmt;
use std::ops::Deref;
use std::ops::Index;

#[derive(Debug, PartialEq)]
pub enum NodeMapError {
    MultipleResults,
    InvalidNodePrefix(NodeError),
    /// A `Revision` stored in the nodemap could not be found in the index
    RevisionNotInIndex(Revision),
}

impl From<NodeError> for NodeMapError {
    fn from(err: NodeError) -> Self {
        NodeMapError::InvalidNodePrefix(err)
    }
}

/// Mapping system from Mercurial nodes to revision numbers.
///
/// ## `RevlogIndex` and `NodeMap`
///
/// One way to think about their relationship is that
/// the `NodeMap` is a prefix-oriented reverse index of the `Node` information
/// carried by a [`RevlogIndex`].
///
/// Many of the methods in this trait take a `RevlogIndex` argument
/// which is used for validation of their results. This index must naturally
/// be the one the `NodeMap` is about, and it must be consistent.
///
/// Notably, the `NodeMap` must not store
/// information about more `Revision` values than there are in the index.
/// In these methods, an encountered `Revision` is not in the index, a
/// [`RevisionNotInIndex`] error is returned.
///
/// In insert operations, the rule is thus that the `NodeMap` must always
/// be updated after the `RevlogIndex`
/// be updated first, and the `NodeMap` second.
///
/// [`RevisionNotInIndex`]: enum.NodeMapError.html#variant.RevisionNotInIndex
/// [`RevlogIndex`]: ../trait.RevlogIndex.html
pub trait NodeMap {
    /// Find the unique `Revision` having the given `Node`
    ///
    /// If no Revision matches the given `Node`, `Ok(None)` is returned.
    fn find_node(
        &self,
        index: &impl RevlogIndex,
        node: &Node,
    ) -> Result<Option<Revision>, NodeMapError> {
        self.find_bin(index, node.into())
    }

    /// Find the unique Revision whose `Node` starts with a given binary prefix
    ///
    /// If no Revision matches the given prefix, `Ok(None)` is returned.
    ///
    /// If several Revisions match the given prefix, a [`MultipleResults`]
    /// error is returned.
    fn find_bin<'a>(
        &self,
        idx: &impl RevlogIndex,
        prefix: NodePrefixRef<'a>,
    ) -> Result<Option<Revision>, NodeMapError>;

    /// Find the unique Revision whose `Node` hexadecimal string representation
    /// starts with a given prefix
    ///
    /// If no Revision matches the given prefix, `Ok(None)` is returned.
    ///
    /// If several Revisions match the given prefix, a [`MultipleResults`]
    /// error is returned.
    fn find_hex(
        &self,
        idx: &impl RevlogIndex,
        prefix: &str,
    ) -> Result<Option<Revision>, NodeMapError> {
        self.find_bin(idx, NodePrefix::from_hex(prefix)?.borrow())
    }
}

/// Low level NodeTree [`Blocks`] elements
///
/// These are exactly as for instance on persistent storage.
type RawElement = i32;

/// High level representation of values in NodeTree
/// [`Blocks`](struct.Block.html)
///
/// This is the high level representation that most algorithms should
/// use.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Element {
    Rev(Revision),
    Block(usize),
    None,
}

impl From<RawElement> for Element {
    /// Conversion from low level representation, after endianness conversion.
    ///
    /// See [`Block`](struct.Block.html) for explanation about the encoding.
    fn from(raw: RawElement) -> Element {
        if raw >= 0 {
            Element::Block(raw as usize)
        } else if raw == -1 {
            Element::None
        } else {
            Element::Rev(-raw - 2)
        }
    }
}

impl From<Element> for RawElement {
    fn from(element: Element) -> RawElement {
        match element {
            Element::None => 0,
            Element::Block(i) => i as RawElement,
            Element::Rev(rev) => -rev - 2,
        }
    }
}

/// A logical block of the `NodeTree`, packed with a fixed size.
///
/// These are always used in container types implementing `Index<Block>`,
/// such as `&Block`
///
/// As an array of integers, its ith element encodes that the
/// ith potential edge from the block, representing the ith hexadecimal digit
/// (nybble) `i` is either:
///
/// - absent (value -1)
/// - another `Block` in the same indexable container (value ≥ 0)
///  - a `Revision` leaf (value ≤ -2)
///
/// Endianness has to be fixed for consistency on shared storage across
/// different architectures.
///
/// A key difference with the C `nodetree` is that we need to be
/// able to represent the [`Block`] at index 0, hence -1 is the empty marker
/// rather than 0 and the `Revision` range upper limit of -2 instead of -1.
///
/// Another related difference is that `NULL_REVISION` (-1) is not
/// represented at all, because we want an immutable empty nodetree
/// to be valid.

#[derive(Clone, PartialEq)]
pub struct Block([RawElement; 16]);

impl Block {
    fn new() -> Self {
        Block([-1; 16])
    }

    fn get(&self, nybble: u8) -> Element {
        Element::from(RawElement::from_be(self.0[nybble as usize]))
    }

    fn set(&mut self, nybble: u8, element: Element) {
        self.0[nybble as usize] = RawElement::to_be(element.into())
    }
}

impl fmt::Debug for Block {
    /// sparse representation for testing and debugging purposes
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_map()
            .entries((0..16).filter_map(|i| match self.get(i) {
                Element::None => None,
                element => Some((i, element)),
            }))
            .finish()
    }
}

/// A mutable 16-radix tree with the root block logically at the end
///
/// Because of the append only nature of our node trees, we need to
/// keep the original untouched and store new blocks separately.
///
/// The mutable root `Block` is kept apart so that we don't have to rebump
/// it on each insertion.
pub struct NodeTree {
    readonly: Box<dyn Deref<Target = [Block]> + Send>,
    growable: Vec<Block>,
    root: Block,
}

impl Index<usize> for NodeTree {
    type Output = Block;

    fn index(&self, i: usize) -> &Block {
        let ro_len = self.readonly.len();
        if i < ro_len {
            &self.readonly[i]
        } else if i == ro_len + self.growable.len() {
            &self.root
        } else {
            &self.growable[i - ro_len]
        }
    }
}

/// Return `None` unless the `Node` for `rev` has given prefix in `index`.
fn has_prefix_or_none<'p>(
    idx: &impl RevlogIndex,
    prefix: NodePrefixRef<'p>,
    rev: Revision,
) -> Result<Option<Revision>, NodeMapError> {
    idx.node(rev)
        .ok_or_else(|| NodeMapError::RevisionNotInIndex(rev))
        .map(|node| {
            if prefix.is_prefix_of(node) {
                Some(rev)
            } else {
                None
            }
        })
}

impl NodeTree {
    /// Initiate a NodeTree from an immutable slice-like of `Block`
    ///
    /// We keep `readonly` and clone its root block if it isn't empty.
    fn new(readonly: Box<dyn Deref<Target = [Block]> + Send>) -> Self {
        let root = readonly
            .last()
            .map(|b| b.clone())
            .unwrap_or_else(|| Block::new());
        NodeTree {
            readonly: readonly,
            growable: Vec::new(),
            root: root,
        }
    }

    /// Total number of blocks
    fn len(&self) -> usize {
        self.readonly.len() + self.growable.len() + 1
    }

    /// Implemented for completeness
    ///
    /// A `NodeTree` always has at least the mutable root block.
    #[allow(dead_code)]
    fn is_empty(&self) -> bool {
        false
    }

    /// Main working method for `NodeTree` searches
    ///
    /// This partial implementation lacks special cases for NULL_REVISION
    fn lookup<'p>(
        &self,
        prefix: NodePrefixRef<'p>,
    ) -> Result<Option<Revision>, NodeMapError> {
        for visit_item in self.visit(prefix) {
            if let Some(opt) = visit_item.final_revision() {
                return Ok(opt);
            }
        }
        Err(NodeMapError::MultipleResults)
    }

    fn visit<'n, 'p>(
        &'n self,
        prefix: NodePrefixRef<'p>,
    ) -> NodeTreeVisitor<'n, 'p> {
        NodeTreeVisitor {
            nt: self,
            prefix: prefix,
            visit: self.len() - 1,
            nybble_idx: 0,
            done: false,
        }
    }
}

struct NodeTreeVisitor<'n, 'p> {
    nt: &'n NodeTree,
    prefix: NodePrefixRef<'p>,
    visit: usize,
    nybble_idx: usize,
    done: bool,
}

#[derive(Debug, PartialEq, Clone)]
struct NodeTreeVisitItem {
    block_idx: usize,
    nybble: u8,
    element: Element,
}

impl<'n, 'p> Iterator for NodeTreeVisitor<'n, 'p> {
    type Item = NodeTreeVisitItem;

    fn next(&mut self) -> Option<Self::Item> {
        if self.done || self.nybble_idx >= self.prefix.len() {
            return None;
        }

        let nybble = self.prefix.get_nybble(self.nybble_idx);
        self.nybble_idx += 1;

        let visit = self.visit;
        let element = self.nt[visit].get(nybble);
        if let Element::Block(idx) = element {
            self.visit = idx;
        } else {
            self.done = true;
        }

        Some(NodeTreeVisitItem {
            block_idx: visit,
            nybble: nybble,
            element: element,
        })
    }
}

impl NodeTreeVisitItem {
    // Return `Some(opt)` if this item is final, with `opt` being the
    // `Revision` that it may represent.
    //
    // If the item is not terminal, return `None`
    fn final_revision(&self) -> Option<Option<Revision>> {
        match self.element {
            Element::Block(_) => None,
            Element::Rev(r) => Some(Some(r)),
            Element::None => Some(None),
        }
    }
}

impl From<Vec<Block>> for NodeTree {
    fn from(vec: Vec<Block>) -> Self {
        Self::new(Box::new(vec))
    }
}

impl fmt::Debug for NodeTree {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let readonly: &[Block] = &*self.readonly;
        write!(
            f,
            "readonly: {:?}, growable: {:?}, root: {:?}",
            readonly, self.growable, self.root
        )
    }
}

impl NodeMap for NodeTree {
    fn find_bin<'a>(
        &self,
        idx: &impl RevlogIndex,
        prefix: NodePrefixRef<'a>,
    ) -> Result<Option<Revision>, NodeMapError> {
        self.lookup(prefix.clone()).and_then(|opt| {
            opt.map_or(Ok(None), |rev| has_prefix_or_none(idx, prefix, rev))
        })
    }
}

#[cfg(test)]
mod tests {
    use super::NodeMapError::*;
    use super::*;
    use crate::revlog::node::{hex_pad_right, Node};
    use std::collections::HashMap;

    /// Creates a `Block` using a syntax close to the `Debug` output
    macro_rules! block {
        {$($nybble:tt : $variant:ident($val:tt)),*} => (
            {
                let mut block = Block::new();
                $(block.set($nybble, Element::$variant($val)));*;
                block
            }
        )
    }

    #[test]
    fn test_block_debug() {
        let mut block = Block::new();
        block.set(1, Element::Rev(3));
        block.set(10, Element::Block(0));
        assert_eq!(format!("{:?}", block), "{1: Rev(3), 10: Block(0)}");
    }

    #[test]
    fn test_block_macro() {
        let block = block! {5: Block(2)};
        assert_eq!(format!("{:?}", block), "{5: Block(2)}");

        let block = block! {13: Rev(15), 5: Block(2)};
        assert_eq!(format!("{:?}", block), "{5: Block(2), 13: Rev(15)}");
    }

    #[test]
    fn test_raw_block() {
        let mut raw = [-1; 16];
        raw[0] = 0;
        raw[1] = RawElement::to_be(15);
        raw[2] = RawElement::to_be(-2);
        raw[3] = RawElement::to_be(-1);
        raw[4] = RawElement::to_be(-3);
        let block = Block(raw);
        assert_eq!(block.get(0), Element::Block(0));
        assert_eq!(block.get(1), Element::Block(15));
        assert_eq!(block.get(3), Element::None);
        assert_eq!(block.get(2), Element::Rev(0));
        assert_eq!(block.get(4), Element::Rev(1));
    }

    type TestIndex = HashMap<Revision, Node>;

    impl RevlogIndex for TestIndex {
        fn node(&self, rev: Revision) -> Option<&Node> {
            self.get(&rev)
        }

        fn len(&self) -> usize {
            self.len()
        }
    }

    /// Pad hexadecimal Node prefix with zeros on the right, then insert
    ///
    /// This avoids having to repeatedly write very long hexadecimal
    /// strings for test data, and brings actual hash size independency.
    fn pad_insert(idx: &mut TestIndex, rev: Revision, hex: &str) {
        idx.insert(rev, Node::from_hex(&hex_pad_right(hex)).unwrap());
    }

    fn sample_nodetree() -> NodeTree {
        NodeTree::from(vec![
            block![0: Rev(9)],
            block![0: Rev(0), 1: Rev(9)],
            block![0: Block(1), 1:Rev(1)],
        ])
    }

    #[test]
    fn test_nt_debug() {
        let nt = sample_nodetree();
        assert_eq!(
            format!("{:?}", nt),
            "readonly: \
             [{0: Rev(9)}, {0: Rev(0), 1: Rev(9)}, {0: Block(1), 1: Rev(1)}], \
             growable: [], \
             root: {0: Block(1), 1: Rev(1)}",
        );
    }

    #[test]
    fn test_immutable_find_simplest() -> Result<(), NodeMapError> {
        let mut idx: TestIndex = HashMap::new();
        pad_insert(&mut idx, 1, "1234deadcafe");

        let nt = NodeTree::from(vec![block! {1: Rev(1)}]);
        assert_eq!(nt.find_hex(&idx, "1")?, Some(1));
        assert_eq!(nt.find_hex(&idx, "12")?, Some(1));
        assert_eq!(nt.find_hex(&idx, "1234de")?, Some(1));
        assert_eq!(nt.find_hex(&idx, "1a")?, None);
        assert_eq!(nt.find_hex(&idx, "ab")?, None);

        // and with full binary Nodes
        assert_eq!(nt.find_node(&idx, idx.get(&1).unwrap())?, Some(1));
        let unknown = Node::from_hex(&hex_pad_right("3d")).unwrap();
        assert_eq!(nt.find_node(&idx, &unknown)?, None);
        Ok(())
    }

    #[test]
    fn test_immutable_find_one_jump() {
        let mut idx = TestIndex::new();
        pad_insert(&mut idx, 9, "012");
        pad_insert(&mut idx, 0, "00a");

        let nt = sample_nodetree();

        assert_eq!(nt.find_hex(&idx, "0"), Err(MultipleResults));
        assert_eq!(nt.find_hex(&idx, "01"), Ok(Some(9)));
        assert_eq!(nt.find_hex(&idx, "00"), Ok(Some(0)));
        assert_eq!(nt.find_hex(&idx, "00a"), Ok(Some(0)));
    }

    #[test]
    fn test_mutated_find() -> Result<(), NodeMapError> {
        let mut idx = TestIndex::new();
        pad_insert(&mut idx, 9, "012");
        pad_insert(&mut idx, 0, "00a");
        pad_insert(&mut idx, 2, "cafe");
        pad_insert(&mut idx, 3, "15");
        pad_insert(&mut idx, 1, "10");

        let nt = NodeTree {
            readonly: sample_nodetree().readonly,
            growable: vec![block![0: Rev(1), 5: Rev(3)]],
            root: block![0: Block(1), 1:Block(3), 12: Rev(2)],
        };
        assert_eq!(nt.find_hex(&idx, "10")?, Some(1));
        assert_eq!(nt.find_hex(&idx, "c")?, Some(2));
        assert_eq!(nt.find_hex(&idx, "00")?, Some(0));
        assert_eq!(nt.find_hex(&idx, "01")?, Some(9));
        Ok(())
    }
}