dirstate-v2: Parse the dirstate lazily, with copy-on-write nodes
TODO: more description
Differential Revision: https://phab.mercurial-scm.org/D10751
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs Wed May 19 13:15:00 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs Wed May 19 13:15:00 2021 +0200
@@ -48,14 +48,17 @@
pub(super) enum ChildNodes<'on_disk> {
InMemory(FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>),
+ OnDisk(&'on_disk [on_disk::Node]),
}
pub(super) enum ChildNodesRef<'tree, 'on_disk> {
InMemory(&'tree FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>),
+ OnDisk(&'on_disk [on_disk::Node]),
}
pub(super) enum NodeRef<'tree, 'on_disk> {
InMemory(&'tree NodeKey<'on_disk>, &'tree Node<'on_disk>),
+ OnDisk(&'on_disk on_disk::Node),
}
impl Default for ChildNodes<'_> {
@@ -70,24 +73,42 @@
) -> ChildNodesRef<'tree, 'on_disk> {
match self {
ChildNodes::InMemory(nodes) => ChildNodesRef::InMemory(nodes),
+ ChildNodes::OnDisk(nodes) => ChildNodesRef::OnDisk(nodes),
}
}
pub(super) fn is_empty(&self) -> bool {
match self {
ChildNodes::InMemory(nodes) => nodes.is_empty(),
+ ChildNodes::OnDisk(nodes) => nodes.is_empty(),
}
}
pub(super) fn make_mut(
&mut self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<
&mut FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>,
DirstateV2ParseError,
> {
match self {
ChildNodes::InMemory(nodes) => Ok(nodes),
+ ChildNodes::OnDisk(nodes) => {
+ let nodes = nodes
+ .iter()
+ .map(|node| {
+ Ok((
+ node.path(on_disk)?,
+ node.to_in_memory_node(on_disk)?,
+ ))
+ })
+ .collect::<Result<_, _>>()?;
+ *self = ChildNodes::InMemory(nodes);
+ match self {
+ ChildNodes::InMemory(nodes) => Ok(nodes),
+ ChildNodes::OnDisk(_) => unreachable!(),
+ }
+ }
}
}
}
@@ -96,12 +117,29 @@
pub(super) fn get(
&self,
base_name: &HgPath,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<Option<NodeRef<'tree, 'on_disk>>, DirstateV2ParseError> {
match self {
ChildNodesRef::InMemory(nodes) => Ok(nodes
.get_key_value(base_name)
.map(|(k, v)| NodeRef::InMemory(k, v))),
+ ChildNodesRef::OnDisk(nodes) => {
+ let mut parse_result = Ok(());
+ let search_result = nodes.binary_search_by(|node| {
+ match node.base_name(on_disk) {
+ Ok(node_base_name) => node_base_name.cmp(base_name),
+ Err(e) => {
+ parse_result = Err(e);
+ // Dummy comparison result, `search_result` won’t
+ // be used since `parse_result` is an error
+ std::cmp::Ordering::Equal
+ }
+ }
+ });
+ parse_result.map(|()| {
+ search_result.ok().map(|i| NodeRef::OnDisk(&nodes[i]))
+ })
+ }
}
}
@@ -110,8 +148,11 @@
&self,
) -> impl Iterator<Item = NodeRef<'tree, 'on_disk>> {
match self {
- ChildNodesRef::InMemory(nodes) => {
- nodes.iter().map(|(k, v)| NodeRef::InMemory(k, v))
+ ChildNodesRef::InMemory(nodes) => itertools::Either::Left(
+ nodes.iter().map(|(k, v)| NodeRef::InMemory(k, v)),
+ ),
+ ChildNodesRef::OnDisk(nodes) => {
+ itertools::Either::Right(nodes.iter().map(NodeRef::OnDisk))
}
}
}
@@ -123,9 +164,12 @@
{
use rayon::prelude::*;
match self {
- ChildNodesRef::InMemory(nodes) => {
- nodes.par_iter().map(|(k, v)| NodeRef::InMemory(k, v))
- }
+ ChildNodesRef::InMemory(nodes) => rayon::iter::Either::Left(
+ nodes.par_iter().map(|(k, v)| NodeRef::InMemory(k, v)),
+ ),
+ ChildNodesRef::OnDisk(nodes) => rayon::iter::Either::Right(
+ nodes.par_iter().map(NodeRef::OnDisk),
+ ),
}
}
@@ -139,6 +183,7 @@
fn sort_key<'a>(node: &'a NodeRef) -> &'a HgPath {
match node {
NodeRef::InMemory(path, _node) => path.base_name(),
+ NodeRef::OnDisk(_) => unreachable!(),
}
}
// `sort_unstable_by_key` doesn’t allow keys borrowing from the
@@ -146,6 +191,10 @@
vec.sort_unstable_by(|a, b| sort_key(a).cmp(sort_key(b)));
vec
}
+ ChildNodesRef::OnDisk(nodes) => {
+ // Nodes on disk are already sorted
+ nodes.iter().map(NodeRef::OnDisk).collect()
+ }
}
}
}
@@ -153,61 +202,72 @@
impl<'tree, 'on_disk> NodeRef<'tree, 'on_disk> {
pub(super) fn full_path(
&self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<&'tree HgPath, DirstateV2ParseError> {
match self {
NodeRef::InMemory(path, _node) => Ok(path.full_path()),
+ NodeRef::OnDisk(node) => node.full_path(on_disk),
}
}
/// Returns a `Cow` that can borrow 'on_disk but is detached from 'tree
pub(super) fn full_path_cow(
&self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<Cow<'on_disk, HgPath>, DirstateV2ParseError> {
match self {
NodeRef::InMemory(path, _node) => Ok(path.full_path().clone()),
+ NodeRef::OnDisk(node) => {
+ Ok(Cow::Borrowed(node.full_path(on_disk)?))
+ }
}
}
pub(super) fn base_name(
&self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<&'tree HgPath, DirstateV2ParseError> {
match self {
NodeRef::InMemory(path, _node) => Ok(path.base_name()),
+ NodeRef::OnDisk(node) => node.base_name(on_disk),
}
}
pub(super) fn children(
&self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<ChildNodesRef<'tree, 'on_disk>, DirstateV2ParseError> {
match self {
NodeRef::InMemory(_path, node) => Ok(node.children.as_ref()),
+ NodeRef::OnDisk(node) => {
+ Ok(ChildNodesRef::OnDisk(node.children(on_disk)?))
+ }
}
}
pub(super) fn has_copy_source(&self) -> bool {
match self {
NodeRef::InMemory(_path, node) => node.copy_source.is_some(),
+ NodeRef::OnDisk(node) => node.has_copy_source(),
}
}
pub(super) fn copy_source(
&self,
- _on_disk: &'on_disk [u8],
+ on_disk: &'on_disk [u8],
) -> Result<Option<&'tree HgPath>, DirstateV2ParseError> {
match self {
NodeRef::InMemory(_path, node) => {
Ok(node.copy_source.as_ref().map(|s| &**s))
}
+ NodeRef::OnDisk(node) => node.copy_source(on_disk),
}
}
pub(super) fn has_entry(&self) -> bool {
match self {
NodeRef::InMemory(_path, node) => node.entry.is_some(),
+ NodeRef::OnDisk(node) => node.has_entry(),
}
}
@@ -216,6 +276,7 @@
) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
match self {
NodeRef::InMemory(_path, node) => Ok(node.entry),
+ NodeRef::OnDisk(node) => node.entry(),
}
}
@@ -226,12 +287,14 @@
NodeRef::InMemory(_path, node) => {
Ok(node.entry.as_ref().map(|entry| entry.state))
}
+ NodeRef::OnDisk(node) => node.state(),
}
}
pub(super) fn tracked_descendants_count(&self) -> u32 {
match self {
NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
+ NodeRef::OnDisk(node) => node.tracked_descendants_count.get(),
}
}
}
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Wed May 19 13:15:00 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Wed May 19 13:15:00 2021 +0200
@@ -16,6 +16,7 @@
use crate::DirstateEntry;
use crate::DirstateError;
use crate::DirstateParents;
+use crate::EntryState;
use bytes_cast::unaligned::{I32Be, U32Be, U64Be};
use bytes_cast::BytesCast;
use std::borrow::Cow;
@@ -42,7 +43,7 @@
#[derive(BytesCast)]
#[repr(C)]
-struct Node {
+pub(super) struct Node {
full_path: PathSlice,
/// In bytes from `self.full_path.start`
@@ -51,12 +52,12 @@
copy_source: OptPathSlice,
entry: OptEntry,
children: ChildNodes,
- tracked_descendants_count: Size,
+ pub(super) tracked_descendants_count: Size,
}
/// Either nothing if `state == b'\0'`, or a dirstate entry like in the v1
/// format
-#[derive(BytesCast)]
+#[derive(BytesCast, Copy, Clone)]
#[repr(C)]
struct OptEntry {
state: u8,
@@ -149,7 +150,9 @@
}
let dirstate_map = DirstateMap {
on_disk,
- root: read_nodes(on_disk, *root)?,
+ root: dirstate_map::ChildNodes::OnDisk(read_slice::<Node>(
+ on_disk, *root,
+ )?),
nodes_with_entry_count: nodes_with_entry_count.get(),
nodes_with_copy_source_count: nodes_with_copy_source_count.get(),
};
@@ -158,49 +161,96 @@
}
impl Node {
- pub(super) fn path<'on_disk>(
+ pub(super) fn full_path<'on_disk>(
&self,
on_disk: &'on_disk [u8],
- ) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
- let full_path = read_hg_path(on_disk, self.full_path)?;
- let base_name_start = usize::try_from(self.base_name_start.get())
- // u32 -> usize, could only panic on a 16-bit CPU
- .expect("dirstate-v2 base_name_start out of bounds");
- if base_name_start < full_path.len() {
- Ok(WithBasename::from_raw_parts(full_path, base_name_start))
+ ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
+ read_hg_path(on_disk, self.full_path)
+ }
+
+ pub(super) fn base_name_start<'on_disk>(
+ &self,
+ ) -> Result<usize, DirstateV2ParseError> {
+ let start = self.base_name_start.get();
+ if start < self.full_path.len.get() {
+ let start = usize::try_from(start)
+ // u32 -> usize, could only panic on a 16-bit CPU
+ .expect("dirstate-v2 base_name_start out of bounds");
+ Ok(start)
} else {
Err(DirstateV2ParseError)
}
}
+ pub(super) fn base_name<'on_disk>(
+ &self,
+ on_disk: &'on_disk [u8],
+ ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
+ let full_path = self.full_path(on_disk)?;
+ let base_name_start = self.base_name_start()?;
+ Ok(HgPath::new(&full_path.as_bytes()[base_name_start..]))
+ }
+
+ pub(super) fn path<'on_disk>(
+ &self,
+ on_disk: &'on_disk [u8],
+ ) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
+ Ok(WithBasename::from_raw_parts(
+ Cow::Borrowed(self.full_path(on_disk)?),
+ self.base_name_start()?,
+ ))
+ }
+
+ pub(super) fn has_copy_source<'on_disk>(&self) -> bool {
+ self.copy_source.start.get() != 0
+ }
+
pub(super) fn copy_source<'on_disk>(
&self,
on_disk: &'on_disk [u8],
- ) -> Result<Option<Cow<'on_disk, HgPath>>, DirstateV2ParseError> {
- Ok(if self.copy_source.start.get() != 0 {
+ ) -> Result<Option<&'on_disk HgPath>, DirstateV2ParseError> {
+ Ok(if self.has_copy_source() {
Some(read_hg_path(on_disk, self.copy_source)?)
} else {
None
})
}
+ pub(super) fn has_entry(&self) -> bool {
+ self.entry.state != b'\0'
+ }
+
+ pub(super) fn state(
+ &self,
+ ) -> Result<Option<EntryState>, DirstateV2ParseError> {
+ Ok(if self.has_entry() {
+ Some(
+ self.entry
+ .state
+ .try_into()
+ .map_err(|_| DirstateV2ParseError)?,
+ )
+ } else {
+ None
+ })
+ }
+
pub(super) fn entry(
&self,
) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
- Ok(if self.entry.state != b'\0' {
- Some(DirstateEntry {
- state: self
- .entry
- .state
- .try_into()
- .map_err(|_| DirstateV2ParseError)?,
- mode: self.entry.mode.get(),
- mtime: self.entry.mtime.get(),
- size: self.entry.size.get(),
- })
- } else {
- None
- })
+ Ok(self.state()?.map(|state| DirstateEntry {
+ state,
+ mode: self.entry.mode.get(),
+ mtime: self.entry.mtime.get(),
+ size: self.entry.size.get(),
+ }))
+ }
+
+ pub(super) fn children<'on_disk>(
+ &self,
+ on_disk: &'on_disk [u8],
+ ) -> Result<&'on_disk [Node], DirstateV2ParseError> {
+ read_slice::<Node>(on_disk, self.children)
}
pub(super) fn to_in_memory_node<'on_disk>(
@@ -208,33 +258,22 @@
on_disk: &'on_disk [u8],
) -> Result<dirstate_map::Node<'on_disk>, DirstateV2ParseError> {
Ok(dirstate_map::Node {
- children: read_nodes(on_disk, self.children)?,
- copy_source: self.copy_source(on_disk)?,
+ children: dirstate_map::ChildNodes::OnDisk(
+ self.children(on_disk)?,
+ ),
+ copy_source: self.copy_source(on_disk)?.map(Cow::Borrowed),
entry: self.entry()?,
tracked_descendants_count: self.tracked_descendants_count.get(),
})
}
}
-fn read_nodes(
- on_disk: &[u8],
- slice: ChildNodes,
-) -> Result<dirstate_map::ChildNodes, DirstateV2ParseError> {
- read_slice::<Node>(on_disk, slice)?
- .iter()
- .map(|node| {
- Ok((node.path(on_disk)?, node.to_in_memory_node(on_disk)?))
- })
- .collect::<Result<_, _>>()
- .map(dirstate_map::ChildNodes::InMemory)
-}
-
fn read_hg_path(
on_disk: &[u8],
slice: Slice,
-) -> Result<Cow<HgPath>, DirstateV2ParseError> {
+) -> Result<&HgPath, DirstateV2ParseError> {
let bytes = read_slice::<u8>(on_disk, slice)?;
- Ok(Cow::Borrowed(HgPath::new(bytes)))
+ Ok(HgPath::new(bytes))
}
fn read_slice<T>(
@@ -300,8 +339,11 @@
// First accumulate serialized nodes in a `Vec`
let mut on_disk_nodes = Vec::with_capacity(nodes.len());
for node in nodes {
- let children = node.children(dirstate_map.on_disk)?;
- let children = write_nodes(dirstate_map, children, out)?;
+ let children = write_nodes(
+ dirstate_map,
+ node.children(dirstate_map.on_disk)?,
+ out,
+ )?;
let full_path = node.full_path(dirstate_map.on_disk)?;
let full_path = write_slice::<u8>(full_path.as_bytes(), out);
let copy_source =
@@ -341,6 +383,12 @@
}
},
},
+ NodeRef::OnDisk(node) => Node {
+ children,
+ copy_source,
+ full_path,
+ ..*node
+ },
})
}
// … so we can write them contiguously