dirstate-v2: Parse the dirstate lazily, with copy-on-write nodes
authorSimon Sapin <simon.sapin@octobus.net>
Wed, 19 May 2021 13:15:00 +0200
changeset 47337 0654b3b3d2b5
parent 47336 8d0260d0dbc9
child 47338 f27f2afb15da
dirstate-v2: Parse the dirstate lazily, with copy-on-write nodes TODO: more description Differential Revision: https://phab.mercurial-scm.org/D10751
rust/hg-core/src/dirstate_tree/dirstate_map.rs
rust/hg-core/src/dirstate_tree/on_disk.rs
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Wed May 19 13:15:00 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs	Wed May 19 13:15:00 2021 +0200
@@ -48,14 +48,17 @@
 
 pub(super) enum ChildNodes<'on_disk> {
     InMemory(FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>),
+    OnDisk(&'on_disk [on_disk::Node]),
 }
 
 pub(super) enum ChildNodesRef<'tree, 'on_disk> {
     InMemory(&'tree FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>),
+    OnDisk(&'on_disk [on_disk::Node]),
 }
 
 pub(super) enum NodeRef<'tree, 'on_disk> {
     InMemory(&'tree NodeKey<'on_disk>, &'tree Node<'on_disk>),
+    OnDisk(&'on_disk on_disk::Node),
 }
 
 impl Default for ChildNodes<'_> {
@@ -70,24 +73,42 @@
     ) -> ChildNodesRef<'tree, 'on_disk> {
         match self {
             ChildNodes::InMemory(nodes) => ChildNodesRef::InMemory(nodes),
+            ChildNodes::OnDisk(nodes) => ChildNodesRef::OnDisk(nodes),
         }
     }
 
     pub(super) fn is_empty(&self) -> bool {
         match self {
             ChildNodes::InMemory(nodes) => nodes.is_empty(),
+            ChildNodes::OnDisk(nodes) => nodes.is_empty(),
         }
     }
 
     pub(super) fn make_mut(
         &mut self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<
         &mut FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>,
         DirstateV2ParseError,
     > {
         match self {
             ChildNodes::InMemory(nodes) => Ok(nodes),
+            ChildNodes::OnDisk(nodes) => {
+                let nodes = nodes
+                    .iter()
+                    .map(|node| {
+                        Ok((
+                            node.path(on_disk)?,
+                            node.to_in_memory_node(on_disk)?,
+                        ))
+                    })
+                    .collect::<Result<_, _>>()?;
+                *self = ChildNodes::InMemory(nodes);
+                match self {
+                    ChildNodes::InMemory(nodes) => Ok(nodes),
+                    ChildNodes::OnDisk(_) => unreachable!(),
+                }
+            }
         }
     }
 }
@@ -96,12 +117,29 @@
     pub(super) fn get(
         &self,
         base_name: &HgPath,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<Option<NodeRef<'tree, 'on_disk>>, DirstateV2ParseError> {
         match self {
             ChildNodesRef::InMemory(nodes) => Ok(nodes
                 .get_key_value(base_name)
                 .map(|(k, v)| NodeRef::InMemory(k, v))),
+            ChildNodesRef::OnDisk(nodes) => {
+                let mut parse_result = Ok(());
+                let search_result = nodes.binary_search_by(|node| {
+                    match node.base_name(on_disk) {
+                        Ok(node_base_name) => node_base_name.cmp(base_name),
+                        Err(e) => {
+                            parse_result = Err(e);
+                            // Dummy comparison result, `search_result` won’t
+                            // be used since `parse_result` is an error
+                            std::cmp::Ordering::Equal
+                        }
+                    }
+                });
+                parse_result.map(|()| {
+                    search_result.ok().map(|i| NodeRef::OnDisk(&nodes[i]))
+                })
+            }
         }
     }
 
@@ -110,8 +148,11 @@
         &self,
     ) -> impl Iterator<Item = NodeRef<'tree, 'on_disk>> {
         match self {
-            ChildNodesRef::InMemory(nodes) => {
-                nodes.iter().map(|(k, v)| NodeRef::InMemory(k, v))
+            ChildNodesRef::InMemory(nodes) => itertools::Either::Left(
+                nodes.iter().map(|(k, v)| NodeRef::InMemory(k, v)),
+            ),
+            ChildNodesRef::OnDisk(nodes) => {
+                itertools::Either::Right(nodes.iter().map(NodeRef::OnDisk))
             }
         }
     }
@@ -123,9 +164,12 @@
     {
         use rayon::prelude::*;
         match self {
-            ChildNodesRef::InMemory(nodes) => {
-                nodes.par_iter().map(|(k, v)| NodeRef::InMemory(k, v))
-            }
+            ChildNodesRef::InMemory(nodes) => rayon::iter::Either::Left(
+                nodes.par_iter().map(|(k, v)| NodeRef::InMemory(k, v)),
+            ),
+            ChildNodesRef::OnDisk(nodes) => rayon::iter::Either::Right(
+                nodes.par_iter().map(NodeRef::OnDisk),
+            ),
         }
     }
 
@@ -139,6 +183,7 @@
                 fn sort_key<'a>(node: &'a NodeRef) -> &'a HgPath {
                     match node {
                         NodeRef::InMemory(path, _node) => path.base_name(),
+                        NodeRef::OnDisk(_) => unreachable!(),
                     }
                 }
                 // `sort_unstable_by_key` doesn’t allow keys borrowing from the
@@ -146,6 +191,10 @@
                 vec.sort_unstable_by(|a, b| sort_key(a).cmp(sort_key(b)));
                 vec
             }
+            ChildNodesRef::OnDisk(nodes) => {
+                // Nodes on disk are already sorted
+                nodes.iter().map(NodeRef::OnDisk).collect()
+            }
         }
     }
 }
@@ -153,61 +202,72 @@
 impl<'tree, 'on_disk> NodeRef<'tree, 'on_disk> {
     pub(super) fn full_path(
         &self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<&'tree HgPath, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(path, _node) => Ok(path.full_path()),
+            NodeRef::OnDisk(node) => node.full_path(on_disk),
         }
     }
 
     /// Returns a `Cow` that can borrow 'on_disk but is detached from 'tree
     pub(super) fn full_path_cow(
         &self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<Cow<'on_disk, HgPath>, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(path, _node) => Ok(path.full_path().clone()),
+            NodeRef::OnDisk(node) => {
+                Ok(Cow::Borrowed(node.full_path(on_disk)?))
+            }
         }
     }
 
     pub(super) fn base_name(
         &self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<&'tree HgPath, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(path, _node) => Ok(path.base_name()),
+            NodeRef::OnDisk(node) => node.base_name(on_disk),
         }
     }
 
     pub(super) fn children(
         &self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<ChildNodesRef<'tree, 'on_disk>, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(_path, node) => Ok(node.children.as_ref()),
+            NodeRef::OnDisk(node) => {
+                Ok(ChildNodesRef::OnDisk(node.children(on_disk)?))
+            }
         }
     }
 
     pub(super) fn has_copy_source(&self) -> bool {
         match self {
             NodeRef::InMemory(_path, node) => node.copy_source.is_some(),
+            NodeRef::OnDisk(node) => node.has_copy_source(),
         }
     }
 
     pub(super) fn copy_source(
         &self,
-        _on_disk: &'on_disk [u8],
+        on_disk: &'on_disk [u8],
     ) -> Result<Option<&'tree HgPath>, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(_path, node) => {
                 Ok(node.copy_source.as_ref().map(|s| &**s))
             }
+            NodeRef::OnDisk(node) => node.copy_source(on_disk),
         }
     }
 
     pub(super) fn has_entry(&self) -> bool {
         match self {
             NodeRef::InMemory(_path, node) => node.entry.is_some(),
+            NodeRef::OnDisk(node) => node.has_entry(),
         }
     }
 
@@ -216,6 +276,7 @@
     ) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
         match self {
             NodeRef::InMemory(_path, node) => Ok(node.entry),
+            NodeRef::OnDisk(node) => node.entry(),
         }
     }
 
@@ -226,12 +287,14 @@
             NodeRef::InMemory(_path, node) => {
                 Ok(node.entry.as_ref().map(|entry| entry.state))
             }
+            NodeRef::OnDisk(node) => node.state(),
         }
     }
 
     pub(super) fn tracked_descendants_count(&self) -> u32 {
         match self {
             NodeRef::InMemory(_path, node) => node.tracked_descendants_count,
+            NodeRef::OnDisk(node) => node.tracked_descendants_count.get(),
         }
     }
 }
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Wed May 19 13:15:00 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Wed May 19 13:15:00 2021 +0200
@@ -16,6 +16,7 @@
 use crate::DirstateEntry;
 use crate::DirstateError;
 use crate::DirstateParents;
+use crate::EntryState;
 use bytes_cast::unaligned::{I32Be, U32Be, U64Be};
 use bytes_cast::BytesCast;
 use std::borrow::Cow;
@@ -42,7 +43,7 @@
 
 #[derive(BytesCast)]
 #[repr(C)]
-struct Node {
+pub(super) struct Node {
     full_path: PathSlice,
 
     /// In bytes from `self.full_path.start`
@@ -51,12 +52,12 @@
     copy_source: OptPathSlice,
     entry: OptEntry,
     children: ChildNodes,
-    tracked_descendants_count: Size,
+    pub(super) tracked_descendants_count: Size,
 }
 
 /// Either nothing if `state == b'\0'`, or a dirstate entry like in the v1
 /// format
-#[derive(BytesCast)]
+#[derive(BytesCast, Copy, Clone)]
 #[repr(C)]
 struct OptEntry {
     state: u8,
@@ -149,7 +150,9 @@
     }
     let dirstate_map = DirstateMap {
         on_disk,
-        root: read_nodes(on_disk, *root)?,
+        root: dirstate_map::ChildNodes::OnDisk(read_slice::<Node>(
+            on_disk, *root,
+        )?),
         nodes_with_entry_count: nodes_with_entry_count.get(),
         nodes_with_copy_source_count: nodes_with_copy_source_count.get(),
     };
@@ -158,49 +161,96 @@
 }
 
 impl Node {
-    pub(super) fn path<'on_disk>(
+    pub(super) fn full_path<'on_disk>(
         &self,
         on_disk: &'on_disk [u8],
-    ) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
-        let full_path = read_hg_path(on_disk, self.full_path)?;
-        let base_name_start = usize::try_from(self.base_name_start.get())
-            // u32 -> usize, could only panic on a 16-bit CPU
-            .expect("dirstate-v2 base_name_start out of bounds");
-        if base_name_start < full_path.len() {
-            Ok(WithBasename::from_raw_parts(full_path, base_name_start))
+    ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
+        read_hg_path(on_disk, self.full_path)
+    }
+
+    pub(super) fn base_name_start<'on_disk>(
+        &self,
+    ) -> Result<usize, DirstateV2ParseError> {
+        let start = self.base_name_start.get();
+        if start < self.full_path.len.get() {
+            let start = usize::try_from(start)
+                // u32 -> usize, could only panic on a 16-bit CPU
+                .expect("dirstate-v2 base_name_start out of bounds");
+            Ok(start)
         } else {
             Err(DirstateV2ParseError)
         }
     }
 
+    pub(super) fn base_name<'on_disk>(
+        &self,
+        on_disk: &'on_disk [u8],
+    ) -> Result<&'on_disk HgPath, DirstateV2ParseError> {
+        let full_path = self.full_path(on_disk)?;
+        let base_name_start = self.base_name_start()?;
+        Ok(HgPath::new(&full_path.as_bytes()[base_name_start..]))
+    }
+
+    pub(super) fn path<'on_disk>(
+        &self,
+        on_disk: &'on_disk [u8],
+    ) -> Result<dirstate_map::NodeKey<'on_disk>, DirstateV2ParseError> {
+        Ok(WithBasename::from_raw_parts(
+            Cow::Borrowed(self.full_path(on_disk)?),
+            self.base_name_start()?,
+        ))
+    }
+
+    pub(super) fn has_copy_source<'on_disk>(&self) -> bool {
+        self.copy_source.start.get() != 0
+    }
+
     pub(super) fn copy_source<'on_disk>(
         &self,
         on_disk: &'on_disk [u8],
-    ) -> Result<Option<Cow<'on_disk, HgPath>>, DirstateV2ParseError> {
-        Ok(if self.copy_source.start.get() != 0 {
+    ) -> Result<Option<&'on_disk HgPath>, DirstateV2ParseError> {
+        Ok(if self.has_copy_source() {
             Some(read_hg_path(on_disk, self.copy_source)?)
         } else {
             None
         })
     }
 
+    pub(super) fn has_entry(&self) -> bool {
+        self.entry.state != b'\0'
+    }
+
+    pub(super) fn state(
+        &self,
+    ) -> Result<Option<EntryState>, DirstateV2ParseError> {
+        Ok(if self.has_entry() {
+            Some(
+                self.entry
+                    .state
+                    .try_into()
+                    .map_err(|_| DirstateV2ParseError)?,
+            )
+        } else {
+            None
+        })
+    }
+
     pub(super) fn entry(
         &self,
     ) -> Result<Option<DirstateEntry>, DirstateV2ParseError> {
-        Ok(if self.entry.state != b'\0' {
-            Some(DirstateEntry {
-                state: self
-                    .entry
-                    .state
-                    .try_into()
-                    .map_err(|_| DirstateV2ParseError)?,
-                mode: self.entry.mode.get(),
-                mtime: self.entry.mtime.get(),
-                size: self.entry.size.get(),
-            })
-        } else {
-            None
-        })
+        Ok(self.state()?.map(|state| DirstateEntry {
+            state,
+            mode: self.entry.mode.get(),
+            mtime: self.entry.mtime.get(),
+            size: self.entry.size.get(),
+        }))
+    }
+
+    pub(super) fn children<'on_disk>(
+        &self,
+        on_disk: &'on_disk [u8],
+    ) -> Result<&'on_disk [Node], DirstateV2ParseError> {
+        read_slice::<Node>(on_disk, self.children)
     }
 
     pub(super) fn to_in_memory_node<'on_disk>(
@@ -208,33 +258,22 @@
         on_disk: &'on_disk [u8],
     ) -> Result<dirstate_map::Node<'on_disk>, DirstateV2ParseError> {
         Ok(dirstate_map::Node {
-            children: read_nodes(on_disk, self.children)?,
-            copy_source: self.copy_source(on_disk)?,
+            children: dirstate_map::ChildNodes::OnDisk(
+                self.children(on_disk)?,
+            ),
+            copy_source: self.copy_source(on_disk)?.map(Cow::Borrowed),
             entry: self.entry()?,
             tracked_descendants_count: self.tracked_descendants_count.get(),
         })
     }
 }
 
-fn read_nodes(
-    on_disk: &[u8],
-    slice: ChildNodes,
-) -> Result<dirstate_map::ChildNodes, DirstateV2ParseError> {
-    read_slice::<Node>(on_disk, slice)?
-        .iter()
-        .map(|node| {
-            Ok((node.path(on_disk)?, node.to_in_memory_node(on_disk)?))
-        })
-        .collect::<Result<_, _>>()
-        .map(dirstate_map::ChildNodes::InMemory)
-}
-
 fn read_hg_path(
     on_disk: &[u8],
     slice: Slice,
-) -> Result<Cow<HgPath>, DirstateV2ParseError> {
+) -> Result<&HgPath, DirstateV2ParseError> {
     let bytes = read_slice::<u8>(on_disk, slice)?;
-    Ok(Cow::Borrowed(HgPath::new(bytes)))
+    Ok(HgPath::new(bytes))
 }
 
 fn read_slice<T>(
@@ -300,8 +339,11 @@
     // First accumulate serialized nodes in a `Vec`
     let mut on_disk_nodes = Vec::with_capacity(nodes.len());
     for node in nodes {
-        let children = node.children(dirstate_map.on_disk)?;
-        let children = write_nodes(dirstate_map, children, out)?;
+        let children = write_nodes(
+            dirstate_map,
+            node.children(dirstate_map.on_disk)?,
+            out,
+        )?;
         let full_path = node.full_path(dirstate_map.on_disk)?;
         let full_path = write_slice::<u8>(full_path.as_bytes(), out);
         let copy_source =
@@ -341,6 +383,12 @@
                     }
                 },
             },
+            NodeRef::OnDisk(node) => Node {
+                children,
+                copy_source,
+                full_path,
+                ..*node
+            },
         })
     }
     // … so we can write them contiguously