diff rust/hg-core/src/dirstate_tree/on_disk.rs @ 47682:78f7f0d490ee

dirstate-v2: Move fixed-size tree metadata into the docket file Before this changeset, the dirstate-v2 data file contained not only nodes and paths that may be reused when appending to an existing file, but also some fixed-size metadata that applies to the entire tree and was added at the end of the data file for every append. This moves that metadata into the docket file, so that repeated "append" operations without meaningful changes don’t actually need to grow any file. Differential Revision: https://phab.mercurial-scm.org/D11098
author Simon Sapin <simon.sapin@octobus.net>
date Thu, 15 Jul 2021 23:02:17 +0200
parents d94118365ec5
children 852262e2e7d9
line wrap: on
line diff
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs	Thu Jul 08 19:23:44 2021 +0200
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs	Thu Jul 15 23:02:17 2021 +0200
@@ -47,6 +47,18 @@
 pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
 pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
 
+/// Must match the constant of the same name in
+/// `mercurial/dirstateutils/docket.py`
+const TREE_METADATA_SIZE: usize = 40;
+
+/// Make sure that size-affecting changes are made knowingly
+#[allow(unused)]
+fn static_assert_size_of() {
+    let _ = std::mem::transmute::<DocketHeader, [u8; 121]>;
+    let _ = std::mem::transmute::<TreeMetadata, [u8; TREE_METADATA_SIZE]>;
+    let _ = std::mem::transmute::<Node, [u8; 43]>;
+}
+
 // Must match `HEADER` in `mercurial/dirstateutils/docket.py`
 #[derive(BytesCast)]
 #[repr(C)]
@@ -58,6 +70,8 @@
     /// Counted in bytes
     data_size: Size,
 
+    metadata: TreeMetadata,
+
     uuid_size: u8,
 }
 
@@ -68,7 +82,7 @@
 
 #[derive(BytesCast)]
 #[repr(C)]
-struct Root {
+struct TreeMetadata {
     root_nodes: ChildNodes,
     nodes_with_entry_count: Size,
     nodes_with_copy_source_count: Size,
@@ -134,7 +148,7 @@
     ///   - All direct children of this directory (as returned by
     ///     `std::fs::read_dir`) either have a corresponding dirstate node, or
     ///     are ignored by ignore patterns whose hash is in
-    ///     `Root::ignore_patterns_hash`.
+    ///     `TreeMetadata::ignore_patterns_hash`.
     ///
     ///   This means that if `std::fs::symlink_metadata` later reports the
     ///   same modification time and ignored patterns haven’t changed, a run
@@ -205,13 +219,6 @@
 /// Either nothing if `start == 0`, or a `HgPath` of `len` bytes
 type OptPathSlice = PathSlice;
 
-/// Make sure that size-affecting changes are made knowingly
-fn _static_assert_size_of() {
-    let _ = std::mem::transmute::<DocketHeader, [u8; 81]>;
-    let _ = std::mem::transmute::<Root, [u8; 40]>;
-    let _ = std::mem::transmute::<Node, [u8; 43]>;
-}
-
 /// Unexpected file format found in `.hg/dirstate` with the "v2" format.
 ///
 /// This should only happen if Mercurial is buggy or a repository is corrupted.
@@ -242,6 +249,10 @@
         DirstateParents { p1, p2 }
     }
 
+    pub fn tree_metadata(&self) -> &[u8] {
+        self.header.metadata.as_bytes()
+    }
+
     pub fn data_size(&self) -> usize {
         // This `unwrap` could only panic on a 16-bit CPU
         self.header.data_size.get().try_into().unwrap()
@@ -265,40 +276,25 @@
     }
 }
 
-fn read_root<'on_disk>(
-    on_disk: &'on_disk [u8],
-) -> Result<&'on_disk Root, DirstateV2ParseError> {
-    // Find the `Root` at the end of the given slice
-    let root_offset = on_disk
-        .len()
-        .checked_sub(std::mem::size_of::<Root>())
-        // A non-empty slice too short is an error
-        .ok_or(DirstateV2ParseError)?;
-    let (root, _) = Root::from_bytes(&on_disk[root_offset..])
-        .map_err(|_| DirstateV2ParseError)?;
-    Ok(root)
-}
-
 pub(super) fn read<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
 ) -> Result<DirstateMap<'on_disk>, DirstateV2ParseError> {
     if on_disk.is_empty() {
         return Ok(DirstateMap::empty(on_disk));
     }
-    let root = read_root(on_disk)?;
-    let mut unreachable_bytes = root.unreachable_bytes.get();
-    // Each append writes a new `Root`, so it’s never reused
-    unreachable_bytes += std::mem::size_of::<Root>() as u32;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     let dirstate_map = DirstateMap {
         on_disk,
         root: dirstate_map::ChildNodes::OnDisk(read_nodes(
             on_disk,
-            root.root_nodes,
+            meta.root_nodes,
         )?),
-        nodes_with_entry_count: root.nodes_with_entry_count.get(),
-        nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(),
-        ignore_patterns_hash: root.ignore_patterns_hash,
-        unreachable_bytes,
+        nodes_with_entry_count: meta.nodes_with_entry_count.get(),
+        nodes_with_copy_source_count: meta.nodes_with_copy_source_count.get(),
+        ignore_patterns_hash: meta.ignore_patterns_hash,
+        unreachable_bytes: meta.unreachable_bytes.get(),
     };
     Ok(dirstate_map)
 }
@@ -530,9 +526,11 @@
 
 pub(crate) fn for_each_tracked_path<'on_disk>(
     on_disk: &'on_disk [u8],
+    metadata: &[u8],
     mut f: impl FnMut(&'on_disk HgPath),
 ) -> Result<(), DirstateV2ParseError> {
-    let root = read_root(on_disk)?;
+    let (meta, _) = TreeMetadata::from_bytes(metadata)
+        .map_err(|_| DirstateV2ParseError)?;
     fn recur<'on_disk>(
         on_disk: &'on_disk [u8],
         nodes: ChildNodes,
@@ -548,23 +546,23 @@
         }
         Ok(())
     }
-    recur(on_disk, root.root_nodes, &mut f)
+    recur(on_disk, meta.root_nodes, &mut f)
 }
 
-/// Returns new data together with whether that data should be appended to the
-/// existing data file whose content is at `dirstate_map.on_disk` (true),
-/// instead of written to a new data file (false).
+/// Returns new data and metadata, together with whether that data should be
+/// appended to the existing data file whose content is at
+/// `dirstate_map.on_disk` (true), instead of written to a new data file
+/// (false).
 pub(super) fn write(
     dirstate_map: &mut DirstateMap,
     can_append: bool,
-) -> Result<(Vec<u8>, bool), DirstateError> {
+) -> Result<(Vec<u8>, Vec<u8>, bool), DirstateError> {
     let append = can_append && dirstate_map.write_should_append();
 
     // This ignores the space for paths, and for nodes without an entry.
     // TODO: better estimate? Skip the `Vec` and write to a file directly?
-    let size_guess = std::mem::size_of::<Root>()
-        + std::mem::size_of::<Node>()
-            * dirstate_map.nodes_with_entry_count as usize;
+    let size_guess = std::mem::size_of::<Node>()
+        * dirstate_map.nodes_with_entry_count as usize;
 
     let mut writer = Writer {
         dirstate_map,
@@ -574,7 +572,7 @@
 
     let root_nodes = writer.write_nodes(dirstate_map.root.as_ref())?;
 
-    let root = Root {
+    let meta = TreeMetadata {
         root_nodes,
         nodes_with_entry_count: dirstate_map.nodes_with_entry_count.into(),
         nodes_with_copy_source_count: dirstate_map
@@ -583,8 +581,7 @@
         unreachable_bytes: dirstate_map.unreachable_bytes.into(),
         ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
     };
-    writer.out.extend(root.as_bytes());
-    Ok((writer.out, append))
+    Ok((writer.out, meta.as_bytes().to_vec(), append))
 }
 
 struct Writer<'dmap, 'on_disk> {