Mercurial > hg
changeset 47681:d94118365ec5
dirstate-v2: Add heuristic for when to create a new data file
… instead of appending to the existing one. This is based on keeping track
of how much of the existing data is not used anymore.
Differential Revision: https://phab.mercurial-scm.org/D11097
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Thu, 08 Jul 2021 19:23:44 +0200 |
parents | a8b0f29dc0d7 |
children | 78f7f0d490ee |
files | rust/hg-core/src/dirstate_tree/dirstate_map.rs rust/hg-core/src/dirstate_tree/on_disk.rs |
diffstat | 2 files changed, 85 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs Thu Jul 15 10:31:43 2021 +0200 +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs Thu Jul 08 19:23:44 2021 +0200 @@ -29,6 +29,10 @@ use crate::StatusError; use crate::StatusOptions; +/// Append to an existing data file if the amount of unreachable data (not used +/// anymore) is less than this fraction of the total amount of existing data. +const ACCEPTABLE_UNREACHABLE_BYTES_RATIO: f32 = 0.5; + pub struct DirstateMap<'on_disk> { /// Contents of the `.hg/dirstate` file pub(super) on_disk: &'on_disk [u8], @@ -44,6 +48,9 @@ /// See on_disk::Header pub(super) ignore_patterns_hash: on_disk::IgnorePatternsHash, + + /// How many bytes of `on_disk` are not used anymore + pub(super) unreachable_bytes: u32, } /// Using a plain `HgPathBuf` of the full path from the repository root as a @@ -118,9 +125,10 @@ } } - pub(super) fn make_mut( + fn make_mut( &mut self, on_disk: &'on_disk [u8], + unreachable_bytes: &mut u32, ) -> Result< &mut FastHashMap<NodeKey<'on_disk>, Node<'on_disk>>, DirstateV2ParseError, @@ -128,6 +136,8 @@ match self { ChildNodes::InMemory(nodes) => Ok(nodes), ChildNodes::OnDisk(nodes) => { + *unreachable_bytes += + std::mem::size_of_val::<[on_disk::Node]>(nodes) as u32; let nodes = nodes .iter() .map(|node| { @@ -406,6 +416,7 @@ nodes_with_entry_count: 0, nodes_with_copy_source_count: 0, ignore_patterns_hash: [0; on_disk::IGNORE_PATTERNS_HASH_LEN], + unreachable_bytes: 0, } } @@ -436,6 +447,7 @@ let tracked = entry.state.is_tracked(); let node = Self::get_or_insert_node( map.on_disk, + &mut map.unreachable_bytes, &mut map.root, path, WithBasename::to_cow_borrowed, @@ -472,18 +484,8 @@ /// append to the existing data file that contains `self.on_disk` (true), /// or create a new data file from scratch (false). pub(super) fn write_should_append(&self) -> bool { - // Soon this will be a heuristic based on the amount of unreachable - // data. For now it’s pseudo-random in order to make tests exercise - // both code paths. - - fn bad_rng() -> u32 { - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap() - .subsec_millis() - } - - bad_rng() % 2 == 0 + let ratio = self.unreachable_bytes as f32 / self.on_disk.len() as f32; + ratio < ACCEPTABLE_UNREACHABLE_BYTES_RATIO } fn get_node<'tree>( @@ -514,6 +516,7 @@ /// other fields while the returned borrow is still valid fn get_node_mut<'tree>( on_disk: &'on_disk [u8], + unreachable_bytes: &mut u32, root: &'tree mut ChildNodes<'on_disk>, path: &HgPath, ) -> Result<Option<&'tree mut Node<'on_disk>>, DirstateV2ParseError> { @@ -522,7 +525,9 @@ let mut component = components.next().expect("expected at least one components"); loop { - if let Some(child) = children.make_mut(on_disk)?.get_mut(component) + if let Some(child) = children + .make_mut(on_disk, unreachable_bytes)? + .get_mut(component) { if let Some(next_component) = components.next() { component = next_component; @@ -542,6 +547,7 @@ ) -> Result<&'tree mut Node<'on_disk>, DirstateV2ParseError> { Self::get_or_insert_node( self.on_disk, + &mut self.unreachable_bytes, &mut self.root, path, WithBasename::to_cow_owned, @@ -549,8 +555,9 @@ ) } - pub(super) fn get_or_insert_node<'tree, 'path>( + fn get_or_insert_node<'tree, 'path>( on_disk: &'on_disk [u8], + unreachable_bytes: &mut u32, root: &'tree mut ChildNodes<'on_disk>, path: &'path HgPath, to_cow: impl Fn( @@ -569,7 +576,7 @@ // map already contains that key, without introducing double // lookup? let child_node = child_nodes - .make_mut(on_disk)? + .make_mut(on_disk, unreachable_bytes)? .entry(to_cow(ancestor_path)) .or_default(); if let Some(next) = inclusive_ancestor_paths.next() { @@ -598,6 +605,7 @@ let node = Self::get_or_insert_node( self.on_disk, + &mut self.unreachable_bytes, &mut self.root, path, WithBasename::to_cow_owned, @@ -681,6 +689,7 @@ for path in paths { if let Some(node) = Self::get_node_mut( self.on_disk, + &mut self.unreachable_bytes, &mut self.root, path.as_ref(), )? { @@ -712,6 +721,12 @@ Ok(None) }) } + + fn count_dropped_path(unreachable_bytes: &mut u32, path: &Cow<HgPath>) { + if let Cow::Borrowed(path) = path { + *unreachable_bytes += path.len() as u32 + } + } } /// Like `Iterator::filter_map`, but over a fallible iterator of `Result`s. @@ -844,13 +859,14 @@ /// removed a node in `nodes`. fn recur<'on_disk>( on_disk: &'on_disk [u8], + unreachable_bytes: &mut u32, nodes: &mut ChildNodes<'on_disk>, path: &HgPath, ) -> Result<Option<(Dropped, bool)>, DirstateV2ParseError> { let (first_path_component, rest_of_path) = path.split_first_component(); - let node = if let Some(node) = - nodes.make_mut(on_disk)?.get_mut(first_path_component) + let nodes = nodes.make_mut(on_disk, unreachable_bytes)?; + let node = if let Some(node) = nodes.get_mut(first_path_component) { node } else { @@ -858,9 +874,12 @@ }; let dropped; if let Some(rest) = rest_of_path { - if let Some((d, removed)) = - recur(on_disk, &mut node.children, rest)? - { + if let Some((d, removed)) = recur( + on_disk, + unreachable_bytes, + &mut node.children, + rest, + )? { dropped = d; if dropped.had_entry { node.descendants_with_entry_count -= 1; @@ -884,6 +903,9 @@ if had_entry { node.data = NodeData::None } + if let Some(source) = &node.copy_source { + DirstateMap::count_dropped_path(unreachable_bytes, source) + } dropped = Dropped { was_tracked: node .data @@ -899,14 +921,22 @@ && node.copy_source.is_none() && node.children.is_empty(); if remove { - nodes.make_mut(on_disk)?.remove(first_path_component); + let (key, _) = + nodes.remove_entry(first_path_component).unwrap(); + DirstateMap::count_dropped_path( + unreachable_bytes, + key.full_path(), + ) } Ok(Some((dropped, remove))) } - if let Some((dropped, _removed)) = - recur(self.on_disk, &mut self.root, filename)? - { + if let Some((dropped, _removed)) = recur( + self.on_disk, + &mut self.unreachable_bytes, + &mut self.root, + filename, + )? { if dropped.had_entry { self.nodes_with_entry_count -= 1 } @@ -926,9 +956,12 @@ now: i32, ) -> Result<(), DirstateV2ParseError> { for filename in filenames { - if let Some(node) = - Self::get_node_mut(self.on_disk, &mut self.root, &filename)? - { + if let Some(node) = Self::get_node_mut( + self.on_disk, + &mut self.unreachable_bytes, + &mut self.root, + &filename, + )? { if let NodeData::Entry(entry) = &mut node.data { entry.clear_ambiguous_mtime(now); } @@ -1144,16 +1177,20 @@ key: &HgPath, ) -> Result<Option<HgPathBuf>, DirstateV2ParseError> { let count = &mut self.nodes_with_copy_source_count; - Ok( - Self::get_node_mut(self.on_disk, &mut self.root, key)?.and_then( - |node| { - if node.copy_source.is_some() { - *count -= 1 - } - node.copy_source.take().map(Cow::into_owned) - }, - ), - ) + let unreachable_bytes = &mut self.unreachable_bytes; + Ok(Self::get_node_mut( + self.on_disk, + unreachable_bytes, + &mut self.root, + key, + )? + .and_then(|node| { + if let Some(source) = &node.copy_source { + *count -= 1; + Self::count_dropped_path(unreachable_bytes, source); + } + node.copy_source.take().map(Cow::into_owned) + })) } fn copy_map_insert( @@ -1163,6 +1200,7 @@ ) -> Result<Option<HgPathBuf>, DirstateV2ParseError> { let node = Self::get_or_insert_node( self.on_disk, + &mut self.unreachable_bytes, &mut self.root, &key, WithBasename::to_cow_owned,
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Thu Jul 15 10:31:43 2021 +0200 +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Thu Jul 08 19:23:44 2021 +0200 @@ -73,6 +73,9 @@ nodes_with_entry_count: Size, nodes_with_copy_source_count: Size, + /// How many bytes of this data file are not used anymore + unreachable_bytes: Size, + /// If non-zero, a hash of ignore files that were used for some previous /// run of the `status` algorithm. /// @@ -205,7 +208,7 @@ /// Make sure that size-affecting changes are made knowingly fn _static_assert_size_of() { let _ = std::mem::transmute::<DocketHeader, [u8; 81]>; - let _ = std::mem::transmute::<Root, [u8; 36]>; + let _ = std::mem::transmute::<Root, [u8; 40]>; let _ = std::mem::transmute::<Node, [u8; 43]>; } @@ -283,6 +286,9 @@ return Ok(DirstateMap::empty(on_disk)); } let root = read_root(on_disk)?; + let mut unreachable_bytes = root.unreachable_bytes.get(); + // Each append writes a new `Root`, so it’s never reused + unreachable_bytes += std::mem::size_of::<Root>() as u32; let dirstate_map = DirstateMap { on_disk, root: dirstate_map::ChildNodes::OnDisk(read_nodes( @@ -292,6 +298,7 @@ nodes_with_entry_count: root.nodes_with_entry_count.get(), nodes_with_copy_source_count: root.nodes_with_copy_source_count.get(), ignore_patterns_hash: root.ignore_patterns_hash, + unreachable_bytes, }; Ok(dirstate_map) } @@ -573,6 +580,7 @@ nodes_with_copy_source_count: dirstate_map .nodes_with_copy_source_count .into(), + unreachable_bytes: dirstate_map.unreachable_bytes.into(), ignore_patterns_hash: dirstate_map.ignore_patterns_hash, }; writer.out.extend(root.as_bytes());