Mercurial > hg
changeset 48225:0cc69017d47f
rhg: stop manifest traversal when no more files are needed
Stopping the traversal early can skip a significant part
of the manifest traversal, to avoid some of its cost.
The worst-case benchmarks are favorable, as well.
Running [hg cat] on the last file in the manifest of
a large repo, I'm seeing a ~4ms improvement (150ms -> 146ms),
so this time is now almost indistinguishable from the
baseline ("brute force") implementation.
Running [hg cat] on ~220 files together with the last file
of the repo is further improved by ~5ms or so.
I suspect the raw performance improvements are caused by splitting
the manifest search and the file data access into separate phases,
instead of interleaving them.
Differential Revision: https://phab.mercurial-scm.org/D11616
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Tue, 05 Oct 2021 15:10:42 +0100 |
parents | 6b5773f89183 |
children | 5e77bdc29d56 |
files | rust/hg-core/src/operations/cat.rs |
diffstat | 1 files changed, 59 insertions(+), 22 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/operations/cat.rs Mon Oct 04 19:06:45 2021 +0100 +++ b/rust/hg-core/src/operations/cat.rs Tue Oct 05 15:10:42 2021 +0100 @@ -9,10 +9,12 @@ use crate::revlog::revlog::RevlogError; use crate::revlog::Node; +use crate::utils::hg_path::HgPath; use crate::utils::hg_path::HgPathBuf; -use itertools::EitherOrBoth::{Both, Left, Right}; -use itertools::Itertools; +use itertools::put_back; +use itertools::PutBack; +use std::cmp::Ordering; pub struct CatOutput { /// Whether any file in the manifest matched the paths given as CLI @@ -26,6 +28,49 @@ pub node: Node, } +// Find an item in an iterator over a sorted collection. +fn find_item<'a, 'b, 'c, D, I: Iterator<Item = (&'a HgPath, D)>>( + i: &mut PutBack<I>, + needle: &'b HgPath, +) -> Option<I::Item> { + loop { + match i.next() { + None => return None, + Some(val) => match needle.as_bytes().cmp(val.0.as_bytes()) { + Ordering::Less => { + i.put_back(val); + return None; + } + Ordering::Greater => continue, + Ordering::Equal => return Some(val), + }, + } + } +} + +fn find_files_in_manifest< + 'a, + 'b, + D, + I: Iterator<Item = (&'a HgPath, D)>, + J: Iterator<Item = &'b HgPath>, +>( + manifest: I, + files: J, +) -> (Vec<(&'a HgPath, D)>, Vec<&'b HgPath>) { + let mut manifest = put_back(manifest); + let mut res = vec![]; + let mut missing = vec![]; + + for file in files { + match find_item(&mut manifest, file) { + None => missing.push(file), + Some(item) => res.push(item), + } + } + return (res, missing); +} + /// Output the given revision of files /// /// * `root`: Repository root @@ -42,34 +87,26 @@ .changelog()? .node_from_rev(rev) .expect("should succeed when repo.manifest did"); - let mut bytes = vec![]; + let mut bytes: Vec<u8> = vec![]; let mut found_any = false; + files.sort_unstable(); - let mut missing = vec![]; + let (found, missing) = find_files_in_manifest( + manifest.files_with_nodes(), + files.iter().map(|f| f.as_ref()), + ); - for entry in manifest - .files_with_nodes() - .merge_join_by(files.iter(), |(manifest_file, _), file| { - manifest_file.cmp(&file.as_ref()) - }) - { - match entry { - Left(_) => (), - Right(path) => missing.push(path), - Both((manifest_file, node_bytes), _) => { - found_any = true; - let file_log = repo.filelog(manifest_file)?; - let file_node = Node::from_hex_for_repo(node_bytes)?; - let entry = file_log.data_for_node(file_node)?; - bytes.extend(entry.data()?) - } - } + for (manifest_file, node_bytes) in found { + found_any = true; + let file_log = repo.filelog(manifest_file)?; + let file_node = Node::from_hex_for_repo(node_bytes)?; + bytes.extend(file_log.data_for_node(file_node)?.data()?); } let missing: Vec<HgPathBuf> = missing .iter() - .map(|file| (*(file.as_ref())).to_owned()) + .map(|file| (*file).to_owned()) .collect(); Ok(CatOutput { found_any,