view rust/hg-core/src/operations/cat.rs @ 48225:0cc69017d47f

rhg: stop manifest traversal when no more files are needed Stopping the traversal early can skip a significant part of the manifest traversal, to avoid some of its cost. The worst-case benchmarks are favorable, as well. Running [hg cat] on the last file in the manifest of a large repo, I'm seeing a ~4ms improvement (150ms -> 146ms), so this time is now almost indistinguishable from the baseline ("brute force") implementation. Running [hg cat] on ~220 files together with the last file of the repo is further improved by ~5ms or so. I suspect the raw performance improvements are caused by splitting the manifest search and the file data access into separate phases, instead of interleaving them. Differential Revision: https://phab.mercurial-scm.org/D11616
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Tue, 05 Oct 2021 15:10:42 +0100
parents 6b5773f89183
children 1837663ac216
line wrap: on
line source

// list_tracked_files.rs
//
// Copyright 2020 Antoine Cezar <antoine.cezar@octobus.net>
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.

use crate::repo::Repo;
use crate::revlog::revlog::RevlogError;
use crate::revlog::Node;

use crate::utils::hg_path::HgPath;
use crate::utils::hg_path::HgPathBuf;

use itertools::put_back;
use itertools::PutBack;
use std::cmp::Ordering;

pub struct CatOutput {
    /// Whether any file in the manifest matched the paths given as CLI
    /// arguments
    pub found_any: bool,
    /// The contents of matching files, in manifest order
    pub concatenated: Vec<u8>,
    /// Which of the CLI arguments did not match any manifest file
    pub missing: Vec<HgPathBuf>,
    /// The node ID that the given revset was resolved to
    pub node: Node,
}

// Find an item in an iterator over a sorted collection.
fn find_item<'a, 'b, 'c, D, I: Iterator<Item = (&'a HgPath, D)>>(
    i: &mut PutBack<I>,
    needle: &'b HgPath,
) -> Option<I::Item> {
    loop {
        match i.next() {
            None => return None,
            Some(val) => match needle.as_bytes().cmp(val.0.as_bytes()) {
                Ordering::Less => {
                    i.put_back(val);
                    return None;
                }
                Ordering::Greater => continue,
                Ordering::Equal => return Some(val),
            },
        }
    }
}

fn find_files_in_manifest<
    'a,
    'b,
    D,
    I: Iterator<Item = (&'a HgPath, D)>,
    J: Iterator<Item = &'b HgPath>,
>(
    manifest: I,
    files: J,
) -> (Vec<(&'a HgPath, D)>, Vec<&'b HgPath>) {
    let mut manifest = put_back(manifest);
    let mut res = vec![];
    let mut missing = vec![];

    for file in files {
        match find_item(&mut manifest, file) {
            None => missing.push(file),
            Some(item) => res.push(item),
        }
    }
    return (res, missing);
}

/// Output the given revision of files
///
/// * `root`: Repository root
/// * `rev`: The revision to cat the files from.
/// * `files`: The files to output.
pub fn cat<'a>(
    repo: &Repo,
    revset: &str,
    mut files: Vec<HgPathBuf>,
) -> Result<CatOutput, RevlogError> {
    let rev = crate::revset::resolve_single(revset, repo)?;
    let manifest = repo.manifest_for_rev(rev)?;
    let node = *repo
        .changelog()?
        .node_from_rev(rev)
        .expect("should succeed when repo.manifest did");
    let mut bytes: Vec<u8> = vec![];
    let mut found_any = false;

    files.sort_unstable();

    let (found, missing) = find_files_in_manifest(
        manifest.files_with_nodes(),
        files.iter().map(|f| f.as_ref()),
    );

    for (manifest_file, node_bytes) in found {
        found_any = true;
        let file_log = repo.filelog(manifest_file)?;
        let file_node = Node::from_hex_for_repo(node_bytes)?;
        bytes.extend(file_log.data_for_node(file_node)?.data()?);
    }

    let missing: Vec<HgPathBuf> = missing
        .iter()
        .map(|file| (*file).to_owned())
        .collect();
    Ok(CatOutput {
        found_any,
        concatenated: bytes,
        missing,
        node,
    })
}