Mercurial > hg
view rust/hg-core/src/revlog/changelog.rs @ 51229:1b23aaf5eb7b
rust-index: optimize find_gca_candidates() on less than 8 revisions
This is expected to be by far the most common case, given that, e.g.,
merging involves using it on two revisions.
Using a `u8` as support for the bitset obviously divides the amount of
RAM needed by 8. To state the obvious, on a repository with 10 million
changesets, this spares 70MB. It is also possible that it'd be slightly
faster, because it is easier to allocate and provides better cache locality.
It is possible that some exhaustive listing of the traits implemented by
`u8` and `u64` would avoid the added duplication, but that can be done later
and would need a replacement for the `MAX` consts.
author | Georges Racinet <georges.racinet@octobus.net> |
---|---|
date | Fri, 20 Oct 2023 09:12:22 +0200 |
parents | 13f58ce70299 |
children | 508fd40dc86a |
line wrap: on
line source
use crate::errors::HgError; use crate::revlog::Revision; use crate::revlog::{Node, NodePrefix}; use crate::revlog::{Revlog, RevlogEntry, RevlogError}; use crate::utils::hg_path::HgPath; use crate::vfs::Vfs; use crate::{Graph, GraphError, RevlogOpenOptions, UncheckedRevision}; use itertools::Itertools; use std::ascii::escape_default; use std::borrow::Cow; use std::fmt::{Debug, Formatter}; /// A specialized `Revlog` to work with changelog data format. pub struct Changelog { /// The generic `revlog` format. pub(crate) revlog: Revlog, } impl Changelog { /// Open the `changelog` of a repository given by its root. pub fn open( store_vfs: &Vfs, options: RevlogOpenOptions, ) -> Result<Self, HgError> { let revlog = Revlog::open(store_vfs, "00changelog.i", None, options)?; Ok(Self { revlog }) } /// Return the `ChangelogRevisionData` for the given node ID. pub fn data_for_node( &self, node: NodePrefix, ) -> Result<ChangelogRevisionData, RevlogError> { let rev = self.revlog.rev_from_node(node)?; self.entry_for_checked_rev(rev)?.data() } /// Return the [`ChangelogEntry`] for the given revision number. pub fn entry_for_rev( &self, rev: UncheckedRevision, ) -> Result<ChangelogEntry, RevlogError> { let revlog_entry = self.revlog.get_entry(rev)?; Ok(ChangelogEntry { revlog_entry }) } /// Same as [`Self::entry_for_rev`] for checked revisions. fn entry_for_checked_rev( &self, rev: Revision, ) -> Result<ChangelogEntry, RevlogError> { let revlog_entry = self.revlog.get_entry_for_checked_rev(rev)?; Ok(ChangelogEntry { revlog_entry }) } /// Return the [`ChangelogRevisionData`] for the given revision number. /// /// This is a useful shortcut in case the caller does not need the /// generic revlog information (parents, hashes etc). Otherwise /// consider taking a [`ChangelogEntry`] with /// [entry_for_rev](`Self::entry_for_rev`) and doing everything from there. pub fn data_for_rev( &self, rev: UncheckedRevision, ) -> Result<ChangelogRevisionData, RevlogError> { self.entry_for_rev(rev)?.data() } pub fn node_from_rev(&self, rev: UncheckedRevision) -> Option<&Node> { self.revlog.node_from_rev(rev) } pub fn rev_from_node( &self, node: NodePrefix, ) -> Result<Revision, RevlogError> { self.revlog.rev_from_node(node) } } impl Graph for Changelog { fn parents(&self, rev: Revision) -> Result<[Revision; 2], GraphError> { self.revlog.parents(rev) } } /// A specialized `RevlogEntry` for `changelog` data format /// /// This is a `RevlogEntry` with the added semantics that the associated /// data should meet the requirements for `changelog`, materialized by /// the fact that `data()` constructs a `ChangelogRevisionData`. /// In case that promise would be broken, the `data` method returns an error. #[derive(Clone)] pub struct ChangelogEntry<'changelog> { /// Same data, as a generic `RevlogEntry`. pub(crate) revlog_entry: RevlogEntry<'changelog>, } impl<'changelog> ChangelogEntry<'changelog> { pub fn data<'a>( &'a self, ) -> Result<ChangelogRevisionData<'changelog>, RevlogError> { let bytes = self.revlog_entry.data()?; if bytes.is_empty() { Ok(ChangelogRevisionData::null()) } else { Ok(ChangelogRevisionData::new(bytes).map_err(|err| { RevlogError::Other(HgError::CorruptedRepository(format!( "Invalid changelog data for revision {}: {:?}", self.revlog_entry.revision(), err ))) })?) } } /// Obtain a reference to the underlying `RevlogEntry`. /// /// This allows the caller to access the information that is common /// to all revlog entries: revision number, node id, parent revisions etc. pub fn as_revlog_entry(&self) -> &RevlogEntry { &self.revlog_entry } pub fn p1_entry(&self) -> Result<Option<ChangelogEntry>, RevlogError> { Ok(self .revlog_entry .p1_entry()? .map(|revlog_entry| Self { revlog_entry })) } pub fn p2_entry(&self) -> Result<Option<ChangelogEntry>, RevlogError> { Ok(self .revlog_entry .p2_entry()? .map(|revlog_entry| Self { revlog_entry })) } } /// `Changelog` entry which knows how to interpret the `changelog` data bytes. #[derive(PartialEq)] pub struct ChangelogRevisionData<'changelog> { /// The data bytes of the `changelog` entry. bytes: Cow<'changelog, [u8]>, /// The end offset for the hex manifest (not including the newline) manifest_end: usize, /// The end offset for the user+email (not including the newline) user_end: usize, /// The end offset for the timestamp+timezone+extras (not including the /// newline) timestamp_end: usize, /// The end offset for the file list (not including the newline) files_end: usize, } impl<'changelog> ChangelogRevisionData<'changelog> { fn new(bytes: Cow<'changelog, [u8]>) -> Result<Self, HgError> { let mut line_iter = bytes.split(|b| b == &b'\n'); let manifest_end = line_iter .next() .expect("Empty iterator from split()?") .len(); let user_slice = line_iter.next().ok_or_else(|| { HgError::corrupted("Changeset data truncated after manifest line") })?; let user_end = manifest_end + 1 + user_slice.len(); let timestamp_slice = line_iter.next().ok_or_else(|| { HgError::corrupted("Changeset data truncated after user line") })?; let timestamp_end = user_end + 1 + timestamp_slice.len(); let mut files_end = timestamp_end + 1; loop { let line = line_iter.next().ok_or_else(|| { HgError::corrupted("Changeset data truncated in files list") })?; if line.is_empty() { if files_end == bytes.len() { // The list of files ended with a single newline (there // should be two) return Err(HgError::corrupted( "Changeset data truncated after files list", )); } files_end -= 1; break; } files_end += line.len() + 1; } Ok(Self { bytes, manifest_end, user_end, timestamp_end, files_end, }) } fn null() -> Self { Self::new(Cow::Borrowed( b"0000000000000000000000000000000000000000\n\n0 0\n\n", )) .unwrap() } /// Return an iterator over the lines of the entry. pub fn lines(&self) -> impl Iterator<Item = &[u8]> { self.bytes.split(|b| b == &b'\n') } /// Return the node id of the `manifest` referenced by this `changelog` /// entry. pub fn manifest_node(&self) -> Result<Node, HgError> { let manifest_node_hex = &self.bytes[..self.manifest_end]; Node::from_hex_for_repo(manifest_node_hex) } /// The full user string (usually a name followed by an email enclosed in /// angle brackets) pub fn user(&self) -> &[u8] { &self.bytes[self.manifest_end + 1..self.user_end] } /// The full timestamp line (timestamp in seconds, offset in seconds, and /// possibly extras) // TODO: We should expose this in a more useful way pub fn timestamp_line(&self) -> &[u8] { &self.bytes[self.user_end + 1..self.timestamp_end] } /// The files changed in this revision. pub fn files(&self) -> impl Iterator<Item = &HgPath> { self.bytes[self.timestamp_end + 1..self.files_end] .split(|b| b == &b'\n') .map(HgPath::new) } /// The change description. pub fn description(&self) -> &[u8] { &self.bytes[self.files_end + 2..] } } impl Debug for ChangelogRevisionData<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.debug_struct("ChangelogRevisionData") .field("bytes", &debug_bytes(&self.bytes)) .field("manifest", &debug_bytes(&self.bytes[..self.manifest_end])) .field( "user", &debug_bytes( &self.bytes[self.manifest_end + 1..self.user_end], ), ) .field( "timestamp", &debug_bytes( &self.bytes[self.user_end + 1..self.timestamp_end], ), ) .field( "files", &debug_bytes( &self.bytes[self.timestamp_end + 1..self.files_end], ), ) .field( "description", &debug_bytes(&self.bytes[self.files_end + 2..]), ) .finish() } } fn debug_bytes(bytes: &[u8]) -> String { String::from_utf8_lossy( &bytes.iter().flat_map(|b| escape_default(*b)).collect_vec(), ) .to_string() } #[cfg(test)] mod tests { use super::*; use crate::vfs::Vfs; use crate::NULL_REVISION; use pretty_assertions::assert_eq; #[test] fn test_create_changelogrevisiondata_invalid() { // Completely empty assert!(ChangelogRevisionData::new(Cow::Borrowed(b"abcd")).is_err()); // No newline after manifest assert!(ChangelogRevisionData::new(Cow::Borrowed(b"abcd")).is_err()); // No newline after user assert!(ChangelogRevisionData::new(Cow::Borrowed(b"abcd\n")).is_err()); // No newline after timestamp assert!( ChangelogRevisionData::new(Cow::Borrowed(b"abcd\n\n0 0")).is_err() ); // Missing newline after files assert!(ChangelogRevisionData::new(Cow::Borrowed( b"abcd\n\n0 0\nfile1\nfile2" )) .is_err(),); // Only one newline after files assert!(ChangelogRevisionData::new(Cow::Borrowed( b"abcd\n\n0 0\nfile1\nfile2\n" )) .is_err(),); } #[test] fn test_create_changelogrevisiondata() { let data = ChangelogRevisionData::new(Cow::Borrowed( b"0123456789abcdef0123456789abcdef01234567 Some One <someone@example.com> 0 0 file1 file2 some commit message", )) .unwrap(); assert_eq!( data.manifest_node().unwrap(), Node::from_hex("0123456789abcdef0123456789abcdef01234567") .unwrap() ); assert_eq!(data.user(), b"Some One <someone@example.com>"); assert_eq!(data.timestamp_line(), b"0 0"); assert_eq!( data.files().collect_vec(), vec![HgPath::new("file1"), HgPath::new("file2")] ); assert_eq!(data.description(), b"some\ncommit\nmessage"); } #[test] fn test_data_from_rev_null() -> Result<(), RevlogError> { // an empty revlog will be enough for this case let temp = tempfile::tempdir().unwrap(); let vfs = Vfs { base: temp.path() }; std::fs::write(temp.path().join("foo.i"), b"").unwrap(); let revlog = Revlog::open(&vfs, "foo.i", None, RevlogOpenOptions::new()) .unwrap(); let changelog = Changelog { revlog }; assert_eq!( changelog.data_for_rev(NULL_REVISION.into())?, ChangelogRevisionData::null() ); // same with the intermediate entry object assert_eq!( changelog.entry_for_rev(NULL_REVISION.into())?.data()?, ChangelogRevisionData::null() ); Ok(()) } }