Mercurial > hg
changeset 49064:95da3e99cbd8
rust-changelog: start parsing changeset data
This patch makes `ChangelogRevisionData` do some coarse, line-level
splitting of the changeset data into manifest node, user, timestamp,
files list, and description. There are no (in-tree) users of these
functions yet, but I've added tests to prevent regressions. We'll
surely add callers at some point.
Differential Revision: https://phab.mercurial-scm.org/D12439
author | Martin von Zweigbergk <martinvonz@google.com> |
---|---|
date | Tue, 05 Apr 2022 08:47:04 -0700 |
parents | cc132255261b |
children | 5d205e476057 |
files | rust/hg-core/src/revlog/changelog.rs |
diffstat | 1 files changed, 183 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/rust/hg-core/src/revlog/changelog.rs	Mon Apr 04 23:27:16 2022 -0700
+++ b/rust/hg-core/src/revlog/changelog.rs	Tue Apr 05 08:47:04 2022 -0700
@@ -3,6 +3,10 @@
 use crate::revlog::revlog::{Revlog, RevlogError};
 use crate::revlog::Revision;
 use crate::revlog::{Node, NodePrefix};
+use crate::utils::hg_path::HgPath;
+use itertools::Itertools;
+use std::ascii::escape_default;
+use std::fmt::{Debug, Formatter};
 
 /// A specialized `Revlog` to work with `changelog` data format.
 pub struct Changelog {
@@ -35,7 +39,12 @@
         if bytes.is_empty() {
             Ok(ChangelogRevisionData::null())
         } else {
-            Ok(ChangelogRevisionData::new(bytes))
+            Ok(ChangelogRevisionData::new(bytes).map_err(|err| {
+                RevlogError::Other(HgError::CorruptedRepository(format!(
+                    "Invalid changelog data for revision {}: {:?}",
+                    rev, err
+                )))
+            })?)
         }
     }
 
@@ -45,21 +54,69 @@
 }
 
 /// `Changelog` entry which knows how to interpret the `changelog` data bytes.
-#[derive(Debug)]
+#[derive(PartialEq)]
 pub struct ChangelogRevisionData {
     /// The data bytes of the `changelog` entry.
     bytes: Vec<u8>,
+    /// The end offset for the hex manifest (not including the newline)
+    manifest_end: usize,
+    /// The end offset for the user+email (not including the newline)
+    user_end: usize,
+    /// The end offset for the timestamp+timezone+extras (not including the
+    /// newline)
+    timestamp_end: usize,
+    /// The end offset for the file list (not including the newline)
+    files_end: usize,
 }
 
 impl ChangelogRevisionData {
-    fn new(bytes: Vec<u8>) -> Self {
-        Self { bytes }
+    fn new(bytes: Vec<u8>) -> Result<Self, HgError> {
+        let mut line_iter = bytes.split(|b| b == &b'\n');
+        let manifest_end = line_iter
+            .next()
+            .expect("Empty iterator from split()?")
+            .len();
+        let user_slice = line_iter.next().ok_or_else(|| {
+            HgError::corrupted("Changeset data truncated after manifest line")
+        })?;
+        let user_end = manifest_end + 1 + user_slice.len();
+        let timestamp_slice = line_iter.next().ok_or_else(|| {
+            HgError::corrupted("Changeset data truncated after user line")
+        })?;
+        let timestamp_end = user_end + 1 + timestamp_slice.len();
+        let mut files_end = timestamp_end + 1;
+        loop {
+            let line = line_iter.next().ok_or_else(|| {
+                HgError::corrupted("Changeset data truncated in files list")
+            })?;
+            if line.is_empty() {
+                if files_end == bytes.len() {
+                    // The list of files ended with a single newline (there
+                    // should be two)
+                    return Err(HgError::corrupted(
+                        "Changeset data truncated after files list",
+                    ));
+                }
+                files_end -= 1;
+                break;
+            }
+            files_end += line.len() + 1;
+        }
+
+        Ok(Self {
+            bytes,
+            manifest_end,
+            user_end,
+            timestamp_end,
+            files_end,
+        })
     }
 
     fn null() -> Self {
         Self::new(
             b"0000000000000000000000000000000000000000\n\n0 0\n\n".to_vec(),
         )
+        .unwrap()
     }
 
     /// Return an iterator over the lines of the entry.
@@ -70,8 +127,128 @@
     /// Return the node id of the `manifest` referenced by this `changelog`
     /// entry.
     pub fn manifest_node(&self) -> Result<Node, HgError> {
-        let manifest_node_hex =
-            self.lines().next().expect("Empty iterator from split()?");
+        let manifest_node_hex = &self.bytes[..self.manifest_end];
         Node::from_hex_for_repo(manifest_node_hex)
     }
+
+    /// The full user string (usually a name followed by an email enclosed in
+    /// angle brackets)
+    pub fn user(&self) -> &[u8] {
+        &self.bytes[self.manifest_end + 1..self.user_end]
+    }
+
+    /// The full timestamp line (timestamp in seconds, offset in seconds, and
+    /// possibly extras)
+    // TODO: We should expose this in a more useful way
+    pub fn timestamp_line(&self) -> &[u8] {
+        &self.bytes[self.user_end + 1..self.timestamp_end]
+    }
+
+    /// The files changed in this revision.
+    pub fn files(&self) -> impl Iterator<Item = &HgPath> {
+        self.bytes[self.timestamp_end + 1..self.files_end]
+            .split(|b| b == &b'\n')
+            .map(|path| HgPath::new(path))
+    }
+
+    /// The change description.
+    pub fn description(&self) -> &[u8] {
+        &self.bytes[self.files_end + 2..]
+    }
 }
+
+impl Debug for ChangelogRevisionData {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("ChangelogRevisionData")
+            .field("bytes", &debug_bytes(&self.bytes))
+            .field("manifest", &debug_bytes(&self.bytes[..self.manifest_end]))
+            .field(
+                "user",
+                &debug_bytes(
+                    &self.bytes[self.manifest_end + 1..self.user_end],
+                ),
+            )
+            .field(
+                "timestamp",
+                &debug_bytes(
+                    &self.bytes[self.user_end + 1..self.timestamp_end],
+                ),
+            )
+            .field(
+                "files",
+                &debug_bytes(
+                    &self.bytes[self.timestamp_end + 1..self.files_end],
+                ),
+            )
+            .field(
+                "description",
+                &debug_bytes(&self.bytes[self.files_end + 2..]),
+            )
+            .finish()
+    }
+}
+
+fn debug_bytes(bytes: &[u8]) -> String {
+    String::from_utf8_lossy(
+        &bytes.iter().flat_map(|b| escape_default(*b)).collect_vec(),
+    )
+    .to_string()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use itertools::Itertools;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn test_create_changelogrevisiondata_invalid() {
+        // Completely empty
+        assert!(ChangelogRevisionData::new(b"abcd".to_vec()).is_err());
+        // No newline after manifest
+        assert!(ChangelogRevisionData::new(b"abcd".to_vec()).is_err());
+        // No newline after user
+        assert!(ChangelogRevisionData::new(b"abcd\n".to_vec()).is_err());
+        // No newline after timestamp
+        assert!(ChangelogRevisionData::new(b"abcd\n\n0 0".to_vec()).is_err());
+        // Missing newline after files
+        assert!(ChangelogRevisionData::new(
+            b"abcd\n\n0 0\nfile1\nfile2".to_vec()
+        )
+        .is_err(),);
+        // Only one newline after files
+        assert!(ChangelogRevisionData::new(
+            b"abcd\n\n0 0\nfile1\nfile2\n".to_vec()
+        )
+        .is_err(),);
+    }
+
+    #[test]
+    fn test_create_changelogrevisiondata() {
+        let data = ChangelogRevisionData::new(
+            b"0123456789abcdef0123456789abcdef01234567
+Some One <someone@example.com>
+0 0
+file1
+file2
+
+some
+commit
+message"
+            .to_vec(),
+        )
+        .unwrap();
+        assert_eq!(
+            data.manifest_node().unwrap(),
+            Node::from_hex("0123456789abcdef0123456789abcdef01234567")
+                .unwrap()
+        );
+        assert_eq!(data.user(), b"Some One <someone@example.com>");
+        assert_eq!(data.timestamp_line(), b"0 0");
+        assert_eq!(
+            data.files().collect_vec(),
+            vec![HgPath::new("file1"), HgPath::new("file2")]
+        );
+        assert_eq!(data.description(), b"some\ncommit\nmessage");
+    }
+}
```