Mercurial > hg-stable
view rust/hg-core/src/revlog/index.rs @ 51243:fc05dd74e907
rust-index: add support for `reachableroots2`
Exposition in `hg-cpython` done in regular impl block, again
for rustfmt support etc.
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Mon, 30 Oct 2023 11:57:36 +0100 |
parents | c817d9f626d3 |
children | 42c8dbdb88ad |
line wrap: on
line source
use std::collections::hash_map::RandomState; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; use std::ops::Deref; use std::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use byteorder::{BigEndian, ByteOrder}; use bytes_cast::{unaligned, BytesCast}; use super::REVIDX_KNOWN_FLAGS; use crate::errors::HgError; use crate::node::{NODE_BYTES_LENGTH, NULL_NODE, STORED_NODE_ID_BYTES}; use crate::revlog::node::Node; use crate::revlog::{Revision, NULL_REVISION}; use crate::{ dagops, BaseRevision, FastHashMap, Graph, GraphError, RevlogError, RevlogIndex, UncheckedRevision, }; pub const INDEX_ENTRY_SIZE: usize = 64; pub const COMPRESSION_MODE_INLINE: u8 = 2; #[derive(Debug)] pub struct IndexHeader { pub(super) header_bytes: [u8; 4], } #[derive(Copy, Clone)] pub struct IndexHeaderFlags { flags: u16, } /// Corresponds to the high bits of `_format_flags` in python impl IndexHeaderFlags { /// Corresponds to FLAG_INLINE_DATA in python pub fn is_inline(self) -> bool { self.flags & 1 != 0 } /// Corresponds to FLAG_GENERALDELTA in python pub fn uses_generaldelta(self) -> bool { self.flags & 2 != 0 } } /// Corresponds to the INDEX_HEADER structure, /// which is parsed as a `header` variable in `_loadindex` in `revlog.py` impl IndexHeader { fn format_flags(&self) -> IndexHeaderFlags { // No "unknown flags" check here, unlike in python. Maybe there should // be. IndexHeaderFlags { flags: BigEndian::read_u16(&self.header_bytes[0..2]), } } /// The only revlog version currently supported by rhg. const REVLOGV1: u16 = 1; /// Corresponds to `_format_version` in Python. fn format_version(&self) -> u16 { BigEndian::read_u16(&self.header_bytes[2..4]) } pub fn parse(index_bytes: &[u8]) -> Result<Option<IndexHeader>, HgError> { if index_bytes.is_empty() { return Ok(None); } if index_bytes.len() < 4 { return Err(HgError::corrupted( "corrupted revlog: can't read the index format header", )); } Ok(Some(IndexHeader { header_bytes: { let bytes: [u8; 4] = index_bytes[0..4].try_into().expect("impossible"); bytes }, })) } } /// Abstracts the access to the index bytes since they can be spread between /// the immutable (bytes) part and the mutable (added) part if any appends /// happened. This makes it transparent for the callers. struct IndexData { /// Immutable bytes, most likely taken from disk bytes: Box<dyn Deref<Target = [u8]> + Send>, /// Used when stripping index contents, keeps track of the start of the /// first stripped revision, which is used to give a slice of the /// `bytes` field. truncation: Option<usize>, /// Bytes that were added after reading the index added: Vec<u8>, } impl IndexData { pub fn new(bytes: Box<dyn Deref<Target = [u8]> + Send>) -> Self { Self { bytes, truncation: None, added: vec![], } } pub fn len(&self) -> usize { match self.truncation { Some(truncation) => truncation + self.added.len(), None => self.bytes.len() + self.added.len(), } } fn remove( &mut self, rev: Revision, offsets: Option<&[usize]>, ) -> Result<(), RevlogError> { let rev = rev.0 as usize; let truncation = if let Some(offsets) = offsets { offsets[rev] } else { rev * INDEX_ENTRY_SIZE }; if truncation < self.bytes.len() { self.truncation = Some(truncation); self.added.clear(); } else { self.added.truncate(truncation - self.bytes.len()); } Ok(()) } fn is_new(&self) -> bool { self.bytes.is_empty() } } impl std::ops::Index<std::ops::Range<usize>> for IndexData { type Output = [u8]; fn index(&self, index: std::ops::Range<usize>) -> &Self::Output { let start = index.start; let end = index.end; let immutable_len = match self.truncation { Some(truncation) => truncation, None => self.bytes.len(), }; if start < immutable_len { if end > immutable_len { panic!("index data cannot span existing and added ranges"); } &self.bytes[index] } else { &self.added[start - immutable_len..end - immutable_len] } } } #[derive(Debug, PartialEq, Eq)] pub struct RevisionDataParams { pub flags: u16, pub data_offset: u64, pub data_compressed_length: i32, pub data_uncompressed_length: i32, pub data_delta_base: i32, pub link_rev: i32, pub parent_rev_1: i32, pub parent_rev_2: i32, pub node_id: [u8; NODE_BYTES_LENGTH], pub _sidedata_offset: u64, pub _sidedata_compressed_length: i32, pub data_compression_mode: u8, pub _sidedata_compression_mode: u8, pub _rank: i32, } impl Default for RevisionDataParams { fn default() -> Self { Self { flags: 0, data_offset: 0, data_compressed_length: 0, data_uncompressed_length: 0, data_delta_base: -1, link_rev: -1, parent_rev_1: -1, parent_rev_2: -1, node_id: [0; NODE_BYTES_LENGTH], _sidedata_offset: 0, _sidedata_compressed_length: 0, data_compression_mode: COMPRESSION_MODE_INLINE, _sidedata_compression_mode: COMPRESSION_MODE_INLINE, _rank: -1, } } } #[derive(BytesCast)] #[repr(C)] pub struct RevisionDataV1 { data_offset_or_flags: unaligned::U64Be, data_compressed_length: unaligned::I32Be, data_uncompressed_length: unaligned::I32Be, data_delta_base: unaligned::I32Be, link_rev: unaligned::I32Be, parent_rev_1: unaligned::I32Be, parent_rev_2: unaligned::I32Be, node_id: [u8; STORED_NODE_ID_BYTES], } fn _static_assert_size_of_revision_data_v1() { let _ = std::mem::transmute::<RevisionDataV1, [u8; 64]>; } impl RevisionDataParams { pub fn validate(&self) -> Result<(), RevlogError> { if self.flags & !REVIDX_KNOWN_FLAGS != 0 { return Err(RevlogError::corrupted(format!( "unknown revlog index flags: {}", self.flags ))); } if self.data_compression_mode != COMPRESSION_MODE_INLINE { return Err(RevlogError::corrupted(format!( "invalid data compression mode: {}", self.data_compression_mode ))); } // FIXME isn't this only for v2 or changelog v2? if self._sidedata_compression_mode != COMPRESSION_MODE_INLINE { return Err(RevlogError::corrupted(format!( "invalid sidedata compression mode: {}", self._sidedata_compression_mode ))); } Ok(()) } pub fn into_v1(self) -> RevisionDataV1 { let data_offset_or_flags = self.data_offset << 16 | self.flags as u64; let mut node_id = [0; STORED_NODE_ID_BYTES]; node_id[..NODE_BYTES_LENGTH].copy_from_slice(&self.node_id); RevisionDataV1 { data_offset_or_flags: data_offset_or_flags.into(), data_compressed_length: self.data_compressed_length.into(), data_uncompressed_length: self.data_uncompressed_length.into(), data_delta_base: self.data_delta_base.into(), link_rev: self.link_rev.into(), parent_rev_1: self.parent_rev_1.into(), parent_rev_2: self.parent_rev_2.into(), node_id, } } } /// A Revlog index pub struct Index { bytes: IndexData, /// Offsets of starts of index blocks. /// Only needed when the index is interleaved with data. offsets: RwLock<Option<Vec<usize>>>, uses_generaldelta: bool, is_inline: bool, /// Cache of the head revisions in this index, kept in sync. Should /// be accessed via the [`Self::head_revs`] method. head_revs: Vec<Revision>, /// Cache of the last filtered revisions in this index, used to make sure /// we haven't changed filters when returning the cached `head_revs`. filtered_revs: HashSet<Revision>, } impl Debug for Index { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("Index") .field("offsets", &self.offsets) .field("uses_generaldelta", &self.uses_generaldelta) .finish() } } impl Graph for Index { fn parents(&self, rev: Revision) -> Result<[Revision; 2], GraphError> { let err = || GraphError::ParentOutOfRange(rev); match self.get_entry(rev) { Some(entry) => { // The C implementation checks that the parents are valid // before returning Ok([ self.check_revision(entry.p1()).ok_or_else(err)?, self.check_revision(entry.p2()).ok_or_else(err)?, ]) } None => Ok([NULL_REVISION, NULL_REVISION]), } } } /// A cache suitable for find_snapshots /// /// Logically equivalent to a mapping whose keys are [`BaseRevision`] and /// values sets of [`BaseRevision`] /// /// TODO the dubious part is insisting that errors must be RevlogError /// we would probably need to sprinkle some magic here, such as an associated /// type that would be Into<RevlogError> but even that would not be /// satisfactory, as errors potentially have nothing to do with the revlog. pub trait SnapshotsCache { fn insert_for( &mut self, rev: BaseRevision, value: BaseRevision, ) -> Result<(), RevlogError>; } impl SnapshotsCache for FastHashMap<BaseRevision, HashSet<BaseRevision>> { fn insert_for( &mut self, rev: BaseRevision, value: BaseRevision, ) -> Result<(), RevlogError> { let all_values = self.entry(rev).or_insert_with(HashSet::new); all_values.insert(value); Ok(()) } } impl Index { /// Create an index from bytes. /// Calculate the start of each entry when is_inline is true. pub fn new( bytes: Box<dyn Deref<Target = [u8]> + Send>, default_header: IndexHeader, ) -> Result<Self, HgError> { let header = IndexHeader::parse(bytes.as_ref())?.unwrap_or(default_header); if header.format_version() != IndexHeader::REVLOGV1 { // A proper new version should have had a repo/store // requirement. return Err(HgError::corrupted("unsupported revlog version")); } // This is only correct because we know version is REVLOGV1. // In v2 we always use generaldelta, while in v0 we never use // generaldelta. Similar for [is_inline] (it's only used in v1). let uses_generaldelta = header.format_flags().uses_generaldelta(); if header.format_flags().is_inline() { let mut offset: usize = 0; let mut offsets = Vec::new(); while offset + INDEX_ENTRY_SIZE <= bytes.len() { offsets.push(offset); let end = offset + INDEX_ENTRY_SIZE; let entry = IndexEntry { bytes: &bytes[offset..end], offset_override: None, }; offset += INDEX_ENTRY_SIZE + entry.compressed_len() as usize; } if offset == bytes.len() { Ok(Self { bytes: IndexData::new(bytes), offsets: RwLock::new(Some(offsets)), uses_generaldelta, is_inline: true, head_revs: vec![], filtered_revs: HashSet::new(), }) } else { Err(HgError::corrupted("unexpected inline revlog length")) } } else { Ok(Self { bytes: IndexData::new(bytes), offsets: RwLock::new(None), uses_generaldelta, is_inline: false, head_revs: vec![], filtered_revs: HashSet::new(), }) } } pub fn uses_generaldelta(&self) -> bool { self.uses_generaldelta } /// Value of the inline flag. pub fn is_inline(&self) -> bool { self.is_inline } /// Return a slice of bytes if `revlog` is inline. Panic if not. pub fn data(&self, start: usize, end: usize) -> &[u8] { if !self.is_inline() { panic!("tried to access data in the index of a revlog that is not inline"); } &self.bytes[start..end] } /// Return number of entries of the revlog index. pub fn len(&self) -> usize { if let Some(offsets) = &*self.get_offsets() { offsets.len() } else { self.bytes.len() / INDEX_ENTRY_SIZE } } pub fn get_offsets(&self) -> RwLockReadGuard<Option<Vec<usize>>> { if self.is_inline() { { // Wrap in a block to drop the read guard // TODO perf? let mut offsets = self.offsets.write().unwrap(); if offsets.is_none() { offsets.replace(inline_scan(&self.bytes.bytes).1); } } } self.offsets.read().unwrap() } pub fn get_offsets_mut(&mut self) -> RwLockWriteGuard<Option<Vec<usize>>> { let mut offsets = self.offsets.write().unwrap(); if self.is_inline() && offsets.is_none() { offsets.replace(inline_scan(&self.bytes.bytes).1); } offsets } /// Returns `true` if the `Index` has zero `entries`. pub fn is_empty(&self) -> bool { self.len() == 0 } /// Return the index entry corresponding to the given revision or `None` /// for [`NULL_REVISION`] /// /// The specified revision being of the checked type, it always exists /// if it was validated by this index. pub fn get_entry(&self, rev: Revision) -> Option<IndexEntry> { if rev == NULL_REVISION { return None; } Some(if let Some(offsets) = &*self.get_offsets() { self.get_entry_inline(rev, offsets.as_ref()) } else { self.get_entry_separated(rev) }) } /// Return the binary content of the index entry for the given revision /// /// See [get_entry()](`Self::get_entry()`) for cases when `None` is /// returned. pub fn entry_binary(&self, rev: Revision) -> Option<&[u8]> { self.get_entry(rev).map(|e| { let bytes = e.as_bytes(); if rev.0 == 0 { &bytes[4..] } else { bytes } }) } pub fn entry_as_params( &self, rev: UncheckedRevision, ) -> Option<RevisionDataParams> { let rev = self.check_revision(rev)?; self.get_entry(rev).map(|e| RevisionDataParams { flags: e.flags(), data_offset: if rev.0 == 0 && !self.bytes.is_new() { e.flags() as u64 } else { e.raw_offset() }, data_compressed_length: e.compressed_len().try_into().unwrap(), data_uncompressed_length: e.uncompressed_len(), data_delta_base: e.base_revision_or_base_of_delta_chain().0, link_rev: e.link_revision().0, parent_rev_1: e.p1().0, parent_rev_2: e.p2().0, node_id: e.hash().as_bytes().try_into().unwrap(), ..Default::default() }) } fn get_entry_inline( &self, rev: Revision, offsets: &[usize], ) -> IndexEntry { let start = offsets[rev.0 as usize]; let end = start + INDEX_ENTRY_SIZE; let bytes = &self.bytes[start..end]; // See IndexEntry for an explanation of this override. let offset_override = Some(end); IndexEntry { bytes, offset_override, } } fn get_entry_separated(&self, rev: Revision) -> IndexEntry { let start = rev.0 as usize * INDEX_ENTRY_SIZE; let end = start + INDEX_ENTRY_SIZE; let bytes = &self.bytes[start..end]; // Override the offset of the first revision as its bytes are used // for the index's metadata (saving space because it is always 0) let offset_override = if rev == Revision(0) { Some(0) } else { None }; IndexEntry { bytes, offset_override, } } fn null_entry(&self) -> IndexEntry { IndexEntry { bytes: &[0; INDEX_ENTRY_SIZE], offset_override: Some(0), } } /// Return the head revisions of this index pub fn head_revs(&mut self) -> Result<Vec<Revision>, GraphError> { self.head_revs_filtered(&HashSet::new()) } /// Return the head revisions of this index pub fn head_revs_filtered( &mut self, filtered_revs: &HashSet<Revision>, ) -> Result<Vec<Revision>, GraphError> { if !self.head_revs.is_empty() && filtered_revs == &self.filtered_revs { return Ok(self.head_revs.to_owned()); } let mut revs: HashSet<Revision, RandomState> = if filtered_revs.is_empty() { (0..self.len()) .into_iter() .map(|i| Revision(i as BaseRevision)) .collect() } else { (0..self.len()) .into_iter() .filter_map(|i| { let r = Revision(i as BaseRevision); if filtered_revs.contains(&r) { None } else { Some(r) } }) .collect() }; dagops::retain_heads(self, &mut revs)?; if self.is_empty() { revs.insert(NULL_REVISION); } let mut as_vec: Vec<Revision> = revs.into_iter().map(Into::into).collect(); as_vec.sort_unstable(); self.head_revs = as_vec.to_owned(); self.filtered_revs = filtered_revs.to_owned(); Ok(as_vec) } /// Obtain the delta chain for a revision. /// /// `stop_rev` specifies a revision to stop at. If not specified, we /// stop at the base of the chain. /// /// Returns a 2-tuple of (chain, stopped) where `chain` is a vec of /// revs in ascending order and `stopped` is a bool indicating whether /// `stoprev` was hit. pub fn delta_chain( &self, rev: Revision, stop_rev: Option<Revision>, ) -> Result<(Vec<Revision>, bool), HgError> { let mut current_rev = rev; let mut entry = self.get_entry(rev).unwrap(); let mut chain = vec![]; while current_rev.0 != entry.base_revision_or_base_of_delta_chain().0 && stop_rev.map(|r| r != current_rev).unwrap_or(true) { chain.push(current_rev); let new_rev = if self.uses_generaldelta() { entry.base_revision_or_base_of_delta_chain() } else { UncheckedRevision(current_rev.0 - 1) }; current_rev = self.check_revision(new_rev).ok_or_else(|| { HgError::corrupted(format!("Revision {new_rev} out of range")) })?; if current_rev.0 == NULL_REVISION.0 { break; } entry = self.get_entry(current_rev).unwrap() } let stopped = if stop_rev.map(|r| current_rev == r).unwrap_or(false) { true } else { chain.push(current_rev); false }; chain.reverse(); Ok((chain, stopped)) } pub fn find_snapshots( &self, start_rev: UncheckedRevision, end_rev: UncheckedRevision, cache: &mut impl SnapshotsCache, ) -> Result<(), RevlogError> { let mut start_rev = start_rev.0; let mut end_rev = end_rev.0; end_rev += 1; let len = self.len().try_into().unwrap(); if end_rev > len { end_rev = len; } if start_rev < 0 { start_rev = 0; } for rev in start_rev..end_rev { if !self.is_snapshot_unchecked(Revision(rev))? { continue; } let mut base = self .get_entry(Revision(rev)) .unwrap() .base_revision_or_base_of_delta_chain(); if base.0 == rev { base = NULL_REVISION.into(); } cache.insert_for(base.0, rev)?; } Ok(()) } /// TODO move this to the trait probably, along with other things pub fn append( &mut self, revision_data: RevisionDataParams, ) -> Result<(), RevlogError> { revision_data.validate()?; let new_offset = self.bytes.len(); if let Some(offsets) = &mut *self.get_offsets_mut() { offsets.push(new_offset) } self.bytes.added.extend(revision_data.into_v1().as_bytes()); self.head_revs.clear(); Ok(()) } pub fn pack_header(&self, header: i32) -> [u8; 4] { header.to_be_bytes() } pub fn remove(&mut self, rev: Revision) -> Result<(), RevlogError> { let offsets = self.get_offsets().clone(); self.bytes.remove(rev, offsets.as_deref())?; if let Some(offsets) = &mut *self.get_offsets_mut() { offsets.truncate(rev.0 as usize) } self.head_revs.clear(); Ok(()) } pub fn clear_caches(&mut self) { // We need to get the 'inline' value from Python at init and use this // instead of offsets to determine whether we're inline since we might // clear caches. This implies re-populating the offsets on-demand. self.offsets = RwLock::new(None); self.head_revs.clear(); } /// Unchecked version of `is_snapshot`. /// Assumes the caller checked that `rev` is within a valid revision range. pub fn is_snapshot_unchecked( &self, mut rev: Revision, ) -> Result<bool, RevlogError> { while rev.0 >= 0 { let entry = self.get_entry(rev).unwrap(); let mut base = entry.base_revision_or_base_of_delta_chain().0; if base == rev.0 { base = NULL_REVISION.0; } if base == NULL_REVISION.0 { return Ok(true); } let [mut p1, mut p2] = self .parents(rev) .map_err(|_| RevlogError::InvalidRevision)?; while let Some(p1_entry) = self.get_entry(p1) { if p1_entry.compressed_len() != 0 || p1.0 == 0 { break; } let parent_base = p1_entry.base_revision_or_base_of_delta_chain(); if parent_base.0 == p1.0 { break; } p1 = self .check_revision(parent_base) .ok_or(RevlogError::InvalidRevision)?; } while let Some(p2_entry) = self.get_entry(p2) { if p2_entry.compressed_len() != 0 || p2.0 == 0 { break; } let parent_base = p2_entry.base_revision_or_base_of_delta_chain(); if parent_base.0 == p2.0 { break; } p2 = self .check_revision(parent_base) .ok_or(RevlogError::InvalidRevision)?; } if base == p1.0 || base == p2.0 { return Ok(false); } rev = self .check_revision(base.into()) .ok_or(RevlogError::InvalidRevision)?; } Ok(rev == NULL_REVISION) } /// Return whether the given revision is a snapshot. Returns an error if /// `rev` is not within a valid revision range. pub fn is_snapshot( &self, rev: UncheckedRevision, ) -> Result<bool, RevlogError> { let rev = self .check_revision(rev) .ok_or_else(|| RevlogError::corrupted("test"))?; self.is_snapshot_unchecked(rev) } /// Slice revs to reduce the amount of unrelated data to be read from disk. /// /// The index is sliced into groups that should be read in one time. /// /// The initial chunk is sliced until the overall density /// (payload/chunks-span ratio) is above `target_density`. /// No gap smaller than `min_gap_size` is skipped. pub fn slice_chunk_to_density( &self, revs: &[Revision], target_density: f64, min_gap_size: usize, ) -> Vec<Vec<Revision>> { if revs.is_empty() { return vec![]; } if revs.len() == 1 { return vec![revs.to_owned()]; } let delta_chain_span = self.segment_span(revs); if delta_chain_span < min_gap_size { return vec![revs.to_owned()]; } let entries: Vec<_> = revs .iter() .map(|r| { (*r, self.get_entry(*r).unwrap_or_else(|| self.null_entry())) }) .collect(); let mut read_data = delta_chain_span; let chain_payload: u32 = entries.iter().map(|(_r, e)| e.compressed_len()).sum(); let mut density = if delta_chain_span > 0 { chain_payload as f64 / delta_chain_span as f64 } else { 1.0 }; if density >= target_density { return vec![revs.to_owned()]; } // Store the gaps in a heap to have them sorted by decreasing size let mut gaps = Vec::new(); let mut previous_end = None; for (i, (_rev, entry)) in entries.iter().enumerate() { let start = entry.c_start() as usize; let length = entry.compressed_len(); // Skip empty revisions to form larger holes if length == 0 { continue; } if let Some(end) = previous_end { let gap_size = start - end; // Only consider holes that are large enough if gap_size > min_gap_size { gaps.push((gap_size, i)); } } previous_end = Some(start + length as usize); } if gaps.is_empty() { return vec![revs.to_owned()]; } // sort the gaps to pop them from largest to small gaps.sort_unstable(); // Collect the indices of the largest holes until // the density is acceptable let mut selected = vec![]; while let Some((gap_size, gap_id)) = gaps.pop() { if density >= target_density { break; } selected.push(gap_id); // The gap sizes are stored as negatives to be sorted decreasingly // by the heap read_data -= gap_size; density = if read_data > 0 { chain_payload as f64 / read_data as f64 } else { 1.0 }; if density >= target_density { break; } } selected.sort_unstable(); selected.push(revs.len()); // Cut the revs at collected indices let mut previous_idx = 0; let mut chunks = vec![]; for idx in selected { let chunk = self.trim_chunk(&entries, previous_idx, idx); if !chunk.is_empty() { chunks.push(chunk.iter().map(|(rev, _entry)| *rev).collect()); } previous_idx = idx; } let chunk = self.trim_chunk(&entries, previous_idx, entries.len()); if !chunk.is_empty() { chunks.push(chunk.iter().map(|(rev, _entry)| *rev).collect()); } chunks } /// Get the byte span of a segment of sorted revisions. /// /// Occurrences of [`NULL_REVISION`] are ignored at the beginning of /// the `revs` segment. /// /// panics: /// - if `revs` is empty or only made of `NULL_REVISION` /// - if cannot retrieve entry for the last or first not null element of /// `revs`. fn segment_span(&self, revs: &[Revision]) -> usize { if revs.is_empty() { return 0; } let last_entry = &self.get_entry(revs[revs.len() - 1]).unwrap(); let end = last_entry.c_start() + last_entry.compressed_len() as u64; let first_rev = revs.iter().find(|r| r.0 != NULL_REVISION.0).unwrap(); let start = if (*first_rev).0 == 0 { 0 } else { self.get_entry(*first_rev).unwrap().c_start() }; (end - start) as usize } /// Returns `&revs[startidx..endidx]` without empty trailing revs fn trim_chunk<'a>( &'a self, revs: &'a [(Revision, IndexEntry)], start: usize, mut end: usize, ) -> &'a [(Revision, IndexEntry)] { // Trim empty revs at the end, except the very first rev of a chain let last_rev = revs[end - 1].0; if last_rev.0 < self.len() as BaseRevision { while end > 1 && end > start && revs[end - 1].1.compressed_len() == 0 { end -= 1 } } &revs[start..end] } /// Computes the set of revisions for each non-public phase from `roots`, /// which are the last known roots for each non-public phase. pub fn compute_phases_map_sets( &self, roots: HashMap<Phase, Vec<Revision>>, ) -> Result<(usize, RootsPerPhase), GraphError> { let mut phases = HashMap::new(); let mut min_phase_rev = NULL_REVISION; for phase in Phase::non_public_phases() { if let Some(phase_roots) = roots.get(phase) { let min_rev = self.add_roots_get_min(phase_roots, &mut phases, *phase); if min_rev != NULL_REVISION && (min_phase_rev == NULL_REVISION || min_rev < min_phase_rev) { min_phase_rev = min_rev; } } else { continue; }; } let mut phase_sets: RootsPerPhase = Default::default(); if min_phase_rev == NULL_REVISION { min_phase_rev = Revision(self.len() as BaseRevision); } for rev in min_phase_rev.0..self.len() as BaseRevision { let rev = Revision(rev); let [p1, p2] = self.parents(rev)?; const DEFAULT_PHASE: &Phase = &Phase::Public; if p1.0 >= 0 && phases.get(&p1).unwrap_or(DEFAULT_PHASE) > phases.get(&rev).unwrap_or(DEFAULT_PHASE) { phases.insert(rev, phases[&p1]); } if p2.0 >= 0 && phases.get(&p2).unwrap_or(DEFAULT_PHASE) > phases.get(&rev).unwrap_or(DEFAULT_PHASE) { phases.insert(rev, phases[&p2]); } let set = match phases.get(&rev).unwrap_or(DEFAULT_PHASE) { Phase::Public => continue, phase => &mut phase_sets[*phase as usize - 1], }; set.insert(rev); } Ok((self.len(), phase_sets)) } fn add_roots_get_min( &self, phase_roots: &[Revision], phases: &mut HashMap<Revision, Phase>, phase: Phase, ) -> Revision { let mut min_rev = NULL_REVISION; for root in phase_roots { phases.insert(*root, phase); if min_rev == NULL_REVISION || min_rev > *root { min_rev = *root; } } min_rev } /// Return `(heads(::(<roots> and <roots>::<heads>)))` /// If `include_path` is `true`, return `(<roots>::<heads>)`.""" /// /// `min_root` and `roots` are unchecked since they are just used as /// a bound or for comparison and don't need to represent a valid revision. /// In practice, the only invalid revision passed is the working directory /// revision ([`i32::MAX`]). pub fn reachable_roots( &self, min_root: UncheckedRevision, mut heads: Vec<Revision>, roots: HashSet<UncheckedRevision>, include_path: bool, ) -> Result<HashSet<Revision>, GraphError> { if roots.is_empty() { return Ok(HashSet::new()); } let mut reachable = HashSet::new(); let mut seen = HashMap::new(); while let Some(rev) = heads.pop() { if roots.contains(&rev.into()) { reachable.insert(rev); if !include_path { continue; } } let parents = self.parents(rev)?; seen.insert(rev, parents); for parent in parents { if parent.0 >= min_root.0 && !seen.contains_key(&parent) { heads.push(parent); } } } if !include_path { return Ok(reachable); } let mut revs: Vec<_> = seen.keys().collect(); revs.sort_unstable(); for rev in revs { for parent in seen[rev] { if reachable.contains(&parent) { reachable.insert(*rev); } } } Ok(reachable) } } /// Set of roots of all non-public phases pub type RootsPerPhase = [HashSet<Revision>; Phase::non_public_phases().len()]; #[derive(Debug, Copy, Clone, PartialEq, Eq, Ord, PartialOrd, Hash)] pub enum Phase { Public = 0, Draft = 1, Secret = 2, Archived = 3, Internal = 4, } impl TryFrom<usize> for Phase { type Error = RevlogError; fn try_from(value: usize) -> Result<Self, Self::Error> { Ok(match value { 0 => Self::Public, 1 => Self::Draft, 2 => Self::Secret, 32 => Self::Archived, 96 => Self::Internal, v => { return Err(RevlogError::corrupted(format!( "invalid phase value {}", v ))) } }) } } impl Phase { pub const fn all_phases() -> &'static [Self] { &[ Self::Public, Self::Draft, Self::Secret, Self::Archived, Self::Internal, ] } pub const fn non_public_phases() -> &'static [Self] { &[Self::Draft, Self::Secret, Self::Archived, Self::Internal] } } fn inline_scan(bytes: &[u8]) -> (usize, Vec<usize>) { let mut offset: usize = 0; let mut offsets = Vec::new(); while offset + INDEX_ENTRY_SIZE <= bytes.len() { offsets.push(offset); let end = offset + INDEX_ENTRY_SIZE; let entry = IndexEntry { bytes: &bytes[offset..end], offset_override: None, }; offset += INDEX_ENTRY_SIZE + entry.compressed_len() as usize; } (offset, offsets) } impl super::RevlogIndex for Index { fn len(&self) -> usize { self.len() } fn node(&self, rev: Revision) -> Option<&Node> { if rev == NULL_REVISION { return Some(&NULL_NODE); } self.get_entry(rev).map(|entry| entry.hash()) } } #[derive(Debug)] pub struct IndexEntry<'a> { bytes: &'a [u8], /// Allows to override the offset value of the entry. /// /// For interleaved index and data, the offset stored in the index /// corresponds to the separated data offset. /// It has to be overridden with the actual offset in the interleaved /// index which is just after the index block. /// /// For separated index and data, the offset stored in the first index /// entry is mixed with the index headers. /// It has to be overridden with 0. offset_override: Option<usize>, } impl<'a> IndexEntry<'a> { /// Return the offset of the data. pub fn offset(&self) -> usize { if let Some(offset_override) = self.offset_override { offset_override } else { let mut bytes = [0; 8]; bytes[2..8].copy_from_slice(&self.bytes[0..=5]); BigEndian::read_u64(&bytes[..]) as usize } } pub fn raw_offset(&self) -> u64 { BigEndian::read_u64(&self.bytes[0..8]) } /// Same result (except potentially for rev 0) as C `index_get_start()` fn c_start(&self) -> u64 { self.raw_offset() >> 16 } pub fn flags(&self) -> u16 { BigEndian::read_u16(&self.bytes[6..=7]) } /// Return the compressed length of the data. pub fn compressed_len(&self) -> u32 { BigEndian::read_u32(&self.bytes[8..=11]) } /// Return the uncompressed length of the data. pub fn uncompressed_len(&self) -> i32 { BigEndian::read_i32(&self.bytes[12..=15]) } /// Return the revision upon which the data has been derived. pub fn base_revision_or_base_of_delta_chain(&self) -> UncheckedRevision { // TODO Maybe return an Option when base_revision == rev? // Requires to add rev to IndexEntry BigEndian::read_i32(&self.bytes[16..]).into() } pub fn link_revision(&self) -> UncheckedRevision { BigEndian::read_i32(&self.bytes[20..]).into() } pub fn p1(&self) -> UncheckedRevision { BigEndian::read_i32(&self.bytes[24..]).into() } pub fn p2(&self) -> UncheckedRevision { BigEndian::read_i32(&self.bytes[28..]).into() } /// Return the hash of revision's full text. /// /// Currently, SHA-1 is used and only the first 20 bytes of this field /// are used. pub fn hash(&self) -> &'a Node { (&self.bytes[32..52]).try_into().unwrap() } pub fn as_bytes(&self) -> &'a [u8] { self.bytes } } #[cfg(test)] mod tests { use super::*; use crate::node::NULL_NODE; #[cfg(test)] #[derive(Debug, Copy, Clone)] pub struct IndexEntryBuilder { is_first: bool, is_inline: bool, is_general_delta: bool, version: u16, offset: usize, compressed_len: usize, uncompressed_len: usize, base_revision_or_base_of_delta_chain: Revision, link_revision: Revision, p1: Revision, p2: Revision, node: Node, } #[cfg(test)] impl IndexEntryBuilder { #[allow(clippy::new_without_default)] pub fn new() -> Self { Self { is_first: false, is_inline: false, is_general_delta: true, version: 1, offset: 0, compressed_len: 0, uncompressed_len: 0, base_revision_or_base_of_delta_chain: Revision(0), link_revision: Revision(0), p1: NULL_REVISION, p2: NULL_REVISION, node: NULL_NODE, } } pub fn is_first(&mut self, value: bool) -> &mut Self { self.is_first = value; self } pub fn with_inline(&mut self, value: bool) -> &mut Self { self.is_inline = value; self } pub fn with_general_delta(&mut self, value: bool) -> &mut Self { self.is_general_delta = value; self } pub fn with_version(&mut self, value: u16) -> &mut Self { self.version = value; self } pub fn with_offset(&mut self, value: usize) -> &mut Self { self.offset = value; self } pub fn with_compressed_len(&mut self, value: usize) -> &mut Self { self.compressed_len = value; self } pub fn with_uncompressed_len(&mut self, value: usize) -> &mut Self { self.uncompressed_len = value; self } pub fn with_base_revision_or_base_of_delta_chain( &mut self, value: Revision, ) -> &mut Self { self.base_revision_or_base_of_delta_chain = value; self } pub fn with_link_revision(&mut self, value: Revision) -> &mut Self { self.link_revision = value; self } pub fn with_p1(&mut self, value: Revision) -> &mut Self { self.p1 = value; self } pub fn with_p2(&mut self, value: Revision) -> &mut Self { self.p2 = value; self } pub fn with_node(&mut self, value: Node) -> &mut Self { self.node = value; self } pub fn build(&self) -> Vec<u8> { let mut bytes = Vec::with_capacity(INDEX_ENTRY_SIZE); if self.is_first { bytes.extend(&match (self.is_general_delta, self.is_inline) { (false, false) => [0u8, 0], (false, true) => [0u8, 1], (true, false) => [0u8, 2], (true, true) => [0u8, 3], }); bytes.extend(&self.version.to_be_bytes()); // Remaining offset bytes. bytes.extend(&[0u8; 2]); } else { // Offset stored on 48 bits (6 bytes) bytes.extend(&(self.offset as u64).to_be_bytes()[2..]); } bytes.extend(&[0u8; 2]); // Revision flags. bytes.extend(&(self.compressed_len as u32).to_be_bytes()); bytes.extend(&(self.uncompressed_len as u32).to_be_bytes()); bytes.extend( &self.base_revision_or_base_of_delta_chain.0.to_be_bytes(), ); bytes.extend(&self.link_revision.0.to_be_bytes()); bytes.extend(&self.p1.0.to_be_bytes()); bytes.extend(&self.p2.0.to_be_bytes()); bytes.extend(self.node.as_bytes()); bytes.extend(vec![0u8; 12]); bytes } } pub fn is_inline(index_bytes: &[u8]) -> bool { IndexHeader::parse(index_bytes) .expect("too short") .unwrap() .format_flags() .is_inline() } pub fn uses_generaldelta(index_bytes: &[u8]) -> bool { IndexHeader::parse(index_bytes) .expect("too short") .unwrap() .format_flags() .uses_generaldelta() } pub fn get_version(index_bytes: &[u8]) -> u16 { IndexHeader::parse(index_bytes) .expect("too short") .unwrap() .format_version() } #[test] fn flags_when_no_inline_flag_test() { let bytes = IndexEntryBuilder::new() .is_first(true) .with_general_delta(false) .with_inline(false) .build(); assert!(!is_inline(&bytes)); assert!(!uses_generaldelta(&bytes)); } #[test] fn flags_when_inline_flag_test() { let bytes = IndexEntryBuilder::new() .is_first(true) .with_general_delta(false) .with_inline(true) .build(); assert!(is_inline(&bytes)); assert!(!uses_generaldelta(&bytes)); } #[test] fn flags_when_inline_and_generaldelta_flags_test() { let bytes = IndexEntryBuilder::new() .is_first(true) .with_general_delta(true) .with_inline(true) .build(); assert!(is_inline(&bytes)); assert!(uses_generaldelta(&bytes)); } #[test] fn test_offset() { let bytes = IndexEntryBuilder::new().with_offset(1).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.offset(), 1) } #[test] fn test_with_overridden_offset() { let bytes = IndexEntryBuilder::new().with_offset(1).build(); let entry = IndexEntry { bytes: &bytes, offset_override: Some(2), }; assert_eq!(entry.offset(), 2) } #[test] fn test_compressed_len() { let bytes = IndexEntryBuilder::new().with_compressed_len(1).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.compressed_len(), 1) } #[test] fn test_uncompressed_len() { let bytes = IndexEntryBuilder::new().with_uncompressed_len(1).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.uncompressed_len(), 1) } #[test] fn test_base_revision_or_base_of_delta_chain() { let bytes = IndexEntryBuilder::new() .with_base_revision_or_base_of_delta_chain(Revision(1)) .build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.base_revision_or_base_of_delta_chain(), 1.into()) } #[test] fn link_revision_test() { let bytes = IndexEntryBuilder::new() .with_link_revision(Revision(123)) .build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.link_revision(), 123.into()); } #[test] fn p1_test() { let bytes = IndexEntryBuilder::new().with_p1(Revision(123)).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.p1(), 123.into()); } #[test] fn p2_test() { let bytes = IndexEntryBuilder::new().with_p2(Revision(123)).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(entry.p2(), 123.into()); } #[test] fn node_test() { let node = Node::from_hex("0123456789012345678901234567890123456789") .unwrap(); let bytes = IndexEntryBuilder::new().with_node(node).build(); let entry = IndexEntry { bytes: &bytes, offset_override: None, }; assert_eq!(*entry.hash(), node); } #[test] fn version_test() { let bytes = IndexEntryBuilder::new() .is_first(true) .with_version(2) .build(); assert_eq!(get_version(&bytes), 2) } } #[cfg(test)] pub use tests::IndexEntryBuilder;