rust/hg-core/src/revlog/inner_revlog.rs

//! A layer of lower-level revlog functionality to encapsulate most of the
//! IO work and expensive operations.
use std::{
    borrow::Cow,
    cell::RefCell,
    io::{ErrorKind, Seek, SeekFrom, Write},
    ops::Deref,
    path::PathBuf,
    sync::{Arc, Mutex},
};

use schnellru::{ByMemoryUsage, LruMap};
use sha1::{Digest, Sha1};

use crate::{
    errors::{HgError, IoResultExt},
    exit_codes,
    transaction::Transaction,
    vfs::Vfs,
};

use super::{
    compression::{
        uncompressed_zstd_data, CompressionConfig, Compressor, NoneCompressor,
        ZlibCompressor, ZstdCompressor, ZLIB_BYTE, ZSTD_BYTE,
    },
    file_io::{DelayedBuffer, FileHandle, RandomAccessFile, WriteHandles},
    index::{Index, IndexHeader, INDEX_ENTRY_SIZE},
    node::{NODE_BYTES_LENGTH, NULL_NODE},
    options::{RevlogDataConfig, RevlogDeltaConfig, RevlogFeatureConfig},
    BaseRevision, Node, Revision, RevlogEntry, RevlogError, RevlogIndex,
    UncheckedRevision, NULL_REVISION, NULL_REVLOG_ENTRY_FLAGS,
};

/// Matches the `_InnerRevlog` class in the Python code, as an arbitrary
/// boundary to incrementally rewrite higher-level revlog functionality in
/// Rust.
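///
/// A rough usage sketch; the `vfs`, `index`, file paths, and config values
/// are assumed to come from an already-opened repository:
///
/// ```ignore
/// let inner = InnerRevlog::new(
///     vfs,
///     index,
///     index_file,
///     data_file,
///     data_config,
///     delta_config,
///     feature_config,
/// );
/// let entry = inner.get_entry(UncheckedRevision(0))?;
/// ```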
pub struct InnerRevlog {
    /// When index and data are not interleaved: bytes of the revlog index.
    /// When index and data are interleaved (inline revlog): bytes of the
    /// revlog index and data.
    pub index: Index,
    /// The store vfs that is used to interact with the filesystem
    vfs: Box<dyn Vfs>,
    /// The index file path, relative to the vfs root
    pub index_file: PathBuf,
    /// The data file path, relative to the vfs root (same as `index_file`
    /// if inline)
    data_file: PathBuf,
    /// Data config that applies to this revlog
    data_config: RevlogDataConfig,
    /// Delta config that applies to this revlog
    delta_config: RevlogDeltaConfig,
    /// Feature config that applies to this revlog
    feature_config: RevlogFeatureConfig,
    /// A view into this revlog's data file
    segment_file: RandomAccessFile,
    /// A cache of uncompressed chunks that have previously been restored.
    /// Its eviction policy is defined in [`Self::new`].
    uncompressed_chunk_cache: Option<UncompressedChunkCache>,
    /// Used to keep track of the actual target during diverted writes
    /// for the changelog
    original_index_file: Option<PathBuf>,
    /// Write handles to the index and data files
    /// XXX why duplicate from `index` and `segment_file`?
    writing_handles: Option<WriteHandles>,
    /// See [`DelayedBuffer`].
    delayed_buffer: Option<Arc<Mutex<DelayedBuffer>>>,
    /// Whether this revlog is inline. XXX why duplicate from `index`?
    pub inline: bool,
    /// A cache of the last revision, which is usually accessed multiple
    /// times.
    pub last_revision_cache: Mutex<Option<SingleRevisionCache>>,
}

impl InnerRevlog {
    pub fn new(
        vfs: Box<dyn Vfs>,
        index: Index,
        index_file: PathBuf,
        data_file: PathBuf,
        data_config: RevlogDataConfig,
        delta_config: RevlogDeltaConfig,
        feature_config: RevlogFeatureConfig,
    ) -> Self {
        assert!(index_file.is_relative());
        assert!(data_file.is_relative());
        let segment_file = RandomAccessFile::new(
            dyn_clone::clone_box(&*vfs),
            if index.is_inline() {
                index_file.to_owned()
            } else {
                data_file.to_owned()
            },
        );

        let uncompressed_chunk_cache =
            data_config.uncompressed_cache_factor.map(
                // Arbitrary initial value
                // TODO check if using a hasher specific to integers is useful
                |_factor| RefCell::new(LruMap::with_memory_budget(65536)),
            );

        let inline = index.is_inline();
        Self {
            index,
            vfs,
            index_file,
            data_file,
            data_config,
            delta_config,
            feature_config,
            segment_file,
            uncompressed_chunk_cache,
            original_index_file: None,
            writing_handles: None,
            delayed_buffer: None,
            inline,
            last_revision_cache: Mutex::new(None),
        }
    }

    /// Return the number of entries in the revlog index
    pub fn len(&self) -> usize {
        self.index.len()
    }

    /// Return `true` if this revlog has no entries
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Return whether this revlog is inline (mixed index and data)
    pub fn is_inline(&self) -> bool {
        self.inline
    }

    /// Clear all caches from this revlog
    pub fn clear_cache(&mut self) {
        assert!(!self.is_delaying());
        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
            // We don't clear the allocation here because it's probably faster.
            // We could change our minds later if this ends up being a problem
            // with regards to memory consumption.
            cache.borrow_mut().clear();
        }
    }

    /// Return an entry for the null revision
    pub fn make_null_entry(&self) -> RevlogEntry {
        RevlogEntry {
            revlog: self,
            rev: NULL_REVISION,
            uncompressed_len: 0,
            p1: NULL_REVISION,
            p2: NULL_REVISION,
            flags: NULL_REVLOG_ENTRY_FLAGS,
            hash: NULL_NODE,
        }
    }

    /// Return the [`RevlogEntry`] for a [`Revision`] that is known to exist
    pub fn get_entry_for_checked_rev(
        &self,
        rev: Revision,
    ) -> Result<RevlogEntry, RevlogError> {
        if rev == NULL_REVISION {
            return Ok(self.make_null_entry());
        }
        let index_entry = self
            .index
            .get_entry(rev)
            .ok_or_else(|| RevlogError::InvalidRevision(rev.to_string()))?;
        let p1 =
            self.index.check_revision(index_entry.p1()).ok_or_else(|| {
                RevlogError::corrupted(format!(
                    "p1 for rev {} is invalid",
                    rev
                ))
            })?;
        let p2 =
            self.index.check_revision(index_entry.p2()).ok_or_else(|| {
                RevlogError::corrupted(format!(
                    "p2 for rev {} is invalid",
                    rev
                ))
            })?;
        let entry = RevlogEntry {
            revlog: self,
            rev,
            uncompressed_len: index_entry.uncompressed_len(),
            p1,
            p2,
            flags: index_entry.flags(),
            hash: *index_entry.hash(),
        };
        Ok(entry)
    }

    /// Return the [`RevlogEntry`] for `rev`. If `rev` is not a valid
    /// revision, this returns a [`RevlogError`].
    /// TODO normalize naming across the index and all revlogs
    /// (changelog, etc.) so that `get_entry` is always on an unchecked rev and
    /// `get_entry_for_checked_rev` is for checked rev
    pub fn get_entry(
        &self,
        rev: UncheckedRevision,
    ) -> Result<RevlogEntry, RevlogError> {
        if rev == NULL_REVISION.into() {
            return Ok(self.make_null_entry());
        }
        let rev = self.index.check_revision(rev).ok_or_else(|| {
            RevlogError::corrupted(format!("rev {} is invalid", rev))
        })?;
        self.get_entry_for_checked_rev(rev)
    }

    /// Is the revlog currently delaying the visibility of written data?
    ///
    /// The delaying mechanism can be either in-memory or written on disk in a
    /// side-file.
    pub fn is_delaying(&self) -> bool {
        self.delayed_buffer.is_some() || self.original_index_file.is_some()
    }

    /// The offset of the data chunk for this revision
    #[inline(always)]
    pub fn start(&self, rev: Revision) -> usize {
        self.index.start(
            rev,
            &self
                .index
                .get_entry(rev)
                .unwrap_or_else(|| self.index.make_null_entry()),
        )
    }

    /// The length of the data chunk for this revision
    /// TODO rename this method and others to more explicit names than the
    /// existing ones that were copied over from Python
    #[inline(always)]
    pub fn length(&self, rev: Revision) -> usize {
        self.index
            .get_entry(rev)
            .unwrap_or_else(|| self.index.make_null_entry())
            .compressed_len() as usize
    }

    /// The end of the data chunk for this revision
    #[inline(always)]
    pub fn end(&self, rev: Revision) -> usize {
        self.start(rev) + self.length(rev)
    }

    /// Return the delta parent of the given revision
    pub fn delta_parent(&self, rev: Revision) -> Revision {
        let base = self
            .index
            .get_entry(rev)
            .unwrap()
            .base_revision_or_base_of_delta_chain();
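        // A revision whose recorded base is itself stores a full snapshot,
        // so it has no delta parent. With general-delta the base field is
        // the actual delta parent; in the legacy layout deltas are always
        // taken against the immediately preceding revision.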
        if base.0 == rev.0 {
            NULL_REVISION
        } else if self.delta_config.general_delta {
            Revision(base.0)
        } else {
            Revision(rev.0 - 1)
        }
    }

    /// Return whether `rev` points to a snapshot revision (i.e. does not have
    /// a delta base).
    pub fn is_snapshot(&self, rev: Revision) -> Result<bool, RevlogError> {
        if !self.delta_config.sparse_revlog {
            return Ok(self.delta_parent(rev) == NULL_REVISION);
        }
        self.index.is_snapshot_unchecked(rev)
    }

    /// Return the delta chain for `rev` according to this revlog's config.
    /// See [`Index::delta_chain`] for more information.
    pub fn delta_chain(
        &self,
        rev: Revision,
        stop_rev: Option<Revision>,
    ) -> Result<(Vec<Revision>, bool), HgError> {
        self.index.delta_chain(
            rev,
            stop_rev,
            self.delta_config.general_delta.into(),
        )
    }

    fn compressor(&self) -> Result<Box<dyn Compressor>, HgError> {
        // TODO cache the compressor?
        Ok(match self.feature_config.compression_engine {
            CompressionConfig::Zlib { level } => {
                Box::new(ZlibCompressor::new(level))
            }
            CompressionConfig::Zstd { level, threads } => {
                Box::new(ZstdCompressor::new(level, threads))
            }
            CompressionConfig::None => Box::new(NoneCompressor),
        })
    }

    /// Generate a possibly-compressed representation of data.
    /// Returns `None` if the data was not compressed.
    pub fn compress<'data>(
        &self,
        data: &'data [u8],
    ) -> Result<Option<Cow<'data, [u8]>>, RevlogError> {
        if data.is_empty() {
            return Ok(Some(data.into()));
        }
        let res = self.compressor()?.compress(data)?;
        if let Some(compressed) = res {
            // The revlog compressor added the header in the returned data.
            return Ok(Some(compressed.into()));
        }

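        // Data that starts with a NUL byte needs no marker of its own: a
        // leading NUL is already interpreted as "uncompressed" by
        // `decompress`, so such data can be stored verbatim.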
        if data[0] == b'\0' {
            return Ok(Some(data.into()));
        }
        Ok(None)
    }

    /// Decompress a revlog chunk.
    ///
    /// The chunk is expected to begin with a header identifying the
    /// format type so it can be routed to an appropriate decompressor.
    pub fn decompress<'a>(
        &'a self,
        data: &'a [u8],
    ) -> Result<Cow<[u8]>, RevlogError> {
        if data.is_empty() {
            return Ok(data.into());
        }

        // Revlogs are read much more frequently than they are written and many
        // chunks only take microseconds to decompress, so performance is
        // important here.

        let header = data[0];
        match header {
            // Settings don't matter as they only affect compression
            ZLIB_BYTE => Ok(ZlibCompressor::new(0).decompress(data)?.into()),
            // Settings don't matter as they only affect compression
            ZSTD_BYTE => {
                Ok(ZstdCompressor::new(0, 0).decompress(data)?.into())
            }
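            // A leading NUL byte means the chunk is stored verbatim (the NUL
            // is part of the data itself), while `u` is an explicit
            // "uncompressed" marker that gets stripped.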
            b'\0' => Ok(data.into()),
            b'u' => Ok((&data[1..]).into()),
            other => Err(HgError::UnsupportedFeature(format!(
                "unknown compression header '{}'",
                other
            ))
            .into()),
        }
    }

    /// Obtain a segment of raw data corresponding to a range of revisions.
    ///
    /// Requests for data may be satisfied by a cache.
    ///
    /// Returns a 2-tuple of (offset, data) for the requested range of
    /// revisions. Offset is the integer offset from the beginning of the
    /// revlog and data is a slice of the raw byte data.
    ///
    /// Callers will need to call `self.start(rev)` and `self.length(rev)`
    /// to determine where each revision's data begins and ends.
    pub fn get_segment_for_revs(
        &self,
        start_rev: Revision,
        end_rev: Revision,
    ) -> Result<(usize, Vec<u8>), HgError> {
        let start = if start_rev == NULL_REVISION {
            0
        } else {
            let start_entry = self
                .index
                .get_entry(start_rev)
                .expect("null revision segment");
            self.index.start(start_rev, &start_entry)
        };
        let end_entry = self
            .index
            .get_entry(end_rev)
            .expect("null revision segment");
        let end = self.index.start(end_rev, &end_entry) + self.length(end_rev);

        let length = end - start;

        // XXX should we use mmap instead of doing this for platforms that
        // support madvise/populate?
        Ok((start, self.segment_file.read_chunk(start, length)?))
    }

    /// Return the uncompressed raw data for `rev`
    pub fn chunk_for_rev(&self, rev: Revision) -> Result<Arc<[u8]>, HgError> {
        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
            if let Some(chunk) = cache.borrow_mut().get(&rev) {
                return Ok(chunk.clone());
            }
        }
        // TODO revlogv2 should check the compression mode
        let data = self.get_segment_for_revs(rev, rev)?.1;
        let uncompressed = self.decompress(&data).map_err(|e| {
            HgError::abort(
                format!("revlog decompression error: {}", e),
                exit_codes::ABORT,
                None,
            )
        })?;
        let uncompressed: Arc<[u8]> = Arc::from(uncompressed.into_owned());
        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
            cache.borrow_mut().insert(rev, uncompressed.clone());
        }
        Ok(uncompressed)
    }

    /// Execute `func` within a read context for the data file, meaning that
    /// the read handle will be taken and discarded after the operation.
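    ///
    /// A sketch of the call shape, assuming `revlog` is an [`InnerRevlog`]
    /// (the closure body is only illustrative):
    ///
    /// ```ignore
    /// let chunk_len = revlog.with_read(|| {
    ///     let chunk = revlog.chunk_for_rev(Revision(0))?;
    ///     Ok(chunk.len())
    /// })?;
    /// ```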
    pub fn with_read<R>(
        &self,
        func: impl FnOnce() -> Result<R, RevlogError>,
    ) -> Result<R, RevlogError> {
        self.enter_reading_context()?;
        let res = func();
        self.exit_reading_context();
        res.map_err(Into::into)
    }

    /// `pub` only for use in hg-cpython
    #[doc(hidden)]
    pub fn enter_reading_context(&self) -> Result<(), HgError> {
        if self.is_empty() {
            // Nothing to be read
            return Ok(());
        }
        if self.delayed_buffer.is_some() && self.is_inline() {
            return Err(HgError::abort(
                "revlog with delayed write should not be inline",
                exit_codes::ABORT,
                None,
            ));
        }
        self.segment_file.get_read_handle()?;
        Ok(())
    }

    /// `pub` only for use in hg-cpython
    #[doc(hidden)]
    pub fn exit_reading_context(&self) {
        self.segment_file.exit_reading_context()
    }

    /// Fill the buffer returned by `get_buffer` with the raw text for a
    /// revision. The text may not have been validated yet; it is already
    /// validated if it comes from the last-revision cache.
    pub fn raw_text<G, T>(
        &self,
        rev: Revision,
        get_buffer: G,
    ) -> Result<(), RevlogError>
    where
        G: FnOnce(
            usize,
            &mut dyn FnMut(
                &mut dyn RevisionBuffer<Target = T>,
            ) -> Result<(), RevlogError>,
        ) -> Result<(), RevlogError>,
    {
        let entry = &self.get_entry_for_checked_rev(rev)?;
        let raw_size = entry.uncompressed_len();
        let mut mutex_guard = self
            .last_revision_cache
            .lock()
            .expect("lock should not be held");
        let cached_rev = if let Some((_node, rev, data)) = &*mutex_guard {
            Some((*rev, data.deref().as_ref()))
        } else {
            None
        };
        if let Some(cache) = &self.uncompressed_chunk_cache {
            let cache = &mut cache.borrow_mut();
            if let Some(size) = raw_size {
                // Dynamically update the uncompressed_chunk_cache size to the
                // largest revision we've seen in this revlog.
                // Do it *before* restoration in case the current revision
                // is the largest.
                let factor = self
                    .data_config
                    .uncompressed_cache_factor
                    .expect("cache should not exist without factor");
                let candidate_size = (size as f64 * factor) as usize;
                let limiter_mut = cache.limiter_mut();
                if candidate_size > limiter_mut.max_memory_usage() {
                    std::mem::swap(
                        limiter_mut,
                        &mut ByMemoryUsage::new(candidate_size),
                    );
                }
            }
        }
        entry.rawdata(cached_rev, get_buffer)?;
        // drop cache to save memory, the caller is expected to update
        // the revision cache after validating the text
        mutex_guard.take();
        Ok(())
    }

    /// Only `pub` for `hg-cpython`.
    /// Obtain decompressed raw data for the specified revisions that are
    /// assumed to be in ascending order.
    ///
    /// Returns a list with decompressed data for each requested revision.
    #[doc(hidden)]
    pub fn chunks(
        &self,
        revs: Vec<Revision>,
        target_size: Option<u64>,
    ) -> Result<Vec<Arc<[u8]>>, RevlogError> {
        if revs.is_empty() {
            return Ok(vec![]);
        }
        let mut fetched_revs = vec![];
        let mut chunks = Vec::with_capacity(revs.len());

        match self.uncompressed_chunk_cache.as_ref() {
            Some(cache) => {
                let mut cache = cache.borrow_mut();
                for rev in revs.iter() {
                    match cache.get(rev) {
                        Some(hit) => chunks.push((*rev, hit.to_owned())),
                        None => fetched_revs.push(*rev),
                    }
                }
            }
            None => fetched_revs = revs,
        }

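        // Remember how many chunks were satisfied from the cache, so that
        // only the freshly fetched ones are inserted back into it later.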
        let already_cached = chunks.len();

        let sliced_chunks = if fetched_revs.is_empty() {
            vec![]
        } else if !self.data_config.with_sparse_read || self.is_inline() {
            vec![fetched_revs]
        } else {
            self.slice_chunk(&fetched_revs, target_size)?
        };

        self.with_read(|| {
            for revs_chunk in sliced_chunks {
                let first_rev = revs_chunk[0];
                // Skip trailing revisions with empty diff
                let last_rev_idx = revs_chunk
                    .iter()
                    .rposition(|r| self.length(*r) != 0)
                    .unwrap_or(revs_chunk.len() - 1);

                let last_rev = revs_chunk[last_rev_idx];

                let (offset, data) =
                    self.get_segment_for_revs(first_rev, last_rev)?;

                let revs_chunk = &revs_chunk[..=last_rev_idx];

                for rev in revs_chunk {
                    let chunk_start = self.start(*rev);
                    let chunk_length = self.length(*rev);
                    // TODO revlogv2 should check the compression mode
                    let bytes = &data[chunk_start - offset..][..chunk_length];
                    let chunk = if !bytes.is_empty() && bytes[0] == ZSTD_BYTE {
                        // If we're using `zstd`, we want to try a more
                        // specialized decompression
                        let entry = self.index.get_entry(*rev).unwrap();
                        let is_delta = entry
                            .base_revision_or_base_of_delta_chain()
                            != (*rev).into();
                        let uncompressed = uncompressed_zstd_data(
                            bytes,
                            is_delta,
                            entry.uncompressed_len(),
                        )?;
                        Cow::Owned(uncompressed)
                    } else {
                        // Otherwise just fallback to generic decompression.
                        self.decompress(bytes)?
                    };

                    chunks.push((*rev, chunk.into()));
                }
            }
            Ok(())
        })?;

        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
            let mut cache = cache.borrow_mut();
            for (rev, chunk) in chunks.iter().skip(already_cached) {
                cache.insert(*rev, chunk.clone());
            }
        }
        // Use stable sort here since it's *mostly* sorted
        chunks.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(chunks.into_iter().map(|(_r, chunk)| chunk).collect())
    }

    /// Slice revs to reduce the amount of unrelated data to be read from disk.
    ///
    /// `revs` is sliced into groups that should each be read in a single
    /// pass. The revisions in `revs` are assumed to be sorted.
    ///
    /// The initial chunk is sliced until the overall density
    /// (payload/chunks-span ratio) is above
    /// `revlog.data_config.sr_density_threshold`.
    /// No gap smaller than `revlog.data_config.sr_min_gap_size` is skipped.
    ///
    /// If `target_size` is set, no chunk larger than `target_size`
    /// will be returned.
    /// For consistency with other slicing choices, this limit won't go lower
    /// than `revlog.data_config.sr_min_gap_size`.
    ///
    /// If individual revision chunks are larger than this limit, they will
    /// still be returned individually.
    pub fn slice_chunk(
        &self,
        revs: &[Revision],
        target_size: Option<u64>,
    ) -> Result<Vec<Vec<Revision>>, RevlogError> {
        let target_size =
            target_size.map(|size| size.max(self.data_config.sr_min_gap_size));

        let target_density = self.data_config.sr_density_threshold;
        let min_gap_size = self.data_config.sr_min_gap_size as usize;
        let to_density = self.index.slice_chunk_to_density(
            revs,
            target_density,
            min_gap_size,
        );

        let mut sliced = vec![];

        for chunk in to_density {
            sliced.extend(
                self.slice_chunk_to_size(&chunk, target_size)?
                    .into_iter()
                    .map(ToOwned::to_owned),
            );
        }

        Ok(sliced)
    }

    /// Slice revs to match the target size
    ///
    /// This is intended to be used on chunks that density slicing selected,
    /// but that are still too large compared to the read guarantee of revlogs.
    /// This might happen when the "minimal gap size" interrupts the slicing,
    /// or when chains are built in a way that creates large blocks next to
    /// each other.
    fn slice_chunk_to_size<'a>(
        &self,
        revs: &'a [Revision],
        target_size: Option<u64>,
    ) -> Result<Vec<&'a [Revision]>, RevlogError> {
        let mut start_data = self.start(revs[0]);
        let end_data = self.end(revs[revs.len() - 1]);
        let full_span = end_data - start_data;

        let nothing_to_do = target_size
            .map(|size| full_span <= size as usize)
            .unwrap_or(true);

        if nothing_to_do {
            return Ok(vec![revs]);
        }
        let target_size = target_size.expect("target_size is set") as usize;

        let mut start_rev_idx = 0;
        let mut end_rev_idx = 1;
        let mut chunks = vec![];

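        // First pass: extend the current chunk while it stays within
        // `target_size` and only contains snapshot revisions; the first
        // non-snapshot revision ends this pass and becomes the start of the
        // range handled by the binary slicing below.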
        for (idx, rev) in revs.iter().enumerate().skip(1) {
            let span = self.end(*rev) - start_data;
            let is_snapshot = self.is_snapshot(*rev)?;
            if span <= target_size && is_snapshot {
                end_rev_idx = idx + 1;
            } else {
                let chunk =
                    self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
                if !chunk.is_empty() {
                    chunks.push(chunk);
                }
                start_rev_idx = idx;
                start_data = self.start(*rev);
                end_rev_idx = idx + 1;
            }
            if !is_snapshot {
                break;
            }
        }

        // For the others, we use binary slicing to quickly converge towards
        // valid chunks (otherwise, we might end up looking for the start/end
        // of many revisions). This logic is not looking for the perfect
        // slicing point, it quickly converges towards valid chunks.
        let number_of_items = revs.len();

        while (end_data - start_data) > target_size {
            end_rev_idx = number_of_items;
            if number_of_items - start_rev_idx <= 1 {
                // Protect against individual chunks larger than the limit
                break;
            }
            let mut local_end_data = self.end(revs[end_rev_idx - 1]);
            let mut span = local_end_data - start_data;
            while span > target_size {
                if end_rev_idx - start_rev_idx <= 1 {
                    // Protect against individual chunks larger than the limit
                    break;
                }
                end_rev_idx -= (end_rev_idx - start_rev_idx) / 2;
                local_end_data = self.end(revs[end_rev_idx - 1]);
                span = local_end_data - start_data;
            }
            let chunk =
                self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
            if !chunk.is_empty() {
                chunks.push(chunk);
            }
            start_rev_idx = end_rev_idx;
            start_data = self.start(revs[start_rev_idx]);
        }

        let chunk = self.trim_chunk(revs, start_rev_idx, None);
        if !chunk.is_empty() {
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Returns `revs[start_rev_idx..end_rev_idx]` without empty trailing revs
    fn trim_chunk<'a>(
        &self,
        revs: &'a [Revision],
        start_rev_idx: usize,
        end_rev_idx: Option<usize>,
    ) -> &'a [Revision] {
        let mut end_rev_idx = end_rev_idx.unwrap_or(revs.len());

        // If we have a non-empty delta candidate, there is nothing to trim
        if revs[end_rev_idx - 1].0 < self.len() as BaseRevision {
            // Trim empty revs at the end, except the very first rev of a chain
            while end_rev_idx > 1
                && end_rev_idx > start_rev_idx
                && self.length(revs[end_rev_idx - 1]) == 0
            {
                end_rev_idx -= 1
            }
        }

        &revs[start_rev_idx..end_rev_idx]
    }

    /// Check the hash of some given data against the recorded hash.
    pub fn check_hash(
        &self,
        p1: Revision,
        p2: Revision,
        expected: &[u8],
        data: &[u8],
    ) -> bool {
        let e1 = self.index.get_entry(p1);
        let h1 = match e1 {
            Some(ref entry) => entry.hash(),
            None => &NULL_NODE,
        };
        let e2 = self.index.get_entry(p2);
        let h2 = match e2 {
            Some(ref entry) => entry.hash(),
            None => &NULL_NODE,
        };

        hash(data, h1.as_bytes(), h2.as_bytes()) == expected
    }

    /// Returns whether we are currently in a [`Self::with_write`] context
    pub fn is_writing(&self) -> bool {
        self.writing_handles.is_some()
    }

    /// Open the revlog files for writing
    ///
    /// Adding content to a revlog should be done within this context.
    /// TODO try using `BufRead` and `BufWrite` and see if performance improves
    pub fn with_write<R>(
        &mut self,
        transaction: &mut impl Transaction,
        data_end: Option<usize>,
        func: impl FnOnce() -> R,
    ) -> Result<R, HgError> {
        if self.is_writing() {
            return Ok(func());
        }
        self.enter_writing_context(data_end, transaction)
            .map_err(|e| {
                self.exit_writing_context();
                e
            })?;
        let res = func();
        self.exit_writing_context();
        Ok(res)
    }

    /// `pub` only for use in hg-cpython
    #[doc(hidden)]
    pub fn exit_writing_context(&mut self) {
        self.writing_handles.take();
        self.segment_file.writing_handle.take();
        self.segment_file.reading_handle.take();
    }

    /// `pub` only for use in hg-cpython
    #[doc(hidden)]
    pub fn python_writing_handles(&self) -> Option<&WriteHandles> {
        self.writing_handles.as_ref()
    }

    /// `pub` only for use in hg-cpython
    #[doc(hidden)]
    pub fn enter_writing_context(
        &mut self,
        data_end: Option<usize>,
        transaction: &mut impl Transaction,
    ) -> Result<(), HgError> {
        let data_size = if self.is_empty() {
            0
        } else {
            self.end(Revision((self.len() - 1) as BaseRevision))
        };
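        // Inline revlogs keep their data inside the index file, so a
        // separate data handle is only needed for non-inline revlogs.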
        let data_handle = if !self.is_inline() {
            let data_handle = match self.vfs.open(&self.data_file) {
                Ok(mut f) => {
                    if let Some(end) = data_end {
                        f.seek(SeekFrom::Start(end as u64))
                            .when_reading_file(&self.data_file)?;
                    } else {
                        f.seek(SeekFrom::End(0))
                            .when_reading_file(&self.data_file)?;
                    }
                    f
                }
                Err(e) => match e {
                    HgError::IoError { error, context } => {
                        if error.kind() != ErrorKind::NotFound {
                            return Err(HgError::IoError { error, context });
                        }
                        self.vfs.create(&self.data_file, true)?
                    }
                    e => return Err(e),
                },
            };
            transaction.add(&self.data_file, data_size);
            Some(FileHandle::from_file(
                data_handle,
                dyn_clone::clone_box(&*self.vfs),
                &self.data_file,
            ))
        } else {
            None
        };
        let index_size = self.len() * INDEX_ENTRY_SIZE;
        let index_handle = self.index_write_handle()?;
        if self.is_inline() {
            transaction.add(&self.index_file, data_size);
        } else {
            transaction.add(&self.index_file, index_size);
        }
        self.writing_handles = Some(WriteHandles {
            index_handle: index_handle.clone(),
            data_handle: data_handle.clone(),
        });
        *self.segment_file.reading_handle.borrow_mut() = if self.is_inline() {
            Some(index_handle)
        } else {
            data_handle
        };
        Ok(())
    }

    /// Get a write handle to the index, positioned at the end of its data.
    fn index_write_handle(&self) -> Result<FileHandle, HgError> {
        let res = if self.delayed_buffer.is_none() {
            if self.data_config.check_ambig {
                self.vfs.open_check_ambig(&self.index_file)
            } else {
                self.vfs.open(&self.index_file)
            }
        } else {
            self.vfs.open(&self.index_file)
        };
        match res {
            Ok(mut handle) => {
                handle
                    .seek(SeekFrom::End(0))
                    .when_reading_file(&self.index_file)?;
                Ok(
                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
                    {
                        FileHandle::from_file_delayed(
                            handle,
                            dyn_clone::clone_box(&*self.vfs),
                            &self.index_file,
                            delayed_buffer.clone(),
                        )?
                    } else {
                        FileHandle::from_file(
                            handle,
                            dyn_clone::clone_box(&*self.vfs),
                            &self.index_file,
                        )
                    },
                )
            }
            Err(e) => match e {
                HgError::IoError { error, context } => {
                    if error.kind() != ErrorKind::NotFound {
                        return Err(HgError::IoError { error, context });
                    };
                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
                    {
                        FileHandle::new_delayed(
                            dyn_clone::clone_box(&*self.vfs),
                            &self.index_file,
                            true,
                            delayed_buffer.clone(),
                        )
                    } else {
                        FileHandle::new(
                            dyn_clone::clone_box(&*self.vfs),
                            &self.index_file,
                            true,
                            true,
                        )
                    }
                }
                e => Err(e),
            },
        }
    }

    /// Split the data of an inline revlog into an index and a data file
    pub fn split_inline(
        &mut self,
        header: IndexHeader,
        new_index_file_path: Option<PathBuf>,
    ) -> Result<PathBuf, RevlogError> {
        assert!(self.delayed_buffer.is_none());
        let existing_handles = self.writing_handles.is_some();
        if let Some(handles) = &mut self.writing_handles {
            handles.index_handle.flush()?;
            self.writing_handles.take();
            self.segment_file.writing_handle.take();
        }
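        // Copy every revision's data segment out of the inline file into a
        // brand new data file, then rewrite the index without the inline
        // data and swap the in-memory structures accordingly.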
        let mut new_data_file_handle =
            self.vfs.create(&self.data_file, true)?;
        // Drop any potential data, possibly redundant with the VFS impl.
        new_data_file_handle
            .set_len(0)
            .when_writing_file(&self.data_file)?;

        self.with_read(|| -> Result<(), RevlogError> {
            for r in 0..self.index.len() {
                let rev = Revision(r as BaseRevision);
                let rev_segment = self.get_segment_for_revs(rev, rev)?.1;
                new_data_file_handle
                    .write_all(&rev_segment)
                    .when_writing_file(&self.data_file)?;
            }
            new_data_file_handle
                .flush()
                .when_writing_file(&self.data_file)?;
            Ok(())
        })?;

        if let Some(index_path) = new_index_file_path {
            self.index_file = index_path
        }

        let mut new_index_handle = self.vfs.create(&self.index_file, true)?;
        let mut new_data = Vec::with_capacity(self.len() * INDEX_ENTRY_SIZE);
        for r in 0..self.len() {
            let rev = Revision(r as BaseRevision);
            let entry = self.index.entry_binary(rev).unwrap_or_else(|| {
                panic!(
                    "entry {} should exist in {}",
                    r,
                    self.index_file.display()
                )
            });
            if r == 0 {
                new_data.extend(header.header_bytes);
            }
            new_data.extend(entry);
        }
        new_index_handle
            .write_all(&new_data)
            .when_writing_file(&self.index_file)?;
        // Replace the index with a new one because the buffer contains inline
        // data
        self.index = Index::new(Box::new(new_data), header)?;
        self.inline = false;

        self.segment_file = RandomAccessFile::new(
            dyn_clone::clone_box(&*self.vfs),
            self.data_file.to_owned(),
        );
        if existing_handles {
            // Switched from inline to conventional, reopen the index
            let new_data_handle = Some(FileHandle::from_file(
                new_data_file_handle,
                dyn_clone::clone_box(&*self.vfs),
                &self.data_file,
            ));
            self.writing_handles = Some(WriteHandles {
                index_handle: self.index_write_handle()?,
                data_handle: new_data_handle.clone(),
            });
            *self.segment_file.writing_handle.borrow_mut() = new_data_handle;
        }

        Ok(self.index_file.to_owned())
    }

    /// Write a new entry to this revlog.
    /// - `entry` is the index bytes
    /// - `header_and_data` is the compression header and the revision data
    /// - `offset` is the position in the data file to write to
    /// - `index_end` is the overwritten position in the index in revlog-v2,
    ///   since the format may allow a rewrite of garbage data at the end.
    /// - `data_end` is the overwritten position in the data-file in revlog-v2,
    ///   since the format may allow a rewrite of garbage data at the end.
    ///
    /// XXX Why do we have `data_end` *and* `offset`? Same question in Python
    pub fn write_entry(
        &mut self,
        mut transaction: impl Transaction,
        entry: &[u8],
        header_and_data: (&[u8], &[u8]),
        mut offset: usize,
        index_end: Option<u64>,
        data_end: Option<u64>,
    ) -> Result<(u64, Option<u64>), HgError> {
        let current_revision = self.len() - 1;
        let canonical_index_file = self.canonical_index_file();

        let is_inline = self.is_inline();
        let handles = match &mut self.writing_handles {
            None => {
                return Err(HgError::abort(
                    "adding revision outside of the `with_write` context",
                    exit_codes::ABORT,
                    None,
                ));
            }
            Some(handles) => handles,
        };
        let index_handle = &mut handles.index_handle;
        let data_handle = &mut handles.data_handle;
        if let Some(end) = index_end {
            index_handle
                .seek(SeekFrom::Start(end))
                .when_reading_file(&self.index_file)?;
        } else {
            index_handle
                .seek(SeekFrom::End(0))
                .when_reading_file(&self.index_file)?;
        }
        if let Some(data_handle) = data_handle {
            if let Some(end) = data_end {
                data_handle
                    .seek(SeekFrom::Start(end))
                    .when_reading_file(&self.data_file)?;
            } else {
                data_handle
                    .seek(SeekFrom::End(0))
                    .when_reading_file(&self.data_file)?;
            }
        }
        let (header, data) = header_and_data;

        if !is_inline {
            transaction.add(&self.data_file, offset);
            transaction
                .add(&canonical_index_file, current_revision * entry.len());
            let data_handle = data_handle
                .as_mut()
                .expect("data handle should exist when not inline");
            if !header.is_empty() {
                data_handle.write_all(header)?;
            }
            data_handle.write_all(data)?;
            match &mut self.delayed_buffer {
                Some(buf) => {
                    buf.lock()
                        .expect("propagate the panic")
                        .buffer
                        .write_all(entry)
                        .expect("write to delay buffer should succeed");
                }
                None => index_handle.write_all(entry)?,
            }
        } else if self.delayed_buffer.is_some() {
            return Err(HgError::abort(
                "invalid delayed write on inline revlog",
                exit_codes::ABORT,
                None,
            ));
        } else {
            offset += current_revision * entry.len();
            transaction.add(&canonical_index_file, offset);
            index_handle.write_all(entry)?;
            index_handle.write_all(header)?;
            index_handle.write_all(data)?;
        }
        let data_position = match data_handle {
            Some(h) => Some(h.position()?),
            None => None,
        };
        Ok((index_handle.position()?, data_position))
    }

    /// Return the real target index file rather than the temporary one when
    /// writes are being diverted
    pub fn canonical_index_file(&self) -> PathBuf {
        self.original_index_file
            .as_ref()
            .map(ToOwned::to_owned)
            .unwrap_or_else(|| self.index_file.to_owned())
    }

    /// Return the path to the diverted index
    fn diverted_index(&self) -> PathBuf {
        self.index_file.with_extension("i.a")
    }

    /// True if we're in a [`Self::with_write`] or [`Self::with_read`] context
    pub fn is_open(&self) -> bool {
        self.segment_file.is_open()
    }

    /// Set this revlog to delay its writes to a buffer
    pub fn delay(&mut self) -> Result<Option<PathBuf>, HgError> {
        assert!(!self.is_open());
        if self.is_inline() {
            return Err(HgError::abort(
                "revlog with delayed write should not be inline",
                exit_codes::ABORT,
                None,
            ));
        }
        if self.delayed_buffer.is_some() || self.original_index_file.is_some()
        {
            // Delay or divert already happening
            return Ok(None);
        }
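        // An empty revlog can simply divert its writes to a temporary index
        // file (see `diverted_index`); a non-empty one buffers the new index
        // data in memory instead.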
        if self.is_empty() {
            self.original_index_file = Some(self.index_file.to_owned());
            self.index_file = self.diverted_index();
            if self.vfs.exists(&self.index_file) {
                self.vfs.unlink(&self.index_file)?;
            }
            Ok(Some(self.index_file.to_owned()))
        } else {
            self.delayed_buffer =
                Some(Arc::new(Mutex::new(DelayedBuffer::default())));
            Ok(None)
        }
    }

    /// Write the pending data (in memory) if any to the diverted index file
    /// (on disk temporary file)
    pub fn write_pending(
        &mut self,
    ) -> Result<(Option<PathBuf>, bool), HgError> {
        assert!(!self.is_open());
        if self.is_inline() {
            return Err(HgError::abort(
                "revlog with delayed write should not be inline",
                exit_codes::ABORT,
                None,
            ));
        }
        if self.original_index_file.is_some() {
            return Ok((None, true));
        }
        let mut any_pending = false;
        let pending_index_file = self.diverted_index();
        if self.vfs.exists(&pending_index_file) {
            self.vfs.unlink(&pending_index_file)?;
        }
        self.vfs.copy(&self.index_file, &pending_index_file)?;
        if let Some(delayed_buffer) = self.delayed_buffer.take() {
            let mut index_file_handle = self.vfs.open(&pending_index_file)?;
            index_file_handle
                .seek(SeekFrom::End(0))
                .when_writing_file(&pending_index_file)?;
            let delayed_data =
                &delayed_buffer.lock().expect("propagate the panic").buffer;
            index_file_handle
                .write_all(delayed_data)
                .when_writing_file(&pending_index_file)?;
            any_pending = true;
        }
        self.original_index_file = Some(self.index_file.to_owned());
        self.index_file = pending_index_file;
        Ok((Some(self.index_file.to_owned()), any_pending))
    }

    /// Overwrite the canonical file with the diverted file, or write out the
    /// delayed buffer.
    /// Returns an error if the revlog is neither diverted nor delayed.
    pub fn finalize_pending(&mut self) -> Result<PathBuf, HgError> {
        assert!(!self.is_open());
        if self.is_inline() {
            return Err(HgError::abort(
                "revlog with delayed write should not be inline",
                exit_codes::ABORT,
                None,
            ));
        }
        match (
            self.delayed_buffer.as_ref(),
            self.original_index_file.as_ref(),
        ) {
            (None, None) => {
                return Err(HgError::abort(
                    "neither delay nor divert found on this revlog",
                    exit_codes::ABORT,
                    None,
                ));
            }
            (Some(delay), None) => {
                let mut index_file_handle = self.vfs.open(&self.index_file)?;
                index_file_handle
                    .seek(SeekFrom::End(0))
                    .when_writing_file(&self.index_file)?;
                index_file_handle
                    .write_all(
                        &delay.lock().expect("propagate the panic").buffer,
                    )
                    .when_writing_file(&self.index_file)?;
                self.delayed_buffer = None;
            }
            (None, Some(divert)) => {
                if self.vfs.exists(&self.index_file) {
                    self.vfs.rename(&self.index_file, divert, true)?;
                }
                divert.clone_into(&mut self.index_file);
                self.original_index_file = None;
            }
            (Some(_), Some(_)) => unreachable!(
                "{} is in an inconsistent state of both delay and divert",
                self.canonical_index_file().display(),
            ),
        }
        Ok(self.canonical_index_file())
    }

    /// `pub` only for `hg-cpython`. This is kept as a separate method from
    /// [`Revlog::index`] in case a different invariant pops up later.
    #[doc(hidden)]
    pub fn shared_index(&self) -> &Index {
        &self.index
    }
}

/// The use of a [`RefCell`] assumes that a given revlog will only
/// be accessed (read or written) by a single thread.
type UncompressedChunkCache =
    RefCell<LruMap<Revision, Arc<[u8]>, ByMemoryUsage>>;

/// The node, revision and data for the last revision we've seen. Speeds up
/// a lot of sequential operations of the revlog.
///
/// The data is not just bytes since it can come from Python and we want to
/// avoid copies if possible.
type SingleRevisionCache =
    (Node, Revision, Box<dyn Deref<Target = [u8]> + Send>);

/// A way of progressively filling a buffer with revision data, then returning
/// that buffer. Used to abstract away Python-allocated buffers in order to
/// reduce copying for performance reasons.
pub trait RevisionBuffer {
    /// The owned buffer type to return
    type Target;
    /// Copies the slice into the buffer
    fn extend_from_slice(&mut self, slice: &[u8]);
    /// Returns the now finished owned buffer
    fn finish(self) -> Self::Target;
}

/// A simple vec-based buffer. This is uselessly complicated for the pure Rust
/// case, but it's the price to pay for Python compatibility.
#[derive(Debug)]
pub(super) struct CoreRevisionBuffer {
    buf: Vec<u8>,
}

impl CoreRevisionBuffer {
    pub fn new() -> Self {
        Self { buf: vec![] }
    }

    #[inline]
    pub fn resize(&mut self, size: usize) {
        // Only ever grow the buffer; `saturating_sub` protects against an
        // underflow when enough capacity has already been reserved.
        self.buf.reserve_exact(size.saturating_sub(self.buf.capacity()));
    }
}

impl RevisionBuffer for CoreRevisionBuffer {
    type Target = Vec<u8>;

    #[inline]
    fn extend_from_slice(&mut self, slice: &[u8]) {
        self.buf.extend_from_slice(slice);
    }

    #[inline]
    fn finish(self) -> Self::Target {
        self.buf
    }
}

/// Calculate the hash of a revision given its data and its parents.
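///
/// A minimal usage sketch (both parents null; the resulting node value is
/// not asserted here):
///
/// ```ignore
/// let node_bytes =
///     hash(b"revision text", NULL_NODE.as_bytes(), NULL_NODE.as_bytes());
/// ```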
pub fn hash(
    data: &[u8],
    p1_hash: &[u8],
    p2_hash: &[u8],
) -> [u8; NODE_BYTES_LENGTH] {
    let mut hasher = Sha1::new();
    let (a, b) = (p1_hash, p2_hash);
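    // Feed the parents in a canonical (sorted) order so that the result does
    // not depend on which parent is p1 and which is p2.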
    if a > b {
        hasher.update(b);
        hasher.update(a);
    } else {
        hasher.update(a);
        hasher.update(b);
    }
    hasher.update(data);
    *hasher.finalize().as_ref()
}