changeset 52164:e01e84e5e426

rust-revlog: add a Rust-only `InnerRevlog` This mirrors the Python `InnerRevlog` and will be used in a future patch to replace said Python implementation. This allows us to start doing more things in pure Rust, in particular reading and writing operations. A lot of changes have to be introduced all at once, it wouldn't be very useful to separate this patch IMO since all of them are either interlocked or only useful with the rest.
author Raphaël Gomès <rgomes@octobus.net>
date Thu, 10 Oct 2024 10:34:51 +0200
parents 44823c5011fe
children 6f94dc3cf8cf
files rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/src/lib.rs rust/hg-core/src/revlog/changelog.rs rust/hg-core/src/revlog/index.rs rust/hg-core/src/revlog/inner_revlog.rs rust/hg-core/src/revlog/mod.rs rust/hg-core/src/revlog/patch.rs rust/hg-core/src/transaction.rs
diffstat 9 files changed, 1614 insertions(+), 345 deletions(-) [+]
line wrap: on
line diff
--- a/rust/Cargo.lock	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/Cargo.lock	Thu Oct 10 10:34:51 2024 +0200
@@ -15,6 +15,7 @@
 checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107"
 dependencies = [
  "cfg-if",
+ "getrandom 0.2.8",
  "once_cell",
  "version_check",
 ]
@@ -493,14 +494,14 @@
 
 [[package]]
 name = "filetime"
-version = "0.2.25"
+version = "0.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586"
+checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
 dependencies = [
  "cfg-if",
  "libc",
- "libredox",
- "windows-sys 0.59.0",
+ "redox_syscall 0.4.1",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -580,9 +581,9 @@
 
 [[package]]
 name = "hashbrown"
-version = "0.13.1"
+version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ff8ae62cd3a9102e5637afc8452c55acf3844001bd5374e0b0bd7b6616c038"
+checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
 dependencies = [
  "ahash",
  "rayon",
@@ -625,7 +626,7 @@
  "filetime",
  "flate2",
  "format-bytes",
- "hashbrown 0.13.1",
+ "hashbrown 0.13.2",
  "home",
  "im-rc",
  "indicatif",
@@ -643,6 +644,7 @@
  "rayon",
  "regex",
  "same-file",
+ "schnellru",
  "self_cell",
  "serde",
  "sha-1 0.10.0",
@@ -806,7 +808,6 @@
 dependencies = [
  "bitflags 2.6.0",
  "libc",
- "redox_syscall 0.5.3",
 ]
 
 [[package]]
@@ -1191,11 +1192,11 @@
 
 [[package]]
 name = "redox_syscall"
-version = "0.5.3"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
 dependencies = [
- "bitflags 2.6.0",
+ "bitflags 1.3.2",
 ]
 
 [[package]]
@@ -1283,6 +1284,17 @@
 ]
 
 [[package]]
+name = "schnellru"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9a8ef13a93c54d20580de1e5c413e624e53121d42fc7e2c11d10ef7f8b02367"
+dependencies = [
+ "ahash",
+ "cfg-if",
+ "hashbrown 0.13.2",
+]
+
+[[package]]
 name = "scopeguard"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1709,15 +1721,6 @@
 ]
 
 [[package]]
-name = "windows-sys"
-version = "0.59.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
-dependencies = [
- "windows-targets 0.52.6",
-]
-
-[[package]]
 name = "windows-targets"
 version = "0.48.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
--- a/rust/hg-core/Cargo.toml	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/Cargo.toml	Thu Oct 10 10:34:51 2024 +0200
@@ -42,6 +42,7 @@
 once_cell = "1.16.0"
 bitvec = "1.0.1"
 chrono = "0.4.34"
+schnellru = "0.2.1"
 dyn-clone = "1.0.16"
 filetime = "0.2.23"
 uuid = { version = "1.10", features = ["v4"] }
--- a/rust/hg-core/src/lib.rs	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/src/lib.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -37,6 +37,7 @@
 pub mod operations;
 pub mod progress;
 pub mod revset;
+pub mod transaction;
 pub mod update;
 pub mod utils;
 pub mod vfs;
--- a/rust/hg-core/src/revlog/changelog.rs	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/src/revlog/changelog.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -86,7 +86,7 @@
     }
 
     pub fn get_index(&self) -> &Index {
-        &self.revlog.index
+        self.revlog.index()
     }
 }
 
@@ -569,7 +569,6 @@
             base: temp.path().to_owned(),
         };
         std::fs::write(temp.path().join("foo.i"), b"").unwrap();
-        std::fs::write(temp.path().join("foo.d"), b"").unwrap();
         let revlog =
             Revlog::open(&vfs, "foo.i", None, RevlogOpenOptions::default())
                 .unwrap();
--- a/rust/hg-core/src/revlog/index.rs	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/src/revlog/index.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -272,7 +272,7 @@
     /// be accessed via the [`Self::head_revs`] method.
     /// The last filtered revisions in this index, used to make sure
     /// we haven't changed filters when returning the cached `head_revs`.
-    head_revs: RwLock<(Vec<Revision>, HashSet<Revision>)>,
+    pub(super) head_revs: RwLock<(Vec<Revision>, HashSet<Revision>)>,
 }
 
 impl Debug for Index {
@@ -1397,6 +1397,14 @@
             offset
         }
     }
+
+    pub(crate) fn make_null_entry(&self) -> IndexEntry<'_> {
+        IndexEntry {
+            bytes: b"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0 \
+            \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff \
+            \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
+        }
+    }
 }
 
 /// The kind of functionality needed by find_gca_candidates
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/inner_revlog.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -0,0 +1,1353 @@
+//! A layer of lower-level revlog functionality to encapsulate most of the
+//! IO work and expensive operations.
+use std::{
+    borrow::Cow,
+    cell::RefCell,
+    io::{ErrorKind, Seek, SeekFrom, Write},
+    ops::Deref,
+    path::PathBuf,
+    sync::{Arc, Mutex},
+};
+
+use schnellru::{ByMemoryUsage, LruMap};
+use sha1::{Digest, Sha1};
+
+use crate::{
+    errors::{HgError, IoResultExt},
+    exit_codes,
+    transaction::Transaction,
+    vfs::Vfs,
+};
+
+use super::{
+    compression::{
+        uncompressed_zstd_data, CompressionConfig, Compressor, NoneCompressor,
+        ZlibCompressor, ZstdCompressor, ZLIB_BYTE, ZSTD_BYTE,
+    },
+    file_io::{DelayedBuffer, FileHandle, RandomAccessFile, WriteHandles},
+    index::{Index, IndexHeader, INDEX_ENTRY_SIZE},
+    node::{NODE_BYTES_LENGTH, NULL_NODE},
+    options::{RevlogDataConfig, RevlogDeltaConfig, RevlogFeatureConfig},
+    BaseRevision, Node, Revision, RevlogEntry, RevlogError, RevlogIndex,
+    UncheckedRevision, NULL_REVISION, NULL_REVLOG_ENTRY_FLAGS,
+};
+
+/// Matches the `_InnerRevlog` class in the Python code, as an arbitrary
+/// boundary to incrementally rewrite higher-level revlog functionality in
+/// Rust.
+pub struct InnerRevlog {
+    /// When index and data are not interleaved: bytes of the revlog index.
+    /// When index and data are interleaved (inline revlog): bytes of the
+    /// revlog index and data.
+    pub index: Index,
+    /// The store vfs that is used to interact with the filesystem
+    vfs: Box<dyn Vfs>,
+    /// The index file path, relative to the vfs root
+    pub index_file: PathBuf,
+    /// The data file path, relative to the vfs root (same as `index_file`
+    /// if inline)
+    data_file: PathBuf,
+    /// Data config that applies to this revlog
+    data_config: RevlogDataConfig,
+    /// Delta config that applies to this revlog
+    delta_config: RevlogDeltaConfig,
+    /// Feature config that applies to this revlog
+    feature_config: RevlogFeatureConfig,
+    /// A view into this revlog's data file
+    segment_file: RandomAccessFile,
+    /// A cache of uncompressed chunks that have previously been restored.
+    /// Its eviction policy is defined in [`Self::new`].
+    uncompressed_chunk_cache: Option<UncompressedChunkCache>,
+    /// Used to keep track of the actual target during diverted writes
+    /// for the changelog
+    original_index_file: Option<PathBuf>,
+    /// Write handles to the index and data files
+    /// XXX why duplicate from `index` and `segment_file`?
+    writing_handles: Option<WriteHandles>,
+    /// See [`DelayedBuffer`].
+    delayed_buffer: Option<Arc<Mutex<DelayedBuffer>>>,
+    /// Whether this revlog is inline. XXX why duplicate from `index`?
+    pub inline: bool,
+    /// A cache of the last revision, which is usually accessed multiple
+    /// times.
+    pub last_revision_cache: Mutex<Option<SingleRevisionCache>>,
+}
+
+impl InnerRevlog {
+    pub fn new(
+        vfs: Box<dyn Vfs>,
+        index: Index,
+        index_file: PathBuf,
+        data_file: PathBuf,
+        data_config: RevlogDataConfig,
+        delta_config: RevlogDeltaConfig,
+        feature_config: RevlogFeatureConfig,
+    ) -> Self {
+        assert!(index_file.is_relative());
+        assert!(data_file.is_relative());
+        let segment_file = RandomAccessFile::new(
+            dyn_clone::clone_box(&*vfs),
+            if index.is_inline() {
+                index_file.to_owned()
+            } else {
+                data_file.to_owned()
+            },
+        );
+
+        let uncompressed_chunk_cache =
+            data_config.uncompressed_cache_factor.map(
+                // Arbitrary initial value
+                // TODO check if using a hasher specific to integers is useful
+                |_factor| RefCell::new(LruMap::with_memory_budget(65536)),
+            );
+
+        let inline = index.is_inline();
+        Self {
+            index,
+            vfs,
+            index_file,
+            data_file,
+            data_config,
+            delta_config,
+            feature_config,
+            segment_file,
+            uncompressed_chunk_cache,
+            original_index_file: None,
+            writing_handles: None,
+            delayed_buffer: None,
+            inline,
+            last_revision_cache: Mutex::new(None),
+        }
+    }
+
+    /// Return number of entries of the revlog index
+    pub fn len(&self) -> usize {
+        self.index.len()
+    }
+
+    /// Return `true` if this revlog has no entries
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Return whether this revlog is inline (mixed index and data)
+    pub fn is_inline(&self) -> bool {
+        self.inline
+    }
+
+    /// Clear all caches from this revlog
+    pub fn clear_cache(&mut self) {
+        assert!(!self.is_delaying());
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            // We don't clear the allocation here because it's probably faster.
+            // We could change our minds later if this ends up being a problem
+            // with regards to memory consumption.
+            cache.borrow_mut().clear();
+        }
+    }
+
+    /// Return an entry for the null revision
+    pub fn make_null_entry(&self) -> RevlogEntry {
+        RevlogEntry {
+            revlog: self,
+            rev: NULL_REVISION,
+            uncompressed_len: 0,
+            p1: NULL_REVISION,
+            p2: NULL_REVISION,
+            flags: NULL_REVLOG_ENTRY_FLAGS,
+            hash: NULL_NODE,
+        }
+    }
+
+    /// Return the [`RevlogEntry`] for a [`Revision`] that is known to exist
+    pub fn get_entry_for_checked_rev(
+        &self,
+        rev: Revision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        if rev == NULL_REVISION {
+            return Ok(self.make_null_entry());
+        }
+        let index_entry = self
+            .index
+            .get_entry(rev)
+            .ok_or_else(|| RevlogError::InvalidRevision(rev.to_string()))?;
+        let p1 =
+            self.index.check_revision(index_entry.p1()).ok_or_else(|| {
+                RevlogError::corrupted(format!(
+                    "p1 for rev {} is invalid",
+                    rev
+                ))
+            })?;
+        let p2 =
+            self.index.check_revision(index_entry.p2()).ok_or_else(|| {
+                RevlogError::corrupted(format!(
+                    "p2 for rev {} is invalid",
+                    rev
+                ))
+            })?;
+        let entry = RevlogEntry {
+            revlog: self,
+            rev,
+            uncompressed_len: index_entry.uncompressed_len(),
+            p1,
+            p2,
+            flags: index_entry.flags(),
+            hash: *index_entry.hash(),
+        };
+        Ok(entry)
+    }
+
+    /// Return the [`RevlogEntry`] for `rev`. If `rev` fails to check, this
+    /// returns a [`RevlogError`].
+    /// TODO normalize naming across the index and all revlogs
+    /// (changelog, etc.) so that `get_entry` is always on an unchecked rev and
+    /// `get_entry_for_checked_rev` is for checked rev
+    pub fn get_entry(
+        &self,
+        rev: UncheckedRevision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        if rev == NULL_REVISION.into() {
+            return Ok(self.make_null_entry());
+        }
+        let rev = self.index.check_revision(rev).ok_or_else(|| {
+            RevlogError::corrupted(format!("rev {} is invalid", rev))
+        })?;
+        self.get_entry_for_checked_rev(rev)
+    }
+
+    /// Is the revlog currently delaying the visibility of written data?
+    ///
+    /// The delaying mechanism can be either in-memory or written on disk in a
+    /// side-file.
+    pub fn is_delaying(&self) -> bool {
+        self.delayed_buffer.is_some() || self.original_index_file.is_some()
+    }
+
+    /// The offset of the data chunk for this revision
+    #[inline(always)]
+    pub fn start(&self, rev: Revision) -> usize {
+        self.index.start(
+            rev,
+            &self
+                .index
+                .get_entry(rev)
+                .unwrap_or_else(|| self.index.make_null_entry()),
+        )
+    }
+
+    /// The length of the data chunk for this revision
+    /// TODO rename this method and others to more explicit names than the
+    /// existing ones that were copied over from Python
+    #[inline(always)]
+    pub fn length(&self, rev: Revision) -> usize {
+        self.index
+            .get_entry(rev)
+            .unwrap_or_else(|| self.index.make_null_entry())
+            .compressed_len() as usize
+    }
+
+    /// The end of the data chunk for this revision
+    #[inline(always)]
+    pub fn end(&self, rev: Revision) -> usize {
+        self.start(rev) + self.length(rev)
+    }
+
+    /// Return the delta parent of the given revision
+    pub fn delta_parent(&self, rev: Revision) -> Revision {
+        let base = self
+            .index
+            .get_entry(rev)
+            .unwrap()
+            .base_revision_or_base_of_delta_chain();
+        if base.0 == rev.0 {
+            NULL_REVISION
+        } else if self.delta_config.general_delta {
+            Revision(base.0)
+        } else {
+            Revision(rev.0 - 1)
+        }
+    }
+
+    /// Return whether `rev` points to a snapshot revision (i.e. does not have
+    /// a delta base).
+    pub fn is_snapshot(&self, rev: Revision) -> Result<bool, RevlogError> {
+        if !self.delta_config.sparse_revlog {
+            return Ok(self.delta_parent(rev) == NULL_REVISION);
+        }
+        self.index.is_snapshot_unchecked(rev)
+    }
+
+    /// Return the delta chain for `rev` according to this revlog's config.
+    /// See [`Index::delta_chain`] for more information.
+    pub fn delta_chain(
+        &self,
+        rev: Revision,
+        stop_rev: Option<Revision>,
+    ) -> Result<(Vec<Revision>, bool), HgError> {
+        self.index.delta_chain(
+            rev,
+            stop_rev,
+            self.delta_config.general_delta.into(),
+        )
+    }
+
+    fn compressor(&self) -> Result<Box<dyn Compressor>, HgError> {
+        // TODO cache the compressor?
+        Ok(match self.feature_config.compression_engine {
+            CompressionConfig::Zlib { level } => {
+                Box::new(ZlibCompressor::new(level))
+            }
+            CompressionConfig::Zstd { level, threads } => {
+                Box::new(ZstdCompressor::new(level, threads))
+            }
+            CompressionConfig::None => Box::new(NoneCompressor),
+        })
+    }
+
+    /// Generate a possibly-compressed representation of data.
+    /// Returns `None` if the data was not compressed.
+    pub fn compress<'data>(
+        &self,
+        data: &'data [u8],
+    ) -> Result<Option<Cow<'data, [u8]>>, RevlogError> {
+        if data.is_empty() {
+            return Ok(Some(data.into()));
+        }
+        let res = self.compressor()?.compress(data)?;
+        if let Some(compressed) = res {
+            // The revlog compressor added the header in the returned data.
+            return Ok(Some(compressed.into()));
+        }
+
+        if data[0] == b'\0' {
+            return Ok(Some(data.into()));
+        }
+        Ok(None)
+    }
+
+    /// Decompress a revlog chunk.
+    ///
+    /// The chunk is expected to begin with a header identifying the
+    /// format type so it can be routed to an appropriate decompressor.
+    pub fn decompress<'a>(
+        &'a self,
+        data: &'a [u8],
+    ) -> Result<Cow<[u8]>, RevlogError> {
+        if data.is_empty() {
+            return Ok(data.into());
+        }
+
+        // Revlogs are read much more frequently than they are written and many
+        // chunks only take microseconds to decompress, so performance is
+        // important here.
+
+        let header = data[0];
+        match header {
+            // Settings don't matter as they only affect compression
+            ZLIB_BYTE => Ok(ZlibCompressor::new(0).decompress(data)?.into()),
+            // Settings don't matter as they only affect compression
+            ZSTD_BYTE => {
+                Ok(ZstdCompressor::new(0, 0).decompress(data)?.into())
+            }
+            b'\0' => Ok(data.into()),
+            b'u' => Ok((&data[1..]).into()),
+            other => Err(HgError::UnsupportedFeature(format!(
+                "unknown compression header '{}'",
+                other
+            ))
+            .into()),
+        }
+    }
+
+    /// Obtain a segment of raw data corresponding to a range of revisions.
+    ///
+    /// Requests for data may be satisfied by a cache.
+    ///
+    /// Returns a 2-tuple of (offset, data) for the requested range of
+    /// revisions. Offset is the integer offset from the beginning of the
+    /// revlog and data is a slice of the raw byte data.
+    ///
+    /// Callers will need to call `self.start(rev)` and `self.length(rev)`
+    /// to determine where each revision's data begins and ends.
+    pub fn get_segment_for_revs(
+        &self,
+        start_rev: Revision,
+        end_rev: Revision,
+    ) -> Result<(usize, Vec<u8>), HgError> {
+        let start = if start_rev == NULL_REVISION {
+            0
+        } else {
+            let start_entry = self
+                .index
+                .get_entry(start_rev)
+                .expect("null revision segment");
+            self.index.start(start_rev, &start_entry)
+        };
+        let end_entry = self
+            .index
+            .get_entry(end_rev)
+            .expect("null revision segment");
+        let end = self.index.start(end_rev, &end_entry) + self.length(end_rev);
+
+        let length = end - start;
+
+        // XXX should we use mmap instead of doing this for platforms that
+        // support madvise/populate?
+        Ok((start, self.segment_file.read_chunk(start, length)?))
+    }
+
+    /// Return the uncompressed raw data for `rev`
+    pub fn chunk_for_rev(&self, rev: Revision) -> Result<Arc<[u8]>, HgError> {
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            if let Some(chunk) = cache.borrow_mut().get(&rev) {
+                return Ok(chunk.clone());
+            }
+        }
+        // TODO revlogv2 should check the compression mode
+        let data = self.get_segment_for_revs(rev, rev)?.1;
+        let uncompressed = self.decompress(&data).map_err(|e| {
+            HgError::abort(
+                format!("revlog decompression error: {}", e),
+                exit_codes::ABORT,
+                None,
+            )
+        })?;
+        let uncompressed: Arc<[u8]> = Arc::from(uncompressed.into_owned());
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            cache.borrow_mut().insert(rev, uncompressed.clone());
+        }
+        Ok(uncompressed)
+    }
+
+    /// Execute `func` within a read context for the data file, meaning that
+    /// the read handle will be taken and discarded after the operation.
+    pub fn with_read<R>(
+        &self,
+        func: impl FnOnce() -> Result<R, RevlogError>,
+    ) -> Result<R, RevlogError> {
+        self.enter_reading_context()?;
+        let res = func();
+        self.exit_reading_context();
+        res.map_err(Into::into)
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn enter_reading_context(&self) -> Result<(), HgError> {
+        if self.is_empty() {
+            // Nothing to be read
+            return Ok(());
+        }
+        if self.delayed_buffer.is_some() && self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        self.segment_file.get_read_handle()?;
+        Ok(())
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn exit_reading_context(&self) {
+        self.segment_file.exit_reading_context()
+    }
+
+    /// Fill the buffer returned by `get_buffer` with the possibly un-validated
+    /// raw text for a revision. It can be already validated if it comes
+    /// from the cache.
+    pub fn raw_text<G, T>(
+        &self,
+        rev: Revision,
+        get_buffer: G,
+    ) -> Result<(), RevlogError>
+    where
+        G: FnOnce(
+            usize,
+            &mut dyn FnMut(
+                &mut dyn RevisionBuffer<Target = T>,
+            ) -> Result<(), RevlogError>,
+        ) -> Result<(), RevlogError>,
+    {
+        let entry = &self.get_entry_for_checked_rev(rev)?;
+        let raw_size = entry.uncompressed_len();
+        let mut mutex_guard = self
+            .last_revision_cache
+            .lock()
+            .expect("lock should not be held");
+        let cached_rev = if let Some((_node, rev, data)) = &*mutex_guard {
+            Some((*rev, data.deref().as_ref()))
+        } else {
+            None
+        };
+        if let Some(cache) = &self.uncompressed_chunk_cache {
+            let cache = &mut cache.borrow_mut();
+            if let Some(size) = raw_size {
+                // Dynamically update the uncompressed_chunk_cache size to the
+                // largest revision we've seen in this revlog.
+                // Do it *before* restoration in case the current revision
+                // is the largest.
+                let factor = self
+                    .data_config
+                    .uncompressed_cache_factor
+                    .expect("cache should not exist without factor");
+                let candidate_size = (size as f64 * factor) as usize;
+                let limiter_mut = cache.limiter_mut();
+                if candidate_size > limiter_mut.max_memory_usage() {
+                    std::mem::swap(
+                        limiter_mut,
+                        &mut ByMemoryUsage::new(candidate_size),
+                    );
+                }
+            }
+        }
+        entry.rawdata(cached_rev, get_buffer)?;
+        // drop cache to save memory, the caller is expected to update
+        // the revision cache after validating the text
+        mutex_guard.take();
+        Ok(())
+    }
+
+    /// Only `pub` for `hg-cpython`.
+    /// Obtain decompressed raw data for the specified revisions that are
+    /// assumed to be in ascending order.
+    ///
+    /// Returns a list with decompressed data for each requested revision.
+    #[doc(hidden)]
+    pub fn chunks(
+        &self,
+        revs: Vec<Revision>,
+        target_size: Option<u64>,
+    ) -> Result<Vec<Arc<[u8]>>, RevlogError> {
+        if revs.is_empty() {
+            return Ok(vec![]);
+        }
+        let mut fetched_revs = vec![];
+        let mut chunks = Vec::with_capacity(revs.len());
+
+        match self.uncompressed_chunk_cache.as_ref() {
+            Some(cache) => {
+                let mut cache = cache.borrow_mut();
+                for rev in revs.iter() {
+                    match cache.get(rev) {
+                        Some(hit) => chunks.push((*rev, hit.to_owned())),
+                        None => fetched_revs.push(*rev),
+                    }
+                }
+            }
+            None => fetched_revs = revs,
+        }
+
+        let already_cached = chunks.len();
+
+        let sliced_chunks = if fetched_revs.is_empty() {
+            vec![]
+        } else if !self.data_config.with_sparse_read || self.is_inline() {
+            vec![fetched_revs]
+        } else {
+            self.slice_chunk(&fetched_revs, target_size)?
+        };
+
+        self.with_read(|| {
+            for revs_chunk in sliced_chunks {
+                let first_rev = revs_chunk[0];
+                // Skip trailing revisions with empty diff
+                let last_rev_idx = revs_chunk
+                    .iter()
+                    .rposition(|r| self.length(*r) != 0)
+                    .unwrap_or(revs_chunk.len() - 1);
+
+                let last_rev = revs_chunk[last_rev_idx];
+
+                let (offset, data) =
+                    self.get_segment_for_revs(first_rev, last_rev)?;
+
+                let revs_chunk = &revs_chunk[..=last_rev_idx];
+
+                for rev in revs_chunk {
+                    let chunk_start = self.start(*rev);
+                    let chunk_length = self.length(*rev);
+                    // TODO revlogv2 should check the compression mode
+                    let bytes = &data[chunk_start - offset..][..chunk_length];
+                    let chunk = if !bytes.is_empty() && bytes[0] == ZSTD_BYTE {
+                        // If we're using `zstd`, we want to try a more
+                        // specialized decompression
+                        let entry = self.index.get_entry(*rev).unwrap();
+                        let is_delta = entry
+                            .base_revision_or_base_of_delta_chain()
+                            != (*rev).into();
+                        let uncompressed = uncompressed_zstd_data(
+                            bytes,
+                            is_delta,
+                            entry.uncompressed_len(),
+                        )?;
+                        Cow::Owned(uncompressed)
+                    } else {
+                        // Otherwise just fallback to generic decompression.
+                        self.decompress(bytes)?
+                    };
+
+                    chunks.push((*rev, chunk.into()));
+                }
+            }
+            Ok(())
+        })?;
+
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            let mut cache = cache.borrow_mut();
+            for (rev, chunk) in chunks.iter().skip(already_cached) {
+                cache.insert(*rev, chunk.clone());
+            }
+        }
+        // Use stable sort here since it's *mostly* sorted
+        chunks.sort_by(|a, b| a.0.cmp(&b.0));
+        Ok(chunks.into_iter().map(|(_r, chunk)| chunk).collect())
+    }
+
+    /// Slice revs to reduce the amount of unrelated data to be read from disk.
+    ///
+    /// ``revs`` is sliced into groups that should be read in one time.
+    /// Assume that revs are sorted.
+    ///
+    /// The initial chunk is sliced until the overall density
+    /// (payload/chunks-span ratio) is above
+    /// `revlog.data_config.sr_density_threshold`.
+    /// No gap smaller than `revlog.data_config.sr_min_gap_size` is skipped.
+    ///
+    /// If `target_size` is set, no chunk larger than `target_size`
+    /// will be returned.
+    /// For consistency with other slicing choices, this limit won't go lower
+    /// than `revlog.data_config.sr_min_gap_size`.
+    ///
+    /// If individual revision chunks are larger than this limit, they will
+    /// still be raised individually.
+    pub fn slice_chunk(
+        &self,
+        revs: &[Revision],
+        target_size: Option<u64>,
+    ) -> Result<Vec<Vec<Revision>>, RevlogError> {
+        let target_size =
+            target_size.map(|size| size.max(self.data_config.sr_min_gap_size));
+
+        let target_density = self.data_config.sr_density_threshold;
+        let min_gap_size = self.data_config.sr_min_gap_size as usize;
+        let to_density = self.index.slice_chunk_to_density(
+            revs,
+            target_density,
+            min_gap_size,
+        );
+
+        let mut sliced = vec![];
+
+        for chunk in to_density {
+            sliced.extend(
+                self.slice_chunk_to_size(&chunk, target_size)?
+                    .into_iter()
+                    .map(ToOwned::to_owned),
+            );
+        }
+
+        Ok(sliced)
+    }
+
+    /// Slice revs to match the target size
+    ///
+    /// This is intended to be used on chunks that density slicing selected,
+    /// but that are still too large compared to the read guarantee of revlogs.
+    /// This might happen when the "minimal gap size" interrupted the slicing
+    /// or when chains are built in a way that create large blocks next to
+    /// each other.
+    fn slice_chunk_to_size<'a>(
+        &self,
+        revs: &'a [Revision],
+        target_size: Option<u64>,
+    ) -> Result<Vec<&'a [Revision]>, RevlogError> {
+        let mut start_data = self.start(revs[0]);
+        let end_data = self.end(revs[revs.len() - 1]);
+        let full_span = end_data - start_data;
+
+        let nothing_to_do = target_size
+            .map(|size| full_span <= size as usize)
+            .unwrap_or(true);
+
+        if nothing_to_do {
+            return Ok(vec![revs]);
+        }
+        let target_size = target_size.expect("target_size is set") as usize;
+
+        let mut start_rev_idx = 0;
+        let mut end_rev_idx = 1;
+        let mut chunks = vec![];
+
+        for (idx, rev) in revs.iter().enumerate().skip(1) {
+            let span = self.end(*rev) - start_data;
+            let is_snapshot = self.is_snapshot(*rev)?;
+            if span <= target_size && is_snapshot {
+                end_rev_idx = idx + 1;
+            } else {
+                let chunk =
+                    self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
+                if !chunk.is_empty() {
+                    chunks.push(chunk);
+                }
+                start_rev_idx = idx;
+                start_data = self.start(*rev);
+                end_rev_idx = idx + 1;
+            }
+            if !is_snapshot {
+                break;
+            }
+        }
+
+        // For the others, we use binary slicing to quickly converge towards
+        // valid chunks (otherwise, we might end up looking for the start/end
+        // of many revisions). This logic is not looking for the perfect
+        // slicing point, it quickly converges towards valid chunks.
+        let number_of_items = revs.len();
+
+        while (end_data - start_data) > target_size {
+            end_rev_idx = number_of_items;
+            if number_of_items - start_rev_idx <= 1 {
+                // Protect against individual chunks larger than the limit
+                break;
+            }
+            let mut local_end_data = self.end(revs[end_rev_idx - 1]);
+            let mut span = local_end_data - start_data;
+            while span > target_size {
+                if end_rev_idx - start_rev_idx <= 1 {
+                    // Protect against individual chunks larger than the limit
+                    break;
+                }
+                end_rev_idx -= (end_rev_idx - start_rev_idx) / 2;
+                local_end_data = self.end(revs[end_rev_idx - 1]);
+                span = local_end_data - start_data;
+            }
+            let chunk =
+                self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
+            if !chunk.is_empty() {
+                chunks.push(chunk);
+            }
+            start_rev_idx = end_rev_idx;
+            start_data = self.start(revs[start_rev_idx]);
+        }
+
+        let chunk = self.trim_chunk(revs, start_rev_idx, None);
+        if !chunk.is_empty() {
+            chunks.push(chunk);
+        }
+
+        Ok(chunks)
+    }
+
+    /// Returns `revs[startidx..endidx]` without empty trailing revs
+    fn trim_chunk<'a>(
+        &self,
+        revs: &'a [Revision],
+        start_rev_idx: usize,
+        end_rev_idx: Option<usize>,
+    ) -> &'a [Revision] {
+        let mut end_rev_idx = end_rev_idx.unwrap_or(revs.len());
+
+        // If we have a non-empty delta candidate, there is nothing to trim
+        if revs[end_rev_idx - 1].0 < self.len() as BaseRevision {
+            // Trim empty revs at the end, except the very first rev of a chain
+            while end_rev_idx > 1
+                && end_rev_idx > start_rev_idx
+                && self.length(revs[end_rev_idx - 1]) == 0
+            {
+                end_rev_idx -= 1
+            }
+        }
+
+        &revs[start_rev_idx..end_rev_idx]
+    }
+
+    /// Check the hash of some given data against the recorded hash.
+    pub fn check_hash(
+        &self,
+        p1: Revision,
+        p2: Revision,
+        expected: &[u8],
+        data: &[u8],
+    ) -> bool {
+        let e1 = self.index.get_entry(p1);
+        let h1 = match e1 {
+            Some(ref entry) => entry.hash(),
+            None => &NULL_NODE,
+        };
+        let e2 = self.index.get_entry(p2);
+        let h2 = match e2 {
+            Some(ref entry) => entry.hash(),
+            None => &NULL_NODE,
+        };
+
+        hash(data, h1.as_bytes(), h2.as_bytes()) == expected
+    }
+
+    /// Returns whether we are currently in a [`Self::with_write`] context
+    pub fn is_writing(&self) -> bool {
+        self.writing_handles.is_some()
+    }
+
+    /// Open the revlog files for writing
+    ///
+    /// Adding content to a revlog should be done within this context.
+    /// TODO try using `BufRead` and `BufWrite` and see if performance improves
+    pub fn with_write<R>(
+        &mut self,
+        transaction: &mut impl Transaction,
+        data_end: Option<usize>,
+        func: impl FnOnce() -> R,
+    ) -> Result<R, HgError> {
+        if self.is_writing() {
+            return Ok(func());
+        }
+        self.enter_writing_context(data_end, transaction)
+            .map_err(|e| {
+                self.exit_writing_context();
+                e
+            })?;
+        let res = func();
+        self.exit_writing_context();
+        Ok(res)
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn exit_writing_context(&mut self) {
+        self.writing_handles.take();
+        self.segment_file.writing_handle.take();
+        self.segment_file.reading_handle.take();
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn python_writing_handles(&self) -> Option<&WriteHandles> {
+        self.writing_handles.as_ref()
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn enter_writing_context(
+        &mut self,
+        data_end: Option<usize>,
+        transaction: &mut impl Transaction,
+    ) -> Result<(), HgError> {
+        let data_size = if self.is_empty() {
+            0
+        } else {
+            self.end(Revision((self.len() - 1) as BaseRevision))
+        };
+        let data_handle = if !self.is_inline() {
+            let data_handle = match self.vfs.open(&self.data_file) {
+                Ok(mut f) => {
+                    if let Some(end) = data_end {
+                        f.seek(SeekFrom::Start(end as u64))
+                            .when_reading_file(&self.data_file)?;
+                    } else {
+                        f.seek(SeekFrom::End(0))
+                            .when_reading_file(&self.data_file)?;
+                    }
+                    f
+                }
+                Err(e) => match e {
+                    HgError::IoError { error, context } => {
+                        if error.kind() != ErrorKind::NotFound {
+                            return Err(HgError::IoError { error, context });
+                        }
+                        self.vfs.create(&self.data_file)?
+                    }
+                    e => return Err(e),
+                },
+            };
+            transaction.add(&self.data_file, data_size);
+            Some(FileHandle::from_file(
+                data_handle,
+                dyn_clone::clone_box(&*self.vfs),
+                &self.data_file,
+            ))
+        } else {
+            None
+        };
+        let index_size = self.len() * INDEX_ENTRY_SIZE;
+        let index_handle = self.index_write_handle()?;
+        if self.is_inline() {
+            transaction.add(&self.index_file, data_size);
+        } else {
+            transaction.add(&self.index_file, index_size);
+        }
+        self.writing_handles = Some(WriteHandles {
+            index_handle: index_handle.clone(),
+            data_handle: data_handle.clone(),
+        });
+        *self.segment_file.reading_handle.borrow_mut() = if self.is_inline() {
+            Some(index_handle)
+        } else {
+            data_handle
+        };
+        Ok(())
+    }
+
+    /// Get a write handle to the index, sought to the end of its data.
+    fn index_write_handle(&self) -> Result<FileHandle, HgError> {
+        let res = if self.delayed_buffer.is_none() {
+            if self.data_config.check_ambig {
+                self.vfs.open_check_ambig(&self.index_file)
+            } else {
+                self.vfs.open(&self.index_file)
+            }
+        } else {
+            self.vfs.open(&self.index_file)
+        };
+        match res {
+            Ok(mut handle) => {
+                handle
+                    .seek(SeekFrom::End(0))
+                    .when_reading_file(&self.index_file)?;
+                Ok(
+                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
+                    {
+                        FileHandle::from_file_delayed(
+                            handle,
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            delayed_buffer.clone(),
+                        )?
+                    } else {
+                        FileHandle::from_file(
+                            handle,
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                        )
+                    },
+                )
+            }
+            Err(e) => match e {
+                HgError::IoError { error, context } => {
+                    if error.kind() != ErrorKind::NotFound {
+                        return Err(HgError::IoError { error, context });
+                    };
+                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
+                    {
+                        FileHandle::new_delayed(
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            true,
+                            delayed_buffer.clone(),
+                        )
+                    } else {
+                        FileHandle::new(
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            true,
+                            true,
+                        )
+                    }
+                }
+                e => Err(e),
+            },
+        }
+    }
+
+    /// Split the data of an inline revlog into an index and a data file
+    pub fn split_inline(
+        &mut self,
+        header: IndexHeader,
+        new_index_file_path: Option<PathBuf>,
+    ) -> Result<PathBuf, RevlogError> {
+        assert!(self.delayed_buffer.is_none());
+        let existing_handles = self.writing_handles.is_some();
+        if let Some(handles) = &mut self.writing_handles {
+            handles.index_handle.flush()?;
+            self.writing_handles.take();
+            self.segment_file.writing_handle.take();
+        }
+        let mut new_data_file_handle = self.vfs.create(&self.data_file)?;
+        // Drop any potential data, possibly redundant with the VFS impl.
+        new_data_file_handle
+            .set_len(0)
+            .when_writing_file(&self.data_file)?;
+
+        self.with_read(|| -> Result<(), RevlogError> {
+            for r in 0..self.index.len() {
+                let rev = Revision(r as BaseRevision);
+                let rev_segment = self.get_segment_for_revs(rev, rev)?.1;
+                new_data_file_handle
+                    .write_all(&rev_segment)
+                    .when_writing_file(&self.data_file)?;
+            }
+            new_data_file_handle
+                .flush()
+                .when_writing_file(&self.data_file)?;
+            Ok(())
+        })?;
+
+        if let Some(index_path) = new_index_file_path {
+            self.index_file = index_path
+        }
+
+        let mut new_index_handle = self.vfs.create(&self.index_file)?;
+        let mut new_data = Vec::with_capacity(self.len() * INDEX_ENTRY_SIZE);
+        for r in 0..self.len() {
+            let rev = Revision(r as BaseRevision);
+            let entry = self.index.entry_binary(rev).unwrap_or_else(|| {
+                panic!(
+                    "entry {} should exist in {}",
+                    r,
+                    self.index_file.display()
+                )
+            });
+            if r == 0 {
+                new_data.extend(header.header_bytes);
+            }
+            new_data.extend(entry);
+        }
+        new_index_handle
+            .write_all(&new_data)
+            .when_writing_file(&self.index_file)?;
+        // Replace the index with a new one because the buffer contains inline
+        // data
+        self.index = Index::new(Box::new(new_data), header)?;
+        self.inline = false;
+
+        self.segment_file = RandomAccessFile::new(
+            dyn_clone::clone_box(&*self.vfs),
+            self.data_file.to_owned(),
+        );
+        if existing_handles {
+            // Switched from inline to conventional, reopen the index
+            let new_data_handle = Some(FileHandle::from_file(
+                new_data_file_handle,
+                dyn_clone::clone_box(&*self.vfs),
+                &self.data_file,
+            ));
+            self.writing_handles = Some(WriteHandles {
+                index_handle: self.index_write_handle()?,
+                data_handle: new_data_handle.clone(),
+            });
+            *self.segment_file.writing_handle.borrow_mut() = new_data_handle;
+        }
+
+        Ok(self.index_file.to_owned())
+    }
+
+    /// Write a new entry to this revlog.
+    /// - `entry` is the index bytes
+    /// - `header_and_data` is the compression header and the revision data
+    /// - `offset` is the position in the data file to write to
+    /// - `index_end` is the overwritten position in the index in revlog-v2,
+    ///   since the format may allow a rewrite of garbage data at the end.
+    /// - `data_end` is the overwritten position in the data-file in revlog-v2,
+    ///   since the format may allow a rewrite of garbage data at the end.
+    ///
+    /// XXX Why do we have `data_end` *and* `offset`? Same question in Python
+    pub fn write_entry(
+        &mut self,
+        mut transaction: impl Transaction,
+        entry: &[u8],
+        header_and_data: (&[u8], &[u8]),
+        mut offset: usize,
+        index_end: Option<u64>,
+        data_end: Option<u64>,
+    ) -> Result<(u64, Option<u64>), HgError> {
+        let current_revision = self.len() - 1;
+        let canonical_index_file = self.canonical_index_file();
+
+        let is_inline = self.is_inline();
+        let handles = match &mut self.writing_handles {
+            None => {
+                return Err(HgError::abort(
+                    "adding revision outside of the `with_write` context",
+                    exit_codes::ABORT,
+                    None,
+                ));
+            }
+            Some(handles) => handles,
+        };
+        let index_handle = &mut handles.index_handle;
+        let data_handle = &mut handles.data_handle;
+        if let Some(end) = index_end {
+            index_handle
+                .seek(SeekFrom::Start(end))
+                .when_reading_file(&self.index_file)?;
+        } else {
+            index_handle
+                .seek(SeekFrom::End(0))
+                .when_reading_file(&self.index_file)?;
+        }
+        if let Some(data_handle) = data_handle {
+            if let Some(end) = data_end {
+                data_handle
+                    .seek(SeekFrom::Start(end))
+                    .when_reading_file(&self.data_file)?;
+            } else {
+                data_handle
+                    .seek(SeekFrom::End(0))
+                    .when_reading_file(&self.data_file)?;
+            }
+        }
+        let (header, data) = header_and_data;
+
+        if !is_inline {
+            transaction.add(&self.data_file, offset);
+            transaction
+                .add(&canonical_index_file, current_revision * entry.len());
+            let data_handle = data_handle
+                .as_mut()
+                .expect("data handle should exist when not inline");
+            if !header.is_empty() {
+                data_handle.write_all(header)?;
+            }
+            data_handle.write_all(data)?;
+            match &mut self.delayed_buffer {
+                Some(buf) => {
+                    buf.lock()
+                        .expect("propagate the panic")
+                        .buffer
+                        .write_all(entry)
+                        .expect("write to delay buffer should succeed");
+                }
+                None => index_handle.write_all(entry)?,
+            }
+        } else if self.delayed_buffer.is_some() {
+            return Err(HgError::abort(
+                "invalid delayed write on inline revlog",
+                exit_codes::ABORT,
+                None,
+            ));
+        } else {
+            offset += current_revision * entry.len();
+            transaction.add(&canonical_index_file, offset);
+            index_handle.write_all(entry)?;
+            index_handle.write_all(header)?;
+            index_handle.write_all(data)?;
+        }
+        let data_position = match data_handle {
+            Some(h) => Some(h.position()?),
+            None => None,
+        };
+        Ok((index_handle.position()?, data_position))
+    }
+
+    /// Return the real target index file and not the temporary when diverting
+    pub fn canonical_index_file(&self) -> PathBuf {
+        self.original_index_file
+            .as_ref()
+            .map(ToOwned::to_owned)
+            .unwrap_or_else(|| self.index_file.to_owned())
+    }
+
+    /// Return the path to the diverted index
+    fn diverted_index(&self) -> PathBuf {
+        self.index_file.with_extension("i.a")
+    }
+
+    /// True if we're in a [`Self::with_write`] or [`Self::with_read`] context
+    pub fn is_open(&self) -> bool {
+        self.segment_file.is_open()
+    }
+
+    /// Set this revlog to delay its writes to a buffer
+    pub fn delay(&mut self) -> Result<Option<PathBuf>, HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        if self.delayed_buffer.is_some() || self.original_index_file.is_some()
+        {
+            // Delay or divert already happening
+            return Ok(None);
+        }
+        if self.is_empty() {
+            self.original_index_file = Some(self.index_file.to_owned());
+            self.index_file = self.diverted_index();
+            if self.vfs.exists(&self.index_file) {
+                self.vfs.unlink(&self.index_file)?;
+            }
+            Ok(Some(self.index_file.to_owned()))
+        } else {
+            self.delayed_buffer =
+                Some(Arc::new(Mutex::new(DelayedBuffer::default())));
+            Ok(None)
+        }
+    }
+
+    /// Write the pending data (in memory) if any to the diverted index file
+    /// (on disk temporary file)
+    pub fn write_pending(
+        &mut self,
+    ) -> Result<(Option<PathBuf>, bool), HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        if self.original_index_file.is_some() {
+            return Ok((None, true));
+        }
+        let mut any_pending = false;
+        let pending_index_file = self.diverted_index();
+        if self.vfs.exists(&pending_index_file) {
+            self.vfs.unlink(&pending_index_file)?;
+        }
+        self.vfs.copy(&self.index_file, &pending_index_file)?;
+        if let Some(delayed_buffer) = self.delayed_buffer.take() {
+            let mut index_file_handle = self.vfs.open(&pending_index_file)?;
+            index_file_handle
+                .seek(SeekFrom::End(0))
+                .when_writing_file(&pending_index_file)?;
+            let delayed_data =
+                &delayed_buffer.lock().expect("propagate the panic").buffer;
+            index_file_handle
+                .write_all(delayed_data)
+                .when_writing_file(&pending_index_file)?;
+            any_pending = true;
+        }
+        self.original_index_file = Some(self.index_file.to_owned());
+        self.index_file = pending_index_file;
+        Ok((Some(self.index_file.to_owned()), any_pending))
+    }
+
+    /// Overwrite the canonical file with the diverted file, or write out the
+    /// delayed buffer.
+    /// Returns an error if the revlog is neither diverted nor delayed.
+    pub fn finalize_pending(&mut self) -> Result<PathBuf, HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        match (
+            self.delayed_buffer.as_ref(),
+            self.original_index_file.as_ref(),
+        ) {
+            (None, None) => {
+                return Err(HgError::abort(
+                    "neither delay nor divert found on this revlog",
+                    exit_codes::ABORT,
+                    None,
+                ));
+            }
+            (Some(delay), None) => {
+                let mut index_file_handle = self.vfs.open(&self.index_file)?;
+                index_file_handle
+                    .seek(SeekFrom::End(0))
+                    .when_writing_file(&self.index_file)?;
+                index_file_handle
+                    .write_all(
+                        &delay.lock().expect("propagate the panic").buffer,
+                    )
+                    .when_writing_file(&self.index_file)?;
+                self.delayed_buffer = None;
+            }
+            (None, Some(divert)) => {
+                if self.vfs.exists(&self.index_file) {
+                    self.vfs.rename(&self.index_file, divert, true)?;
+                }
+                divert.clone_into(&mut self.index_file);
+                self.original_index_file = None;
+            }
+            (Some(_), Some(_)) => unreachable!(
+                "{} is in an inconsistent state of both delay and divert",
+                self.canonical_index_file().display(),
+            ),
+        }
+        Ok(self.canonical_index_file())
+    }
+
+    /// `pub` only for `hg-cpython`. This is made a different method than
+    /// [`Revlog::index`] in case there is a different invariant that pops up
+    /// later.
+    #[doc(hidden)]
+    pub fn shared_index(&self) -> &Index {
+        &self.index
+    }
+}
+
+/// The use of a [`Refcell`] assumes that a given revlog will only
+/// be accessed (read or write) by a single thread.
+type UncompressedChunkCache =
+    RefCell<LruMap<Revision, Arc<[u8]>, ByMemoryUsage>>;
+
+/// The node, revision and data for the last revision we've seen. Speeds up
+/// a lot of sequential operations of the revlog.
+///
+/// The data is not just bytes since it can come from Python and we want to
+/// avoid copies if possible.
+type SingleRevisionCache =
+    (Node, Revision, Box<dyn Deref<Target = [u8]> + Send>);
+
+/// A way of progressively filling a buffer with revision data, then return
+/// that buffer. Used to abstract away Python-allocated code to reduce copying
+/// for performance reasons.
+pub trait RevisionBuffer {
+    /// The owned buffer type to return
+    type Target;
+    /// Copies the slice into the buffer
+    fn extend_from_slice(&mut self, slice: &[u8]);
+    /// Returns the now finished owned buffer
+    fn finish(self) -> Self::Target;
+}
+
+/// A simple vec-based buffer. This is uselessly complicated for the pure Rust
+/// case, but it's the price to pay for Python compatibility.
+#[derive(Debug)]
+pub(super) struct CoreRevisionBuffer {
+    buf: Vec<u8>,
+}
+
+impl CoreRevisionBuffer {
+    pub fn new() -> Self {
+        Self { buf: vec![] }
+    }
+
+    #[inline]
+    pub fn resize(&mut self, size: usize) {
+        self.buf.reserve_exact(size - self.buf.capacity());
+    }
+}
+
+impl RevisionBuffer for CoreRevisionBuffer {
+    type Target = Vec<u8>;
+
+    #[inline]
+    fn extend_from_slice(&mut self, slice: &[u8]) {
+        self.buf.extend_from_slice(slice);
+    }
+
+    #[inline]
+    fn finish(self) -> Self::Target {
+        self.buf
+    }
+}
+
+/// Calculate the hash of a revision given its data and its parents.
+pub fn hash(
+    data: &[u8],
+    p1_hash: &[u8],
+    p2_hash: &[u8],
+) -> [u8; NODE_BYTES_LENGTH] {
+    let mut hasher = Sha1::new();
+    let (a, b) = (p1_hash, p2_hash);
+    if a > b {
+        hasher.update(b);
+        hasher.update(a);
+    } else {
+        hasher.update(a);
+        hasher.update(b);
+    }
+    hasher.update(data);
+    *hasher.finalize().as_ref()
+}
--- a/rust/hg-core/src/revlog/mod.rs	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/src/revlog/mod.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -9,7 +9,10 @@
 pub mod nodemap;
 mod nodemap_docket;
 pub mod path_encode;
-use compression::uncompressed_zstd_data;
+use inner_revlog::CoreRevisionBuffer;
+use inner_revlog::InnerRevlog;
+use inner_revlog::RevisionBuffer;
+use memmap2::MmapOptions;
 pub use node::{FromHexError, Node, NodePrefix};
 use options::RevlogOpenOptions;
 pub mod changelog;
@@ -17,25 +20,25 @@
 pub mod file_io;
 pub mod filelog;
 pub mod index;
+pub mod inner_revlog;
 pub mod manifest;
 pub mod options;
 pub mod patch;
 
 use std::borrow::Cow;
+use std::io::ErrorKind;
 use std::io::Read;
 use std::ops::Deref;
 use std::path::Path;
 
-use flate2::read::ZlibDecoder;
-use sha1::{Digest, Sha1};
-
-use self::node::{NODE_BYTES_LENGTH, NULL_NODE};
+use self::node::NULL_NODE;
 use self::nodemap_docket::NodeMapDocket;
 use super::index::Index;
-use super::index::INDEX_ENTRY_SIZE;
 use super::nodemap::{NodeMap, NodeMapError};
 use crate::errors::HgError;
+use crate::errors::IoResultExt;
 use crate::exit_codes;
+use crate::vfs::Vfs;
 use crate::vfs::VfsImpl;
 
 /// As noted in revlog.c, revision numbers are actually encoded in
@@ -256,24 +259,17 @@
     }
 }
 
-/// Read only implementation of revlog.
 pub struct Revlog {
-    /// When index and data are not interleaved: bytes of the revlog index.
-    /// When index and data are interleaved: bytes of the revlog index and
-    /// data.
-    index: Index,
-    /// When index and data are not interleaved: bytes of the revlog data
-    data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>>,
+    inner: InnerRevlog,
     /// When present on disk: the persistent nodemap for this revlog
     nodemap: Option<nodemap::NodeTree>,
 }
 
 impl Graph for Revlog {
     fn parents(&self, rev: Revision) -> Result<[Revision; 2], GraphError> {
-        self.index.parents(rev)
+        self.index().parents(rev)
     }
 }
-
 impl Revlog {
     /// Open a revlog index file.
     ///
@@ -289,6 +285,10 @@
         Self::open_gen(store_vfs, index_path, data_path, options, None)
     }
 
+    fn index(&self) -> &Index {
+        &self.inner.index
+    }
+
     fn open_gen(
         // Todo use the `Vfs` trait here once we create a function for mmap
         store_vfs: &VfsImpl,
@@ -298,37 +298,10 @@
         nodemap_for_test: Option<nodemap::NodeTree>,
     ) -> Result<Self, HgError> {
         let index_path = index_path.as_ref();
-        let index = {
-            match store_vfs.mmap_open_opt(index_path)? {
-                None => Index::new(
-                    Box::<Vec<_>>::default(),
-                    options.index_header(),
-                ),
-                Some(index_mmap) => {
-                    let index = Index::new(
-                        Box::new(index_mmap),
-                        options.index_header(),
-                    )?;
-                    Ok(index)
-                }
-            }
-        }?;
+        let index = open_index(store_vfs, index_path, options)?;
 
         let default_data_path = index_path.with_extension("d");
-
-        // type annotation required
-        // won't recognize Mmap as Deref<Target = [u8]>
-        let data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>> =
-            if index.is_inline() {
-                None
-            } else if index.is_empty() {
-                // No need to even try to open the data file then.
-                Some(Box::new(&[][..]))
-            } else {
-                let data_path = data_path.unwrap_or(&default_data_path);
-                let data_mmap = store_vfs.mmap_open(data_path)?;
-                Some(Box::new(data_mmap))
-            };
+        let data_path = data_path.unwrap_or(&default_data_path);
 
         let nodemap = if index.is_inline() || !options.use_nodemap {
             None
@@ -346,20 +319,27 @@
         let nodemap = nodemap_for_test.or(nodemap);
 
         Ok(Revlog {
-            index,
-            data_bytes,
+            inner: InnerRevlog::new(
+                Box::new(store_vfs.clone()),
+                index,
+                index_path.to_path_buf(),
+                data_path.to_path_buf(),
+                options.data_config,
+                options.delta_config,
+                options.feature_config,
+            ),
             nodemap,
         })
     }
 
     /// Return number of entries of the `Revlog`.
     pub fn len(&self) -> usize {
-        self.index.len()
+        self.index().len()
     }
 
     /// Returns `true` if the `Revlog` has zero `entries`.
     pub fn is_empty(&self) -> bool {
-        self.index.is_empty()
+        self.index().is_empty()
     }
 
     /// Returns the node ID for the given revision number, if it exists in this
@@ -368,8 +348,8 @@
         if rev == NULL_REVISION.into() {
             return Some(&NULL_NODE);
         }
-        let rev = self.index.check_revision(rev)?;
-        Some(self.index.get_entry(rev)?.hash())
+        let rev = self.index().check_revision(rev)?;
+        Some(self.index().get_entry(rev)?.hash())
     }
 
     /// Return the revision number for the given node ID, if it exists in this
@@ -380,7 +360,7 @@
     ) -> Result<Revision, RevlogError> {
         if let Some(nodemap) = &self.nodemap {
             nodemap
-                .find_bin(&self.index, node)?
+                .find_bin(self.index(), node)?
                 .ok_or(RevlogError::InvalidRevision(format!("{:x}", node)))
         } else {
             self.rev_from_node_no_persistent_nodemap(node)
@@ -406,7 +386,7 @@
                 NULL_NODE
             } else {
                 let index_entry =
-                    self.index.get_entry(rev).ok_or_else(|| {
+                    self.index().get_entry(rev).ok_or_else(|| {
                         HgError::corrupted(
                             "revlog references a revision not in the index",
                         )
@@ -429,7 +409,21 @@
 
     /// Returns whether the given revision exists in this revlog.
     pub fn has_rev(&self, rev: UncheckedRevision) -> bool {
-        self.index.check_revision(rev).is_some()
+        self.index().check_revision(rev).is_some()
+    }
+
+    pub fn get_entry_for_checked_rev(
+        &self,
+        rev: Revision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        self.inner.get_entry_for_checked_rev(rev)
+    }
+
+    pub fn get_entry(
+        &self,
+        rev: UncheckedRevision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        self.inner.get_entry(rev)
     }
 
     /// Return the full data associated to a revision.
@@ -466,153 +460,93 @@
         expected: &[u8],
         data: &[u8],
     ) -> bool {
-        let e1 = self.index.get_entry(p1);
-        let h1 = match e1 {
-            Some(ref entry) => entry.hash(),
-            None => &NULL_NODE,
-        };
-        let e2 = self.index.get_entry(p2);
-        let h2 = match e2 {
-            Some(ref entry) => entry.hash(),
-            None => &NULL_NODE,
-        };
-
-        hash(data, h1.as_bytes(), h2.as_bytes()) == expected
+        self.inner.check_hash(p1, p2, expected, data)
     }
 
     /// Build the full data of a revision out its snapshot
     /// and its deltas.
-    fn build_data_from_deltas(
-        snapshot: RevlogEntry,
-        deltas: &[RevlogEntry],
-    ) -> Result<Vec<u8>, HgError> {
-        let snapshot = snapshot.data_chunk()?;
-        let deltas = deltas
-            .iter()
-            .rev()
-            .map(RevlogEntry::data_chunk)
-            .collect::<Result<Vec<_>, _>>()?;
-        let patches: Vec<_> =
-            deltas.iter().map(|d| patch::PatchList::new(d)).collect();
-        let patch = patch::fold_patch_lists(&patches);
-        Ok(patch.apply(&snapshot))
-    }
-
-    /// Return the revlog data.
-    fn data(&self) -> &[u8] {
-        match &self.data_bytes {
-            Some(data_bytes) => data_bytes,
-            None => panic!(
-                "forgot to load the data or trying to access inline data"
-            ),
-        }
-    }
-
-    pub fn make_null_entry(&self) -> RevlogEntry {
-        RevlogEntry {
-            revlog: self,
-            rev: NULL_REVISION,
-            bytes: b"",
-            compressed_len: 0,
-            uncompressed_len: 0,
-            base_rev_or_base_of_delta_chain: None,
-            p1: NULL_REVISION,
-            p2: NULL_REVISION,
-            flags: NULL_REVLOG_ENTRY_FLAGS,
-            hash: NULL_NODE,
-        }
-    }
-
-    fn get_entry_for_checked_rev(
-        &self,
-        rev: Revision,
-    ) -> Result<RevlogEntry, RevlogError> {
-        if rev == NULL_REVISION {
-            return Ok(self.make_null_entry());
+    fn build_data_from_deltas<T>(
+        buffer: &mut dyn RevisionBuffer<Target = T>,
+        snapshot: &[u8],
+        deltas: &[impl AsRef<[u8]>],
+    ) -> Result<(), RevlogError> {
+        if deltas.is_empty() {
+            buffer.extend_from_slice(snapshot);
+            return Ok(());
         }
-        let index_entry = self
-            .index
-            .get_entry(rev)
-            .ok_or(RevlogError::InvalidRevision(rev.to_string()))?;
-        let offset = index_entry.offset();
-        let start = if self.index.is_inline() {
-            offset + ((rev.0 as usize + 1) * INDEX_ENTRY_SIZE)
-        } else {
-            offset
-        };
-        let end = start + index_entry.compressed_len() as usize;
-        let data = if self.index.is_inline() {
-            self.index.data(start, end)
-        } else {
-            &self.data()[start..end]
-        };
-        let base_rev = self
-            .index
-            .check_revision(index_entry.base_revision_or_base_of_delta_chain())
-            .ok_or_else(|| {
-                RevlogError::corrupted(format!(
-                    "base revision for rev {} is invalid",
-                    rev
-                ))
-            })?;
-        let p1 =
-            self.index.check_revision(index_entry.p1()).ok_or_else(|| {
-                RevlogError::corrupted(format!(
-                    "p1 for rev {} is invalid",
-                    rev
-                ))
-            })?;
-        let p2 =
-            self.index.check_revision(index_entry.p2()).ok_or_else(|| {
-                RevlogError::corrupted(format!(
-                    "p2 for rev {} is invalid",
-                    rev
-                ))
-            })?;
-        let entry = RevlogEntry {
-            revlog: self,
-            rev,
-            bytes: data,
-            compressed_len: index_entry.compressed_len(),
-            uncompressed_len: index_entry.uncompressed_len(),
-            base_rev_or_base_of_delta_chain: if base_rev == rev {
+        let patches: Result<Vec<_>, _> = deltas
+            .iter()
+            .map(|d| patch::PatchList::new(d.as_ref()))
+            .collect();
+        let patch = patch::fold_patch_lists(&patches?);
+        patch.apply(buffer, snapshot);
+        Ok(())
+    }
+}
+
+type IndexData = Box<dyn Deref<Target = [u8]> + Send + Sync>;
+
+/// Open the revlog [`Index`] at `index_path`, through the `store_vfs` and the
+/// given `options`. This controls whether (and how) we `mmap` the index file,
+/// and returns an empty buffer if the index does not exist on disk.
+/// This is only used when doing pure-Rust work, in Python contexts this is
+/// unused at the time of writing.
+pub fn open_index(
+    store_vfs: &impl Vfs,
+    index_path: &Path,
+    options: RevlogOpenOptions,
+) -> Result<Index, HgError> {
+    let buf: IndexData = match store_vfs.open_read(index_path) {
+        Ok(mut file) => {
+            let mut buf = if let Some(threshold) =
+                options.data_config.mmap_index_threshold
+            {
+                let size = store_vfs.file_size(&file)?;
+                if size >= threshold {
+                    // Safety is "enforced" by locks and assuming other
+                    // processes are well-behaved. If any misbehaving or
+                    // malicious process does touch the index, it could lead
+                    // to corruption. This is somewhat inherent to file-based
+                    // `mmap`, though some platforms have some ways of
+                    // mitigating.
+                    // TODO linux: set the immutable flag with `chattr(1)`?
+                    let mmap = unsafe { MmapOptions::new().map(&file) }
+                        .when_reading_file(index_path)?;
+                    Some(Box::new(mmap) as IndexData)
+                } else {
+                    None
+                }
+            } else {
                 None
-            } else {
-                Some(base_rev)
-            },
-            p1,
-            p2,
-            flags: index_entry.flags(),
-            hash: *index_entry.hash(),
-        };
-        Ok(entry)
-    }
+            };
 
-    /// Get an entry of the revlog.
-    pub fn get_entry(
-        &self,
-        rev: UncheckedRevision,
-    ) -> Result<RevlogEntry, RevlogError> {
-        if rev == NULL_REVISION.into() {
-            return Ok(self.make_null_entry());
+            if buf.is_none() {
+                let mut data = vec![];
+                file.read_to_end(&mut data).when_reading_file(index_path)?;
+                buf = Some(Box::new(data) as IndexData);
+            }
+            buf.unwrap()
         }
-        let rev = self.index.check_revision(rev).ok_or_else(|| {
-            RevlogError::corrupted(format!("rev {} is invalid", rev))
-        })?;
-        self.get_entry_for_checked_rev(rev)
-    }
+        Err(err) => match err {
+            HgError::IoError { error, context } => match error.kind() {
+                ErrorKind::NotFound => Box::<Vec<u8>>::default(),
+                _ => return Err(HgError::IoError { error, context }),
+            },
+            e => return Err(e),
+        },
+    };
+
+    let index = Index::new(buf, options.index_header())?;
+    Ok(index)
 }
 
 /// The revlog entry's bytes and the necessary informations to extract
 /// the entry's data.
 #[derive(Clone)]
 pub struct RevlogEntry<'revlog> {
-    revlog: &'revlog Revlog,
+    revlog: &'revlog InnerRevlog,
     rev: Revision,
-    bytes: &'revlog [u8],
-    compressed_len: u32,
     uncompressed_len: i32,
-    base_rev_or_base_of_delta_chain: Option<Revision>,
     p1: Revision,
     p2: Revision,
     flags: u16,
@@ -683,33 +617,47 @@
     }
 
     /// The data for this entry, after resolving deltas if any.
-    pub fn rawdata(&self) -> Result<Cow<'revlog, [u8]>, RevlogError> {
-        let mut entry = self.clone();
-        let mut delta_chain = vec![];
+    /// Non-Python callers should probably call [`Self::data`] instead.
+    fn rawdata<G, T>(
+        &self,
+        stop_rev: Option<(Revision, &[u8])>,
+        with_buffer: G,
+    ) -> Result<(), RevlogError>
+    where
+        G: FnOnce(
+            usize,
+            &mut dyn FnMut(
+                &mut dyn RevisionBuffer<Target = T>,
+            ) -> Result<(), RevlogError>,
+        ) -> Result<(), RevlogError>,
+    {
+        let (delta_chain, stopped) = self
+            .revlog
+            .delta_chain(self.revision(), stop_rev.map(|(r, _)| r))?;
+        let target_size =
+            self.uncompressed_len().map(|raw_size| 4 * raw_size as u64);
 
-        // The meaning of `base_rev_or_base_of_delta_chain` depends on
-        // generaldelta. See the doc on `ENTRY_DELTA_BASE` in
-        // `mercurial/revlogutils/constants.py` and the code in
-        // [_chaininfo] and in [index_deltachain].
-        let uses_generaldelta = self.revlog.index.uses_generaldelta();
-        while let Some(base_rev) = entry.base_rev_or_base_of_delta_chain {
-            entry = if uses_generaldelta {
-                delta_chain.push(entry);
-                self.revlog.get_entry_for_checked_rev(base_rev)?
-            } else {
-                let base_rev = UncheckedRevision(entry.rev.0 - 1);
-                delta_chain.push(entry);
-                self.revlog.get_entry(base_rev)?
-            };
-        }
+        let deltas = self.revlog.chunks(delta_chain, target_size)?;
 
-        let data = if delta_chain.is_empty() {
-            entry.data_chunk()?
+        let (base_text, deltas) = if stopped {
+            (
+                stop_rev.as_ref().expect("last revision should be cached").1,
+                &deltas[..],
+            )
         } else {
-            Revlog::build_data_from_deltas(entry, &delta_chain)?.into()
+            let (buf, deltas) = deltas.split_at(1);
+            (buf[0].as_ref(), deltas)
         };
 
-        Ok(data)
+        let size = self
+            .uncompressed_len()
+            .map(|l| l as usize)
+            .unwrap_or(base_text.len());
+        with_buffer(size, &mut |buf| {
+            Revlog::build_data_from_deltas(buf, base_text, deltas)?;
+            Ok(())
+        })?;
+        Ok(())
     }
 
     fn check_data(
@@ -739,86 +687,23 @@
     }
 
     pub fn data(&self) -> Result<Cow<'revlog, [u8]>, RevlogError> {
-        let data = self.rawdata()?;
+        // TODO figure out if there is ever a need for `Cow` here anymore.
+        let mut data = CoreRevisionBuffer::new();
         if self.rev == NULL_REVISION {
-            return Ok(data);
+            return Ok(data.finish().into());
         }
+        self.rawdata(None, |size, f| {
+            // Pre-allocate the expected size (received from the index)
+            data.resize(size);
+            // Actually fill the buffer
+            f(&mut data)?;
+            Ok(())
+        })?;
         if self.is_censored() {
             return Err(HgError::CensoredNodeError.into());
         }
-        self.check_data(data)
-    }
-
-    /// Extract the data contained in the entry.
-    /// This may be a delta. (See `is_delta`.)
-    fn data_chunk(&self) -> Result<Cow<'revlog, [u8]>, HgError> {
-        if self.bytes.is_empty() {
-            return Ok(Cow::Borrowed(&[]));
-        }
-        match self.bytes[0] {
-            // Revision data is the entirety of the entry, including this
-            // header.
-            b'\0' => Ok(Cow::Borrowed(self.bytes)),
-            // Raw revision data follows.
-            b'u' => Ok(Cow::Borrowed(&self.bytes[1..])),
-            // zlib (RFC 1950) data.
-            b'x' => Ok(Cow::Owned(self.uncompressed_zlib_data()?)),
-            // zstd data.
-            b'\x28' => Ok(Cow::Owned(uncompressed_zstd_data(
-                self.bytes,
-                self.is_delta(),
-                self.uncompressed_len.max(0),
-            )?)),
-            // A proper new format should have had a repo/store requirement.
-            format_type => Err(corrupted(format!(
-                "unknown compression header '{}'",
-                format_type
-            ))),
-        }
+        self.check_data(data.finish().into())
     }
-
-    fn uncompressed_zlib_data(&self) -> Result<Vec<u8>, HgError> {
-        let mut decoder = ZlibDecoder::new(self.bytes);
-        if self.is_delta() {
-            let mut buf = Vec::with_capacity(self.compressed_len as usize);
-            decoder
-                .read_to_end(&mut buf)
-                .map_err(|e| corrupted(e.to_string()))?;
-            Ok(buf)
-        } else {
-            let cap = self.uncompressed_len.max(0) as usize;
-            let mut buf = vec![0; cap];
-            decoder
-                .read_exact(&mut buf)
-                .map_err(|e| corrupted(e.to_string()))?;
-            Ok(buf)
-        }
-    }
-
-    /// Tell if the entry is a snapshot or a delta
-    /// (influences on decompression).
-    fn is_delta(&self) -> bool {
-        self.base_rev_or_base_of_delta_chain.is_some()
-    }
-}
-
-/// Calculate the hash of a revision given its data and its parents.
-fn hash(
-    data: &[u8],
-    p1_hash: &[u8],
-    p2_hash: &[u8],
-) -> [u8; NODE_BYTES_LENGTH] {
-    let mut hasher = Sha1::new();
-    let (a, b) = (p1_hash, p2_hash);
-    if a > b {
-        hasher.update(b);
-        hasher.update(a);
-    } else {
-        hasher.update(a);
-        hasher.update(b);
-    }
-    hasher.update(data);
-    *hasher.finalize().as_ref()
 }
 
 #[cfg(test)]
--- a/rust/hg-core/src/revlog/patch.rs	Thu Oct 10 10:38:35 2024 +0200
+++ b/rust/hg-core/src/revlog/patch.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -1,5 +1,9 @@
 use byteorder::{BigEndian, ByteOrder};
 
+use crate::RevlogError;
+
+use super::inner_revlog::RevisionBuffer;
+
 /// A chunk of data to insert, delete or replace in a patch
 ///
 /// A chunk is:
@@ -61,14 +65,16 @@
 
 impl<'a> PatchList<'a> {
     /// Create a `PatchList` from bytes.
-    pub fn new(data: &'a [u8]) -> Self {
+    pub fn new(data: &'a [u8]) -> Result<Self, RevlogError> {
         let mut chunks = vec![];
         let mut data = data;
         while !data.is_empty() {
             let start = BigEndian::read_u32(&data[0..]);
             let end = BigEndian::read_u32(&data[4..]);
             let len = BigEndian::read_u32(&data[8..]);
-            assert!(start <= end);
+            if start > end {
+                return Err(RevlogError::corrupted("patch cannot be decoded"));
+            }
             chunks.push(Chunk {
                 start,
                 end,
@@ -76,29 +82,23 @@
             });
             data = &data[12 + (len as usize)..];
         }
-        PatchList { chunks }
-    }
-
-    /// Return the final length of data after patching
-    /// given its initial length .
-    fn size(&self, initial_size: i32) -> i32 {
-        self.chunks
-            .iter()
-            .fold(initial_size, |acc, chunk| acc + chunk.len_diff())
+        Ok(PatchList { chunks })
     }
 
     /// Apply the patch to some data.
-    pub fn apply(&self, initial: &[u8]) -> Vec<u8> {
+    pub fn apply<T>(
+        &self,
+        buffer: &mut dyn RevisionBuffer<Target = T>,
+        initial: &[u8],
+    ) {
         let mut last: usize = 0;
-        let mut vec =
-            Vec::with_capacity(self.size(initial.len() as i32) as usize);
         for Chunk { start, end, data } in self.chunks.iter() {
-            vec.extend(&initial[last..(*start as usize)]);
-            vec.extend(data.iter());
+            let slice = &initial[last..(*start as usize)];
+            buffer.extend_from_slice(slice);
+            buffer.extend_from_slice(data);
             last = *end as usize;
         }
-        vec.extend(&initial[last..]);
-        vec
+        buffer.extend_from_slice(&initial[last..]);
     }
 
     /// Combine two patch lists into a single patch list.
@@ -229,6 +229,8 @@
 
 #[cfg(test)]
 mod tests {
+    use crate::inner_revlog::CoreRevisionBuffer;
+
     use super::*;
 
     struct PatchDataBuilder {
@@ -264,17 +266,18 @@
         let data = vec![0u8, 0u8, 0u8];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(0, 1, &[1, 2]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(2, 4, &[3, 4]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![1u8, 2, 3, 4]);
+        assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]);
     }
 
     #[test]
@@ -282,17 +285,18 @@
         let data = vec![0u8, 0u8, 0u8];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(2, 3, &[3]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(1, 2, &[1, 2]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![0u8, 1, 2, 3]);
+        assert_eq!(buffer.finish(), vec![0u8, 1, 2, 3]);
     }
 
     #[test]
@@ -300,17 +304,18 @@
         let data = vec![0u8, 0, 0];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(1, 2, &[3, 4]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(1, 4, &[1, 2, 3]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![0u8, 1, 2, 3]);
+        assert_eq!(buffer.finish(), vec![0u8, 1, 2, 3]);
     }
 
     #[test]
@@ -318,17 +323,18 @@
         let data = vec![0u8, 0, 0];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(0, 1, &[1, 3]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(1, 4, &[2, 3, 4]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![1u8, 2, 3, 4]);
+        assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]);
     }
 
     #[test]
@@ -336,17 +342,18 @@
         let data = vec![0u8, 0, 0];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(1, 3, &[1, 3, 4]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(0, 2, &[1, 2]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![1u8, 2, 3, 4]);
+        assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]);
     }
 
     #[test]
@@ -354,16 +361,17 @@
         let data = vec![0u8, 0, 0];
         let mut patch1_data = PatchDataBuilder::new();
         patch1_data.replace(0, 3, &[1, 3, 3, 4]);
-        let mut patch1 = PatchList::new(patch1_data.get());
+        let mut patch1 = PatchList::new(patch1_data.get()).unwrap();
 
         let mut patch2_data = PatchDataBuilder::new();
         patch2_data.replace(1, 3, &[2, 3]);
-        let mut patch2 = PatchList::new(patch2_data.get());
+        let mut patch2 = PatchList::new(patch2_data.get()).unwrap();
 
         let patch = patch1.combine(&mut patch2);
 
-        let result = patch.apply(&data);
+        let mut buffer = CoreRevisionBuffer::new();
+        patch.apply(&mut buffer, &data);
 
-        assert_eq!(result, vec![1u8, 2, 3, 4]);
+        assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]);
     }
 }
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/transaction.rs	Thu Oct 10 10:34:51 2024 +0200
@@ -0,0 +1,11 @@
+use std::path::Path;
+
+/// The Mercurial transaction system is based on the append-only nature
+/// of its core files. This exposes the necessary methods to safely write to
+/// the different core datastructures.
+pub trait Transaction {
+    /// Record the state of an append-only file before update
+    fn add(&mut self, file: impl AsRef<Path>, offset: usize);
+
+    // TODO the rest of the methods once we do more in Rust.
+}