diff rust/hg-core/src/revlog/revlog.rs @ 45526:26c53ee51c68

hg-core: Add a limited read only `revlog` implementation Only covers the needs of the upcoming `rhg debugdata` command. Differential Revision: https://phab.mercurial-scm.org/D8958
author Antoine Cezar <antoine.cezar@octobus.net>
date Fri, 04 Sep 2020 11:55:07 +0200
parents
children b0d6309ff50c
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/revlog.rs	Fri Sep 04 11:55:07 2020 +0200
@@ -0,0 +1,353 @@
+use std::borrow::Cow;
+use std::fs::File;
+use std::io::Read;
+use std::ops::Deref;
+use std::path::Path;
+
+use byteorder::{BigEndian, ByteOrder};
+use flate2::read::ZlibDecoder;
+use memmap::{Mmap, MmapOptions};
+use micro_timer::timed;
+use zstd;
+
+use super::index::Index;
+use super::patch;
+use crate::revlog::Revision;
+
+pub enum RevlogError {
+    IoError(std::io::Error),
+    UnsuportedVersion(u16),
+    InvalidRevision,
+    Corrupted,
+    UnknowDataFormat(u8),
+}
+
+fn mmap_open(path: &Path) -> Result<Mmap, std::io::Error> {
+    let file = File::open(path)?;
+    let mmap = unsafe { MmapOptions::new().map(&file) }?;
+    Ok(mmap)
+}
+
+/// Read only implementation of revlog.
+pub struct Revlog {
+    /// When index and data are not interleaved: bytes of the revlog index.
+    /// When index and data are interleaved: bytes of the revlog index and
+    /// data.
+    index_bytes: Box<dyn Deref<Target = [u8]> + Send>,
+    /// When index and data are not interleaved: bytes of the revlog data
+    data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>>,
+}
+
+impl Revlog {
+    /// Open a revlog index file.
+    ///
+    /// It will also open the associated data file if index and data are not
+    /// interleaved.
+    #[timed]
+    pub fn open(index_path: &Path) -> Result<Self, RevlogError> {
+        let index_mmap =
+            mmap_open(&index_path).map_err(RevlogError::IoError)?;
+
+        let version = get_version(&index_mmap);
+        if version != 1 {
+            return Err(RevlogError::UnsuportedVersion(version));
+        }
+
+        let is_inline = is_inline(&index_mmap);
+
+        let index_bytes = Box::new(index_mmap);
+
+        // TODO load data only when needed //
+        // type annotation required
+        // won't recognize Mmap as Deref<Target = [u8]>
+        let data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>> =
+            if is_inline {
+                None
+            } else {
+                let data_path = index_path.with_extension("d");
+                let data_mmap =
+                    mmap_open(&data_path).map_err(RevlogError::IoError)?;
+                Some(Box::new(data_mmap))
+            };
+
+        Ok(Revlog {
+            index_bytes,
+            data_bytes,
+        })
+    }
+
+    /// Return the full data associated to a revision.
+    ///
+    /// All entries required to build the final data out of deltas will be
+    /// retrieved as needed, and the deltas will be applied to the inital
+    /// snapshot to rebuild the final data.
+    #[timed]
+    pub fn get_rev_data(&self, rev: Revision) -> Result<Vec<u8>, RevlogError> {
+        // Todo return -> Cow
+        let mut entry = self.get_entry(rev)?;
+        let mut delta_chain = vec![];
+        while let Some(base_rev) = entry.base_rev {
+            delta_chain.push(entry);
+            entry = self
+                .get_entry(base_rev)
+                .map_err(|_| RevlogError::Corrupted)?;
+        }
+
+        if delta_chain.is_empty() {
+            Ok(entry.data()?.into())
+        } else {
+            Revlog::build_data_from_deltas(entry, &delta_chain)
+        }
+    }
+
+    /// Build the full data of a revision out its snapshot
+    /// and its deltas.
+    #[timed]
+    fn build_data_from_deltas(
+        snapshot: RevlogEntry,
+        deltas: &[RevlogEntry],
+    ) -> Result<Vec<u8>, RevlogError> {
+        let snapshot = snapshot.data()?;
+        let deltas = deltas
+            .iter()
+            .rev()
+            .map(RevlogEntry::data)
+            .collect::<Result<Vec<Cow<'_, [u8]>>, RevlogError>>()?;
+        let patches: Vec<_> =
+            deltas.iter().map(|d| patch::PatchList::new(d)).collect();
+        let patch = patch::fold_patch_lists(&patches);
+        Ok(patch.apply(&snapshot))
+    }
+
+    /// Return the revlog index.
+    fn index(&self) -> Index {
+        let is_inline = self.data_bytes.is_none();
+        Index::new(&self.index_bytes, is_inline)
+    }
+
+    /// Return the revlog data.
+    fn data(&self) -> &[u8] {
+        match self.data_bytes {
+            Some(ref data_bytes) => &data_bytes,
+            None => &self.index_bytes,
+        }
+    }
+
+    /// Get an entry of the revlog.
+    fn get_entry(&self, rev: Revision) -> Result<RevlogEntry, RevlogError> {
+        let index = self.index();
+        let index_entry =
+            index.get_entry(rev).ok_or(RevlogError::InvalidRevision)?;
+        let start = index_entry.offset();
+        let end = start + index_entry.compressed_len();
+        let entry = RevlogEntry {
+            rev,
+            bytes: &self.data()[start..end],
+            compressed_len: index_entry.compressed_len(),
+            uncompressed_len: index_entry.uncompressed_len(),
+            base_rev: if index_entry.base_revision() == rev {
+                None
+            } else {
+                Some(index_entry.base_revision())
+            },
+        };
+        Ok(entry)
+    }
+}
+
+/// The revlog entry's bytes and the necessary informations to extract
+/// the entry's data.
+#[derive(Debug)]
+pub struct RevlogEntry<'a> {
+    rev: Revision,
+    bytes: &'a [u8],
+    compressed_len: usize,
+    uncompressed_len: usize,
+    base_rev: Option<Revision>,
+}
+
+impl<'a> RevlogEntry<'a> {
+    /// Extract the data contained in the entry.
+    pub fn data(&self) -> Result<Cow<'_, [u8]>, RevlogError> {
+        if self.bytes.is_empty() {
+            return Ok(Cow::Borrowed(&[]));
+        }
+        match self.bytes[0] {
+            // Revision data is the entirety of the entry, including this
+            // header.
+            b'\0' => Ok(Cow::Borrowed(self.bytes)),
+            // Raw revision data follows.
+            b'u' => Ok(Cow::Borrowed(&self.bytes[1..])),
+            // zlib (RFC 1950) data.
+            b'x' => Ok(Cow::Owned(self.uncompressed_zlib_data())),
+            // zstd data.
+            b'\x28' => Ok(Cow::Owned(self.uncompressed_zstd_data())),
+            format_type => Err(RevlogError::UnknowDataFormat(format_type)),
+        }
+    }
+
+    fn uncompressed_zlib_data(&self) -> Vec<u8> {
+        let mut decoder = ZlibDecoder::new(self.bytes);
+        if self.is_delta() {
+            let mut buf = Vec::with_capacity(self.compressed_len);
+            decoder.read_to_end(&mut buf).expect("corrupted zlib data");
+            buf
+        } else {
+            let mut buf = vec![0; self.uncompressed_len];
+            decoder.read_exact(&mut buf).expect("corrupted zlib data");
+            buf
+        }
+    }
+
+    fn uncompressed_zstd_data(&self) -> Vec<u8> {
+        if self.is_delta() {
+            let mut buf = Vec::with_capacity(self.compressed_len);
+            zstd::stream::copy_decode(self.bytes, &mut buf)
+                .expect("corrupted zstd data");
+            buf
+        } else {
+            let mut buf = vec![0; self.uncompressed_len];
+            let len = zstd::block::decompress_to_buffer(self.bytes, &mut buf)
+                .expect("corrupted zstd data");
+            assert_eq!(len, self.uncompressed_len, "corrupted zstd data");
+            buf
+        }
+    }
+
+    /// Tell if the entry is a snapshot or a delta
+    /// (influences on decompression).
+    fn is_delta(&self) -> bool {
+        self.base_rev.is_some()
+    }
+}
+
+/// Value of the inline flag.
+pub fn is_inline(index_bytes: &[u8]) -> bool {
+    match &index_bytes[0..=1] {
+        [0, 0] | [0, 2] => false,
+        _ => true,
+    }
+}
+
+/// Format version of the revlog.
+pub fn get_version(index_bytes: &[u8]) -> u16 {
+    BigEndian::read_u16(&index_bytes[2..=3])
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use super::super::index::IndexEntryBuilder;
+
+    #[cfg(test)]
+    pub struct RevlogBuilder {
+        version: u16,
+        is_general_delta: bool,
+        is_inline: bool,
+        offset: usize,
+        index: Vec<Vec<u8>>,
+        data: Vec<Vec<u8>>,
+    }
+
+    #[cfg(test)]
+    impl RevlogBuilder {
+        pub fn new() -> Self {
+            Self {
+                version: 2,
+                is_inline: false,
+                is_general_delta: true,
+                offset: 0,
+                index: vec![],
+                data: vec![],
+            }
+        }
+
+        pub fn with_inline(&mut self, value: bool) -> &mut Self {
+            self.is_inline = value;
+            self
+        }
+
+        pub fn with_general_delta(&mut self, value: bool) -> &mut Self {
+            self.is_general_delta = value;
+            self
+        }
+
+        pub fn with_version(&mut self, value: u16) -> &mut Self {
+            self.version = value;
+            self
+        }
+
+        pub fn push(
+            &mut self,
+            mut index: IndexEntryBuilder,
+            data: Vec<u8>,
+        ) -> &mut Self {
+            if self.index.is_empty() {
+                index.is_first(true);
+                index.with_general_delta(self.is_general_delta);
+                index.with_inline(self.is_inline);
+                index.with_version(self.version);
+            } else {
+                index.with_offset(self.offset);
+            }
+            self.index.push(index.build());
+            self.offset += data.len();
+            self.data.push(data);
+            self
+        }
+
+        pub fn build_inline(&self) -> Vec<u8> {
+            let mut bytes =
+                Vec::with_capacity(self.index.len() + self.data.len());
+            for (index, data) in self.index.iter().zip(self.data.iter()) {
+                bytes.extend(index);
+                bytes.extend(data);
+            }
+            bytes
+        }
+    }
+
+    #[test]
+    fn is_not_inline_when_no_inline_flag_test() {
+        let bytes = RevlogBuilder::new()
+            .with_general_delta(false)
+            .with_inline(false)
+            .push(IndexEntryBuilder::new(), vec![])
+            .build_inline();
+
+        assert_eq!(is_inline(&bytes), false)
+    }
+
+    #[test]
+    fn is_inline_when_inline_flag_test() {
+        let bytes = RevlogBuilder::new()
+            .with_general_delta(false)
+            .with_inline(true)
+            .push(IndexEntryBuilder::new(), vec![])
+            .build_inline();
+
+        assert_eq!(is_inline(&bytes), true)
+    }
+
+    #[test]
+    fn is_inline_when_inline_and_generaldelta_flags_test() {
+        let bytes = RevlogBuilder::new()
+            .with_general_delta(true)
+            .with_inline(true)
+            .push(IndexEntryBuilder::new(), vec![])
+            .build_inline();
+
+        assert_eq!(is_inline(&bytes), true)
+    }
+
+    #[test]
+    fn version_test() {
+        let bytes = RevlogBuilder::new()
+            .with_version(1)
+            .push(IndexEntryBuilder::new(), vec![])
+            .build_inline();
+
+        assert_eq!(get_version(&bytes), 1)
+    }
+}