view rust/hg-core/src/revlog/index.rs @ 48571:35c47015b9b7

rhg: Expose FilelogEntry that wraps RevlogEntry This can be later extended to access metadata such as `uncompressed_len` without necessarily resolving deltas. Differential Revision: https://phab.mercurial-scm.org/D11961
author Simon Sapin <simon.sapin@octobus.net>
date Tue, 21 Dec 2021 18:50:44 +0100
parents 96ea4db4741b
children 0a4ac916673e
line wrap: on
line source

use std::convert::TryInto;
use std::ops::Deref;

use byteorder::{BigEndian, ByteOrder};

use crate::errors::HgError;
use crate::revlog::node::Node;
use crate::revlog::{Revision, NULL_REVISION};

pub const INDEX_ENTRY_SIZE: usize = 64;

pub struct IndexHeader {
    header_bytes: [u8; 4],
}

#[derive(Copy, Clone)]
pub struct IndexHeaderFlags {
    flags: u16,
}

/// Corresponds to the high bits of `_format_flags` in python
impl IndexHeaderFlags {
    /// Corresponds to FLAG_INLINE_DATA in python
    pub fn is_inline(self) -> bool {
        return self.flags & 1 != 0;
    }
    /// Corresponds to FLAG_GENERALDELTA in python
    pub fn uses_generaldelta(self) -> bool {
        return self.flags & 2 != 0;
    }
}

/// Corresponds to the INDEX_HEADER structure,
/// which is parsed as a `header` variable in `_loadindex` in `revlog.py`
impl IndexHeader {
    fn format_flags(&self) -> IndexHeaderFlags {
        // No "unknown flags" check here, unlike in python. Maybe there should
        // be.
        return IndexHeaderFlags {
            flags: BigEndian::read_u16(&self.header_bytes[0..2]),
        };
    }

    /// The only revlog version currently supported by rhg.
    const REVLOGV1: u16 = 1;

    /// Corresponds to `_format_version` in Python.
    fn format_version(&self) -> u16 {
        return BigEndian::read_u16(&self.header_bytes[2..4]);
    }

    const EMPTY_INDEX_HEADER: IndexHeader = IndexHeader {
        // We treat an empty file as a valid index with no entries.
        // Here we make an arbitrary choice of what we assume the format of the
        // index to be (V1, using generaldelta).
        // This doesn't matter too much, since we're only doing read-only
        // access. but the value corresponds to the `new_header` variable in
        // `revlog.py`, `_loadindex`
        header_bytes: [0, 3, 0, 1],
    };

    fn parse(index_bytes: &[u8]) -> Result<IndexHeader, HgError> {
        if index_bytes.len() == 0 {
            return Ok(IndexHeader::EMPTY_INDEX_HEADER);
        }
        if index_bytes.len() < 4 {
            return Err(HgError::corrupted(
                "corrupted revlog: can't read the index format header",
            ));
        }
        return Ok(IndexHeader {
            header_bytes: {
                let bytes: [u8; 4] =
                    index_bytes[0..4].try_into().expect("impossible");
                bytes
            },
        });
    }
}

/// A Revlog index
pub struct Index {
    bytes: Box<dyn Deref<Target = [u8]> + Send>,
    /// Offsets of starts of index blocks.
    /// Only needed when the index is interleaved with data.
    offsets: Option<Vec<usize>>,
    uses_generaldelta: bool,
}

impl Index {
    /// Create an index from bytes.
    /// Calculate the start of each entry when is_inline is true.
    pub fn new(
        bytes: Box<dyn Deref<Target = [u8]> + Send>,
    ) -> Result<Self, HgError> {
        let header = IndexHeader::parse(bytes.as_ref())?;

        if header.format_version() != IndexHeader::REVLOGV1 {
            // A proper new version should have had a repo/store
            // requirement.
            return Err(HgError::corrupted("unsupported revlog version"));
        }

        // This is only correct because we know version is REVLOGV1.
        // In v2 we always use generaldelta, while in v0 we never use
        // generaldelta. Similar for [is_inline] (it's only used in v1).
        let uses_generaldelta = header.format_flags().uses_generaldelta();

        if header.format_flags().is_inline() {
            let mut offset: usize = 0;
            let mut offsets = Vec::new();

            while offset + INDEX_ENTRY_SIZE <= bytes.len() {
                offsets.push(offset);
                let end = offset + INDEX_ENTRY_SIZE;
                let entry = IndexEntry {
                    bytes: &bytes[offset..end],
                    offset_override: None,
                };

                offset += INDEX_ENTRY_SIZE + entry.compressed_len();
            }

            if offset == bytes.len() {
                Ok(Self {
                    bytes,
                    offsets: Some(offsets),
                    uses_generaldelta,
                })
            } else {
                Err(HgError::corrupted("unexpected inline revlog length")
                    .into())
            }
        } else {
            Ok(Self {
                bytes,
                offsets: None,
                uses_generaldelta,
            })
        }
    }

    pub fn uses_generaldelta(&self) -> bool {
        self.uses_generaldelta
    }

    /// Value of the inline flag.
    pub fn is_inline(&self) -> bool {
        self.offsets.is_some()
    }

    /// Return a slice of bytes if `revlog` is inline. Panic if not.
    pub fn data(&self, start: usize, end: usize) -> &[u8] {
        if !self.is_inline() {
            panic!("tried to access data in the index of a revlog that is not inline");
        }
        &self.bytes[start..end]
    }

    /// Return number of entries of the revlog index.
    pub fn len(&self) -> usize {
        if let Some(offsets) = &self.offsets {
            offsets.len()
        } else {
            self.bytes.len() / INDEX_ENTRY_SIZE
        }
    }

    /// Returns `true` if the `Index` has zero `entries`.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Return the index entry corresponding to the given revision if it
    /// exists.
    pub fn get_entry(&self, rev: Revision) -> Option<IndexEntry> {
        if rev == NULL_REVISION {
            return None;
        }
        if let Some(offsets) = &self.offsets {
            self.get_entry_inline(rev, offsets)
        } else {
            self.get_entry_separated(rev)
        }
    }

    fn get_entry_inline(
        &self,
        rev: Revision,
        offsets: &[usize],
    ) -> Option<IndexEntry> {
        let start = *offsets.get(rev as usize)?;
        let end = start.checked_add(INDEX_ENTRY_SIZE)?;
        let bytes = &self.bytes[start..end];

        // See IndexEntry for an explanation of this override.
        let offset_override = Some(end);

        Some(IndexEntry {
            bytes,
            offset_override,
        })
    }

    fn get_entry_separated(&self, rev: Revision) -> Option<IndexEntry> {
        let max_rev = self.bytes.len() / INDEX_ENTRY_SIZE;
        if rev as usize >= max_rev {
            return None;
        }
        let start = rev as usize * INDEX_ENTRY_SIZE;
        let end = start + INDEX_ENTRY_SIZE;
        let bytes = &self.bytes[start..end];

        // Override the offset of the first revision as its bytes are used
        // for the index's metadata (saving space because it is always 0)
        let offset_override = if rev == 0 { Some(0) } else { None };

        Some(IndexEntry {
            bytes,
            offset_override,
        })
    }
}

impl super::RevlogIndex for Index {
    fn len(&self) -> usize {
        self.len()
    }

    fn node(&self, rev: Revision) -> Option<&Node> {
        self.get_entry(rev).map(|entry| entry.hash())
    }
}

#[derive(Debug)]
pub struct IndexEntry<'a> {
    bytes: &'a [u8],
    /// Allows to override the offset value of the entry.
    ///
    /// For interleaved index and data, the offset stored in the index
    /// corresponds to the separated data offset.
    /// It has to be overridden with the actual offset in the interleaved
    /// index which is just after the index block.
    ///
    /// For separated index and data, the offset stored in the first index
    /// entry is mixed with the index headers.
    /// It has to be overridden with 0.
    offset_override: Option<usize>,
}

impl<'a> IndexEntry<'a> {
    /// Return the offset of the data.
    pub fn offset(&self) -> usize {
        if let Some(offset_override) = self.offset_override {
            offset_override
        } else {
            let mut bytes = [0; 8];
            bytes[2..8].copy_from_slice(&self.bytes[0..=5]);
            BigEndian::read_u64(&bytes[..]) as usize
        }
    }

    /// Return the compressed length of the data.
    pub fn compressed_len(&self) -> usize {
        BigEndian::read_u32(&self.bytes[8..=11]) as usize
    }

    /// Return the uncompressed length of the data.
    pub fn uncompressed_len(&self) -> usize {
        BigEndian::read_u32(&self.bytes[12..=15]) as usize
    }

    /// Return the revision upon which the data has been derived.
    pub fn base_revision_or_base_of_delta_chain(&self) -> Revision {
        // TODO Maybe return an Option when base_revision == rev?
        //      Requires to add rev to IndexEntry

        BigEndian::read_i32(&self.bytes[16..])
    }

    pub fn p1(&self) -> Revision {
        BigEndian::read_i32(&self.bytes[24..])
    }

    pub fn p2(&self) -> Revision {
        BigEndian::read_i32(&self.bytes[28..])
    }

    /// Return the hash of revision's full text.
    ///
    /// Currently, SHA-1 is used and only the first 20 bytes of this field
    /// are used.
    pub fn hash(&self) -> &'a Node {
        (&self.bytes[32..52]).try_into().unwrap()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[cfg(test)]
    #[derive(Debug, Copy, Clone)]
    pub struct IndexEntryBuilder {
        is_first: bool,
        is_inline: bool,
        is_general_delta: bool,
        version: u16,
        offset: usize,
        compressed_len: usize,
        uncompressed_len: usize,
        base_revision_or_base_of_delta_chain: Revision,
    }

    #[cfg(test)]
    impl IndexEntryBuilder {
        pub fn new() -> Self {
            Self {
                is_first: false,
                is_inline: false,
                is_general_delta: true,
                version: 2,
                offset: 0,
                compressed_len: 0,
                uncompressed_len: 0,
                base_revision_or_base_of_delta_chain: 0,
            }
        }

        pub fn is_first(&mut self, value: bool) -> &mut Self {
            self.is_first = value;
            self
        }

        pub fn with_inline(&mut self, value: bool) -> &mut Self {
            self.is_inline = value;
            self
        }

        pub fn with_general_delta(&mut self, value: bool) -> &mut Self {
            self.is_general_delta = value;
            self
        }

        pub fn with_version(&mut self, value: u16) -> &mut Self {
            self.version = value;
            self
        }

        pub fn with_offset(&mut self, value: usize) -> &mut Self {
            self.offset = value;
            self
        }

        pub fn with_compressed_len(&mut self, value: usize) -> &mut Self {
            self.compressed_len = value;
            self
        }

        pub fn with_uncompressed_len(&mut self, value: usize) -> &mut Self {
            self.uncompressed_len = value;
            self
        }

        pub fn with_base_revision_or_base_of_delta_chain(
            &mut self,
            value: Revision,
        ) -> &mut Self {
            self.base_revision_or_base_of_delta_chain = value;
            self
        }

        pub fn build(&self) -> Vec<u8> {
            let mut bytes = Vec::with_capacity(INDEX_ENTRY_SIZE);
            if self.is_first {
                bytes.extend(&match (self.is_general_delta, self.is_inline) {
                    (false, false) => [0u8, 0],
                    (false, true) => [0u8, 1],
                    (true, false) => [0u8, 2],
                    (true, true) => [0u8, 3],
                });
                bytes.extend(&self.version.to_be_bytes());
                // Remaining offset bytes.
                bytes.extend(&[0u8; 2]);
            } else {
                // Offset stored on 48 bits (6 bytes)
                bytes.extend(&(self.offset as u64).to_be_bytes()[2..]);
            }
            bytes.extend(&[0u8; 2]); // Revision flags.
            bytes.extend(&(self.compressed_len as u32).to_be_bytes());
            bytes.extend(&(self.uncompressed_len as u32).to_be_bytes());
            bytes.extend(
                &self.base_revision_or_base_of_delta_chain.to_be_bytes(),
            );
            bytes
        }
    }

    pub fn is_inline(index_bytes: &[u8]) -> bool {
        IndexHeader::parse(index_bytes)
            .expect("too short")
            .format_flags()
            .is_inline()
    }

    pub fn uses_generaldelta(index_bytes: &[u8]) -> bool {
        IndexHeader::parse(index_bytes)
            .expect("too short")
            .format_flags()
            .uses_generaldelta()
    }

    pub fn get_version(index_bytes: &[u8]) -> u16 {
        IndexHeader::parse(index_bytes)
            .expect("too short")
            .format_version()
    }

    #[test]
    fn flags_when_no_inline_flag_test() {
        let bytes = IndexEntryBuilder::new()
            .is_first(true)
            .with_general_delta(false)
            .with_inline(false)
            .build();

        assert_eq!(is_inline(&bytes), false);
        assert_eq!(uses_generaldelta(&bytes), false);
    }

    #[test]
    fn flags_when_inline_flag_test() {
        let bytes = IndexEntryBuilder::new()
            .is_first(true)
            .with_general_delta(false)
            .with_inline(true)
            .build();

        assert_eq!(is_inline(&bytes), true);
        assert_eq!(uses_generaldelta(&bytes), false);
    }

    #[test]
    fn flags_when_inline_and_generaldelta_flags_test() {
        let bytes = IndexEntryBuilder::new()
            .is_first(true)
            .with_general_delta(true)
            .with_inline(true)
            .build();

        assert_eq!(is_inline(&bytes), true);
        assert_eq!(uses_generaldelta(&bytes), true);
    }

    #[test]
    fn test_offset() {
        let bytes = IndexEntryBuilder::new().with_offset(1).build();
        let entry = IndexEntry {
            bytes: &bytes,
            offset_override: None,
        };

        assert_eq!(entry.offset(), 1)
    }

    #[test]
    fn test_with_overridden_offset() {
        let bytes = IndexEntryBuilder::new().with_offset(1).build();
        let entry = IndexEntry {
            bytes: &bytes,
            offset_override: Some(2),
        };

        assert_eq!(entry.offset(), 2)
    }

    #[test]
    fn test_compressed_len() {
        let bytes = IndexEntryBuilder::new().with_compressed_len(1).build();
        let entry = IndexEntry {
            bytes: &bytes,
            offset_override: None,
        };

        assert_eq!(entry.compressed_len(), 1)
    }

    #[test]
    fn test_uncompressed_len() {
        let bytes = IndexEntryBuilder::new().with_uncompressed_len(1).build();
        let entry = IndexEntry {
            bytes: &bytes,
            offset_override: None,
        };

        assert_eq!(entry.uncompressed_len(), 1)
    }

    #[test]
    fn test_base_revision_or_base_of_delta_chain() {
        let bytes = IndexEntryBuilder::new()
            .with_base_revision_or_base_of_delta_chain(1)
            .build();
        let entry = IndexEntry {
            bytes: &bytes,
            offset_override: None,
        };

        assert_eq!(entry.base_revision_or_base_of_delta_chain(), 1)
    }

    #[test]
    fn version_test() {
        let bytes = IndexEntryBuilder::new()
            .is_first(true)
            .with_version(1)
            .build();

        assert_eq!(get_version(&bytes), 1)
    }
}

#[cfg(test)]
pub use tests::IndexEntryBuilder;