changeset 48496:1fb3615dfce2

rhg: centralize index header parsing Centralize index header parsing, parse the generaldelta flag, and leave breadcrumbs to relate the code to python. Differential Revision: https://phab.mercurial-scm.org/D11881
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Tue, 07 Dec 2021 18:12:13 +0000
parents e33d7c39db47
children 96ea4db4741b
files rust/hg-core/src/revlog/index.rs rust/hg-core/src/revlog/revlog.rs
diffstat 2 files changed, 117 insertions(+), 56 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/src/revlog/index.rs	Tue Dec 07 17:50:19 2021 +0000
+++ b/rust/hg-core/src/revlog/index.rs	Tue Dec 07 18:12:13 2021 +0000
@@ -9,6 +9,75 @@
 
 pub const INDEX_ENTRY_SIZE: usize = 64;
 
+pub struct IndexHeader {
+    header_bytes: [u8; 4],
+}
+
+#[derive(Copy, Clone)]
+pub struct IndexHeaderFlags {
+    flags: u16,
+}
+
+/// Corresponds to the high bits of `_format_flags` in python
+impl IndexHeaderFlags {
+    /// Corresponds to FLAG_INLINE_DATA in python
+    pub fn is_inline(self) -> bool {
+        return self.flags & 1 != 0;
+    }
+    /// Corresponds to FLAG_GENERALDELTA in python
+    pub fn uses_generaldelta(self) -> bool {
+        return self.flags & 2 != 0;
+    }
+}
+
+/// Corresponds to the INDEX_HEADER structure,
+/// which is parsed as a `header` variable in `_loadindex` in `revlog.py`
+impl IndexHeader {
+    fn format_flags(&self) -> IndexHeaderFlags {
+        // No "unknown flags" check here, unlike in python. Maybe there should
+        // be.
+        return IndexHeaderFlags {
+            flags: BigEndian::read_u16(&self.header_bytes[0..2]),
+        };
+    }
+
+    /// The only revlog version currently supported by rhg.
+    const REVLOGV1: u16 = 1;
+
+    /// Corresponds to `_format_version` in Python.
+    fn format_version(&self) -> u16 {
+        return BigEndian::read_u16(&self.header_bytes[2..4]);
+    }
+
+    const EMPTY_INDEX_HEADER: IndexHeader = IndexHeader {
+        // We treat an empty file as a valid index with no entries.
+        // Here we make an arbitrary choice of what we assume the format of the
+        // index to be (V1, using generaldelta).
+        // This doesn't matter too much, since we're only doing read-only
+        // access. but the value corresponds to the `new_header` variable in
+        // `revlog.py`, `_loadindex`
+        header_bytes: [0, 3, 0, 1],
+    };
+
+    fn parse(index_bytes: &[u8]) -> Result<IndexHeader, HgError> {
+        if index_bytes.len() == 0 {
+            return Ok(IndexHeader::EMPTY_INDEX_HEADER);
+        }
+        if index_bytes.len() < 4 {
+            return Err(HgError::corrupted(
+                "corrupted revlog: can't read the index format header",
+            ));
+        }
+        return Ok(IndexHeader {
+            header_bytes: {
+                let bytes: [u8; 4] =
+                    index_bytes[0..4].try_into().expect("impossible");
+                bytes
+            },
+        });
+    }
+}
+
 /// A Revlog index
 pub struct Index {
     bytes: Box<dyn Deref<Target = [u8]> + Send>,
@@ -23,7 +92,15 @@
     pub fn new(
         bytes: Box<dyn Deref<Target = [u8]> + Send>,
     ) -> Result<Self, HgError> {
-        if is_inline(&bytes) {
+        let header = IndexHeader::parse(bytes.as_ref())?;
+
+        if header.format_version() != IndexHeader::REVLOGV1 {
+            // A proper new version should have had a repo/store
+            // requirement.
+            return Err(HgError::corrupted("unsupported revlog version"));
+        }
+
+        if header.format_flags().is_inline() {
             let mut offset: usize = 0;
             let mut offsets = Vec::new();
 
@@ -206,17 +283,6 @@
     }
 }
 
-/// Value of the inline flag.
-pub fn is_inline(index_bytes: &[u8]) -> bool {
-    if index_bytes.len() < 4 {
-        return true;
-    }
-    match &index_bytes[0..=1] {
-        [0, 0] | [0, 2] => false,
-        _ => true,
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -313,37 +379,60 @@
         }
     }
 
+    pub fn is_inline(index_bytes: &[u8]) -> bool {
+        IndexHeader::parse(index_bytes)
+            .expect("too short")
+            .format_flags()
+            .is_inline()
+    }
+
+    pub fn uses_generaldelta(index_bytes: &[u8]) -> bool {
+        IndexHeader::parse(index_bytes)
+            .expect("too short")
+            .format_flags()
+            .uses_generaldelta()
+    }
+
+    pub fn get_version(index_bytes: &[u8]) -> u16 {
+        IndexHeader::parse(index_bytes)
+            .expect("too short")
+            .format_version()
+    }
+
     #[test]
-    fn is_not_inline_when_no_inline_flag_test() {
+    fn flags_when_no_inline_flag_test() {
         let bytes = IndexEntryBuilder::new()
             .is_first(true)
             .with_general_delta(false)
             .with_inline(false)
             .build();
 
-        assert_eq!(is_inline(&bytes), false)
+        assert_eq!(is_inline(&bytes), false);
+        assert_eq!(uses_generaldelta(&bytes), false);
     }
 
     #[test]
-    fn is_inline_when_inline_flag_test() {
+    fn flags_when_inline_flag_test() {
         let bytes = IndexEntryBuilder::new()
             .is_first(true)
             .with_general_delta(false)
             .with_inline(true)
             .build();
 
-        assert_eq!(is_inline(&bytes), true)
+        assert_eq!(is_inline(&bytes), true);
+        assert_eq!(uses_generaldelta(&bytes), false);
     }
 
     #[test]
-    fn is_inline_when_inline_and_generaldelta_flags_test() {
+    fn flags_when_inline_and_generaldelta_flags_test() {
         let bytes = IndexEntryBuilder::new()
             .is_first(true)
             .with_general_delta(true)
             .with_inline(true)
             .build();
 
-        assert_eq!(is_inline(&bytes), true)
+        assert_eq!(is_inline(&bytes), true);
+        assert_eq!(uses_generaldelta(&bytes), true);
     }
 
     #[test]
@@ -400,6 +489,16 @@
 
         assert_eq!(entry.base_revision(), 1)
     }
+
+    #[test]
+    fn version_test() {
+        let bytes = IndexEntryBuilder::new()
+            .is_first(true)
+            .with_version(1)
+            .build();
+
+        assert_eq!(get_version(&bytes), 1)
+    }
 }
 
 #[cfg(test)]
--- a/rust/hg-core/src/revlog/revlog.rs	Tue Dec 07 17:50:19 2021 +0000
+++ b/rust/hg-core/src/revlog/revlog.rs	Tue Dec 07 18:12:13 2021 +0000
@@ -3,7 +3,6 @@
 use std::ops::Deref;
 use std::path::Path;
 
-use byteorder::{BigEndian, ByteOrder};
 use flate2::read::ZlibDecoder;
 use micro_timer::timed;
 use sha1::{Digest, Sha1};
@@ -74,13 +73,6 @@
             match repo.store_vfs().mmap_open_opt(&index_path)? {
                 None => Index::new(Box::new(vec![])),
                 Some(index_mmap) => {
-                    let version = get_version(&index_mmap)?;
-                    if version != 1 {
-                        // A proper new version should have had a repo/store
-                        // requirement.
-                        return Err(HgError::corrupted("corrupted revlog"));
-                    }
-
                     let index = Index::new(Box::new(index_mmap))?;
                     Ok(index)
                 }
@@ -387,19 +379,6 @@
     }
 }
 
-/// Format version of the revlog.
-pub fn get_version(index_bytes: &[u8]) -> Result<u16, HgError> {
-    if index_bytes.len() == 0 {
-        return Ok(1);
-    };
-    if index_bytes.len() < 4 {
-        return Err(HgError::corrupted(
-            "corrupted revlog: can't read the index format header",
-        ));
-    };
-    Ok(BigEndian::read_u16(&index_bytes[2..=3]))
-}
-
 /// Calculate the hash of a revision given its data and its parents.
 fn hash(
     data: &[u8],
@@ -418,20 +397,3 @@
     hasher.update(data);
     *hasher.finalize().as_ref()
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use super::super::index::IndexEntryBuilder;
-
-    #[test]
-    fn version_test() {
-        let bytes = IndexEntryBuilder::new()
-            .is_first(true)
-            .with_version(1)
-            .build();
-
-        assert_eq!(get_version(&bytes).map_err(|_err| ()), Ok(1))
-    }
-}