changeset 46594:f88e8ae0aa8f

rust: Rewrite dirstate parsing using the `bytes-cast` crate. Differential Revision: https://phab.mercurial-scm.org/D10005
author Simon Sapin <simon.sapin@octobus.net>
date Wed, 17 Feb 2021 12:06:56 +0100
parents 5ce2aa7c2ad5
children 98a455a62699
files rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/src/dirstate.rs rust/hg-core/src/dirstate/dirstate_map.rs rust/hg-core/src/dirstate/parsers.rs
diffstat 5 files changed, 51 insertions(+), 61 deletions(-) [+]
line wrap: on
line diff
--- a/rust/Cargo.lock	Mon Feb 15 20:13:09 2021 +0100
+++ b/rust/Cargo.lock	Wed Feb 17 12:06:56 2021 +0100
@@ -310,7 +310,6 @@
  "im-rc 15.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "log 0.4.11 (registry+https://github.com/rust-lang/crates.io-index)",
- "memchr 2.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
  "memmap 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "micro-timer 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "pretty_assertions 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
--- a/rust/hg-core/Cargo.toml	Mon Feb 15 20:13:09 2021 +0100
+++ b/rust/hg-core/Cargo.toml	Wed Feb 17 12:06:56 2021 +0100
@@ -15,7 +15,6 @@
 home = "0.5"
 im-rc = "15.0.*"
 lazy_static = "1.4.0"
-memchr = "2.3.3"
 rand = "0.7.3"
 rand_pcg = "0.2.1"
 rand_distr = "0.2.2"
--- a/rust/hg-core/src/dirstate.rs	Mon Feb 15 20:13:09 2021 +0100
+++ b/rust/hg-core/src/dirstate.rs	Wed Feb 17 12:06:56 2021 +0100
@@ -7,6 +7,7 @@
 
 use crate::errors::HgError;
 use crate::{utils::hg_path::HgPathBuf, FastHashMap};
+use bytes_cast::{unaligned, BytesCast};
 use std::collections::hash_map;
 use std::convert::TryFrom;
 
@@ -17,7 +18,8 @@
 pub mod parsers;
 pub mod status;
 
-#[derive(Debug, PartialEq, Clone)]
+#[derive(Debug, PartialEq, Clone, BytesCast)]
+#[repr(C)]
 pub struct DirstateParents {
     pub p1: [u8; 20],
     pub p2: [u8; 20],
@@ -34,6 +36,16 @@
     pub size: i32,
 }
 
+#[derive(BytesCast)]
+#[repr(C)]
+struct RawEntry {
+    state: u8,
+    mode: unaligned::I32Be,
+    size: unaligned::I32Be,
+    mtime: unaligned::I32Be,
+    length: unaligned::I32Be,
+}
+
 /// A `DirstateEntry` with a size of `-2` means that it was merged from the
 /// other parent. This allows revert to pick the right status back during a
 /// merge.
--- a/rust/hg-core/src/dirstate/dirstate_map.rs	Mon Feb 15 20:13:09 2021 +0100
+++ b/rust/hg-core/src/dirstate/dirstate_map.rs	Wed Feb 17 12:06:56 2021 +0100
@@ -386,10 +386,10 @@
     }
 
     #[timed]
-    pub fn read(
+    pub fn read<'a>(
         &mut self,
-        file_contents: &[u8],
-    ) -> Result<Option<DirstateParents>, DirstateError> {
+        file_contents: &'a [u8],
+    ) -> Result<Option<&'a DirstateParents>, DirstateError> {
         if file_contents.is_empty() {
             return Ok(None);
         }
--- a/rust/hg-core/src/dirstate/parsers.rs	Mon Feb 15 20:13:09 2021 +0100
+++ b/rust/hg-core/src/dirstate/parsers.rs	Wed Feb 17 12:06:56 2021 +0100
@@ -6,13 +6,13 @@
 use crate::errors::HgError;
 use crate::utils::hg_path::HgPath;
 use crate::{
-    dirstate::{CopyMap, EntryState, StateMap},
+    dirstate::{CopyMap, EntryState, RawEntry, StateMap},
     DirstateEntry, DirstateParents,
 };
-use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
+use byteorder::{BigEndian, WriteBytesExt};
+use bytes_cast::BytesCast;
 use micro_timer::timed;
 use std::convert::{TryFrom, TryInto};
-use std::io::Cursor;
 use std::time::Duration;
 
 /// Parents are stored in the dirstate as byte hashes.
@@ -21,65 +21,45 @@
 const MIN_ENTRY_SIZE: usize = 17;
 
 type ParseResult<'a> = (
-    DirstateParents,
+    &'a DirstateParents,
     Vec<(&'a HgPath, DirstateEntry)>,
     Vec<(&'a HgPath, &'a HgPath)>,
 );
 
 #[timed]
-pub fn parse_dirstate(contents: &[u8]) -> Result<ParseResult, HgError> {
-    if contents.len() < PARENT_SIZE * 2 {
-        return Err(HgError::corrupted("Too little data for dirstate."));
-    }
-    let mut copies = vec![];
-    let mut entries = vec![];
+pub fn parse_dirstate(mut contents: &[u8]) -> Result<ParseResult, HgError> {
+    let mut copies = Vec::new();
+    let mut entries = Vec::new();
 
-    let mut curr_pos = PARENT_SIZE * 2;
-    let parents = DirstateParents {
-        p1: contents[..PARENT_SIZE].try_into().unwrap(),
-        p2: contents[PARENT_SIZE..curr_pos].try_into().unwrap(),
-    };
+    let (parents, rest) = DirstateParents::from_bytes(contents)
+        .map_err(|_| HgError::corrupted("Too little data for dirstate."))?;
+    contents = rest;
+    while !contents.is_empty() {
+        let (raw_entry, rest) = RawEntry::from_bytes(contents)
+            .map_err(|_| HgError::corrupted("Overflow in dirstate."))?;
 
-    while curr_pos < contents.len() {
-        if curr_pos + MIN_ENTRY_SIZE > contents.len() {
-            return Err(HgError::corrupted("Overflow in dirstate."));
-        }
-        let entry_bytes = &contents[curr_pos..];
+        let entry = DirstateEntry {
+            state: EntryState::try_from(raw_entry.state)?,
+            mode: raw_entry.mode.get(),
+            mtime: raw_entry.mtime.get(),
+            size: raw_entry.size.get(),
+        };
+        let (paths, rest) =
+            u8::slice_from_bytes(rest, raw_entry.length.get() as usize)
+                .map_err(|_| HgError::corrupted("Overflow in dirstate."))?;
 
-        let mut cursor = Cursor::new(entry_bytes);
-        // Unwraping errors from `byteorder` as we’ve already checked
-        // `MIN_ENTRY_SIZE` so the input should never be too short.
-        let state = EntryState::try_from(cursor.read_u8().unwrap())?;
-        let mode = cursor.read_i32::<BigEndian>().unwrap();
-        let size = cursor.read_i32::<BigEndian>().unwrap();
-        let mtime = cursor.read_i32::<BigEndian>().unwrap();
-        let path_len = cursor.read_i32::<BigEndian>().unwrap() as usize;
-
-        if path_len > contents.len() - curr_pos {
-            return Err(HgError::corrupted("Overflow in dirstate."));
+        // `paths` is either a single path, or two paths separated by a NULL
+        // byte
+        let mut iter = paths.splitn(2, |&byte| byte == b'\0');
+        let path = HgPath::new(
+            iter.next().expect("splitn always yields at least one item"),
+        );
+        if let Some(copy_source) = iter.next() {
+            copies.push((path, HgPath::new(copy_source)));
         }
 
-        // Slice instead of allocating a Vec needed for `read_exact`
-        let path = &entry_bytes[MIN_ENTRY_SIZE..MIN_ENTRY_SIZE + (path_len)];
-
-        let (path, copy) = match memchr::memchr(0, path) {
-            None => (path, None),
-            Some(i) => (&path[..i], Some(&path[(i + 1)..])),
-        };
-
-        if let Some(copy_path) = copy {
-            copies.push((HgPath::new(path), HgPath::new(copy_path)));
-        };
-        entries.push((
-            HgPath::new(path),
-            DirstateEntry {
-                state,
-                mode,
-                size,
-                mtime,
-            },
-        ));
-        curr_pos = curr_pos + MIN_ENTRY_SIZE + (path_len);
+        entries.push((path, entry));
+        contents = rest;
     }
     Ok((parents, entries, copies))
 }
@@ -374,7 +354,7 @@
             .collect();
 
         assert_eq!(
-            (parents, state_map, copymap),
+            (&parents, state_map, copymap),
             (new_parents, new_state_map, new_copy_map)
         )
     }
@@ -452,7 +432,7 @@
             .collect();
 
         assert_eq!(
-            (parents, state_map, copymap),
+            (&parents, state_map, copymap),
             (new_parents, new_state_map, new_copy_map)
         )
     }
@@ -499,7 +479,7 @@
 
         assert_eq!(
             (
-                parents,
+                &parents,
                 [(
                     HgPathBuf::from_bytes(b"f1"),
                     DirstateEntry {