rust-dirstate: add rust implementation of `parse_dirstate` and `pack_dirstate`
authorRaphaël Gomès <rgomes@octobus.net>
Mon, 06 May 2019 22:48:09 +0200
changeset 42302 d1786c1d34fa
parent 42301 2a7109cc5a28
child 42303 e240bec26626
rust-dirstate: add rust implementation of `parse_dirstate` and `pack_dirstate` Working towards the goal of having a complete Rust implementation of `hg status`, these two utils are a first step of many to be taken to improve performance and code maintainability. Two dependencies have been added: `memchr` and `byteorder`. Both of them have been written by reputable community members and are very mature crates. The Rust code will often need to use their byte-oriented functions. A few unit tests have been added and may help future development and debugging. In a future patch that uses `parse_dirstate` to stat the working tree in parallel - which neither the Python nor the C implementations do - actual performance improvements will be seen for larger repositories. Differential Revision: https://phab.mercurial-scm.org/D6348
rust/hg-core/Cargo.toml
rust/hg-core/src/dirstate.rs
rust/hg-core/src/lib.rs
--- a/rust/hg-core/Cargo.toml	Tue May 14 22:56:58 2019 -0700
+++ b/rust/hg-core/Cargo.toml	Mon May 06 22:48:09 2019 +0200
@@ -10,3 +10,7 @@
 [dev-dependencies]
 rand = "*"
 rand_pcg = "*"
+
+[dependencies]
+memchr = "2.2.0"
+byteorder = "1.3.1"
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/dirstate.rs	Mon May 06 22:48:09 2019 +0200
@@ -0,0 +1,409 @@
+// Copyright 2019 Raphaël Gomès <rgomes@octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
+use std::collections::HashMap;
+use std::io::Cursor;
+use {DirstatePackError, DirstateParseError};
+
+#[derive(Debug, PartialEq, Copy, Clone)]
+pub struct DirstateParents<'a> {
+    pub p1: &'a [u8],
+    pub p2: &'a [u8],
+}
+/// The C implementation uses all signed types. This will be an issue
+/// either when 4GB+ source files are commonplace or in 2038, whichever
+/// comes first.
+#[derive(Debug, PartialEq)]
+pub struct DirstateEntry {
+    pub state: i8,
+    pub mode: i32,
+    pub mtime: i32,
+    pub size: i32,
+}
+pub type DirstateVec = Vec<(Vec<u8>, DirstateEntry)>;
+
+#[derive(Debug, PartialEq)]
+pub struct CopyVecEntry<'a> {
+    pub path: &'a [u8],
+    pub copy_path: &'a [u8],
+}
+pub type CopyVec<'a> = Vec<CopyVecEntry<'a>>;
+
+/// Parents are stored in the dirstate as byte hashes.
+const PARENT_SIZE: usize = 20;
+/// Dirstate entries have a static part of 8 + 32 + 32 + 32 + 32 bits.
+const MIN_ENTRY_SIZE: usize = 17;
+
+pub fn parse_dirstate(
+    contents: &[u8],
+) -> Result<(DirstateParents, DirstateVec, CopyVec), DirstateParseError> {
+    if contents.len() < PARENT_SIZE * 2 {
+        return Err(DirstateParseError::TooLittleData);
+    }
+
+    let mut dirstate_vec = vec![];
+    let mut copies = vec![];
+    let mut curr_pos = PARENT_SIZE * 2;
+    let parents = DirstateParents {
+        p1: &contents[..PARENT_SIZE],
+        p2: &contents[PARENT_SIZE..curr_pos],
+    };
+
+    while curr_pos < contents.len() {
+        if curr_pos + MIN_ENTRY_SIZE > contents.len() {
+            return Err(DirstateParseError::Overflow);
+        }
+        let entry_bytes = &contents[curr_pos..];
+
+        let mut cursor = Cursor::new(entry_bytes);
+        let state = cursor.read_i8()?;
+        let mode = cursor.read_i32::<BigEndian>()?;
+        let size = cursor.read_i32::<BigEndian>()?;
+        let mtime = cursor.read_i32::<BigEndian>()?;
+        let path_len = cursor.read_i32::<BigEndian>()? as usize;
+
+        if path_len > contents.len() - curr_pos {
+            return Err(DirstateParseError::Overflow);
+        }
+
+        // Slice instead of allocating a Vec needed for `read_exact`
+        let path = &entry_bytes[MIN_ENTRY_SIZE..MIN_ENTRY_SIZE + (path_len)];
+
+        let (path, copy) = match memchr::memchr(0, path) {
+            None => (path, None),
+            Some(i) => (&path[..i], Some(&path[(i + 1)..])),
+        };
+
+        if let Some(copy_path) = copy {
+            copies.push(CopyVecEntry { path, copy_path });
+        };
+        dirstate_vec.push((
+            path.to_owned(),
+            DirstateEntry {
+                state,
+                mode,
+                size,
+                mtime,
+            },
+        ));
+        curr_pos = curr_pos + MIN_ENTRY_SIZE + (path_len);
+    }
+
+    Ok((parents, dirstate_vec, copies))
+}
+
+pub fn pack_dirstate(
+    dirstate_vec: &DirstateVec,
+    copymap: &HashMap<Vec<u8>, Vec<u8>>,
+    parents: DirstateParents,
+    now: i32,
+) -> Result<(Vec<u8>, DirstateVec), DirstatePackError> {
+    if parents.p1.len() != PARENT_SIZE || parents.p2.len() != PARENT_SIZE {
+        return Err(DirstatePackError::CorruptedParent);
+    }
+
+    let expected_size: usize = dirstate_vec
+        .iter()
+        .map(|(ref filename, _)| {
+            let mut length = MIN_ENTRY_SIZE + filename.len();
+            if let Some(ref copy) = copymap.get(filename) {
+                length += copy.len() + 1;
+            }
+            length
+        })
+        .sum();
+    let expected_size = expected_size + PARENT_SIZE * 2;
+
+    let mut packed = Vec::with_capacity(expected_size);
+    let mut new_dirstate_vec = vec![];
+
+    packed.extend(parents.p1);
+    packed.extend(parents.p2);
+
+    for (ref filename, entry) in dirstate_vec {
+        let mut new_filename: Vec<u8> = filename.to_owned();
+        let mut new_mtime: i32 = entry.mtime;
+        if entry.state == 'n' as i8 && entry.mtime == now.into() {
+            // The file was last modified "simultaneously" with the current
+            // write to dirstate (i.e. within the same second for file-
+            // systems with a granularity of 1 sec). This commonly happens
+            // for at least a couple of files on 'update'.
+            // The user could change the file without changing its size
+            // within the same second. Invalidate the file's mtime in
+            // dirstate, forcing future 'status' calls to compare the
+            // contents of the file if the size is the same. This prevents
+            // mistakenly treating such files as clean.
+            new_mtime = -1;
+            new_dirstate_vec.push((
+                filename.to_owned(),
+                DirstateEntry {
+                    mtime: new_mtime,
+                    ..*entry
+                },
+            ));
+        }
+
+        if let Some(copy) = copymap.get(filename) {
+            new_filename.push('\0' as u8);
+            new_filename.extend(copy);
+        }
+
+        packed.write_i8(entry.state)?;
+        packed.write_i32::<BigEndian>(entry.mode)?;
+        packed.write_i32::<BigEndian>(entry.size)?;
+        packed.write_i32::<BigEndian>(new_mtime)?;
+        packed.write_i32::<BigEndian>(new_filename.len() as i32)?;
+        packed.extend(new_filename)
+    }
+
+    if packed.len() != expected_size {
+        return Err(DirstatePackError::BadSize(expected_size, packed.len()));
+    }
+
+    Ok((packed, new_dirstate_vec))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pack_dirstate_empty() {
+        let dirstate_vec: DirstateVec = vec![];
+        let copymap = HashMap::new();
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let expected =
+            (b"1234567891011121314100000000000000000000".to_vec(), vec![]);
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+    #[test]
+    fn test_pack_dirstate_one_entry() {
+        let dirstate_vec: DirstateVec = vec![(
+            vec!['f' as u8, '1' as u8],
+            DirstateEntry {
+                state: 'n' as i8,
+                mode: 0o644,
+                size: 0,
+                mtime: 791231220,
+            },
+        )];
+        let copymap = HashMap::new();
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let expected = (
+            [
+                49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+                49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+                48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+                0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 2, 102, 49,
+            ]
+            .to_vec(),
+            vec![],
+        );
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+    #[test]
+    fn test_pack_dirstate_one_entry_with_copy() {
+        let dirstate_vec: DirstateVec = vec![(
+            b"f1".to_vec(),
+            DirstateEntry {
+                state: 'n' as i8,
+                mode: 0o644,
+                size: 0,
+                mtime: 791231220,
+            },
+        )];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let expected = (
+            [
+                49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+                49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+                48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+                0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 11, 102, 49, 0, 99, 111,
+                112, 121, 110, 97, 109, 101,
+            ]
+            .to_vec(),
+            vec![],
+        );
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+
+    #[test]
+    fn test_parse_pack_one_entry_with_copy() {
+        let dirstate_vec: DirstateVec = vec![(
+            b"f1".to_vec(),
+            DirstateEntry {
+                state: 'n' as i8,
+                mode: 0o644,
+                size: 0,
+                mtime: 791231220,
+            },
+        )];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (
+                parents,
+                dirstate_vec,
+                copymap
+                    .iter()
+                    .map(|(k, v)| CopyVecEntry {
+                        path: k.as_slice(),
+                        copy_path: v.as_slice()
+                    })
+                    .collect()
+            ),
+            parse_dirstate(result.0.as_slice()).unwrap()
+        )
+    }
+
+    #[test]
+    fn test_parse_pack_multiple_entries_with_copy() {
+        let dirstate_vec: DirstateVec = vec![
+            (
+                b"f1".to_vec(),
+                DirstateEntry {
+                    state: 'n' as i8,
+                    mode: 0o644,
+                    size: 0,
+                    mtime: 791231220,
+                },
+            ),
+            (
+                b"f2".to_vec(),
+                DirstateEntry {
+                    state: 'm' as i8,
+                    mode: 0o777,
+                    size: 1000,
+                    mtime: 791231220,
+                },
+            ),
+            (
+                b"f3".to_vec(),
+                DirstateEntry {
+                    state: 'r' as i8,
+                    mode: 0o644,
+                    size: 234553,
+                    mtime: 791231220,
+                },
+            ),
+            (
+                b"f4\xF6".to_vec(),
+                DirstateEntry {
+                    state: 'a' as i8,
+                    mode: 0o644,
+                    size: -1,
+                    mtime: -1,
+                },
+            ),
+        ];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        copymap.insert(b"f4\xF6".to_vec(), b"copyname2".to_vec());
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (parents, dirstate_vec, copymap),
+            parse_dirstate(result.0.as_slice())
+                .and_then(|(p, dvec, cvec)| Ok((
+                    p,
+                    dvec,
+                    cvec.iter()
+                        .map(|entry| (
+                            entry.path.to_vec(),
+                            entry.copy_path.to_vec()
+                        ))
+                        .collect()
+                )))
+                .unwrap()
+        )
+    }
+
+    #[test]
+    /// https://www.mercurial-scm.org/repo/hg/rev/af3f26b6bba4
+    fn test_parse_pack_one_entry_with_copy_and_time_conflict() {
+        let dirstate_vec: DirstateVec = vec![(
+            b"f1".to_vec(),
+            DirstateEntry {
+                state: 'n' as i8,
+                mode: 0o644,
+                size: 0,
+                mtime: 15000000,
+            },
+        )];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents = DirstateParents {
+            p1: b"12345678910111213141",
+            p2: b"00000000000000000000",
+        };
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (
+                parents,
+                vec![(
+                    b"f1".to_vec(),
+                    DirstateEntry {
+                        state: 'n' as i8,
+                        mode: 0o644,
+                        size: 0,
+                        mtime: -1
+                    }
+                )],
+                copymap
+                    .iter()
+                    .map(|(k, v)| CopyVecEntry {
+                        path: k.as_slice(),
+                        copy_path: v.as_slice()
+                    })
+                    .collect()
+            ),
+            parse_dirstate(result.0.as_slice()).unwrap()
+        )
+    }
+}
--- a/rust/hg-core/src/lib.rs	Tue May 14 22:56:58 2019 -0700
+++ b/rust/hg-core/src/lib.rs	Mon May 06 22:48:09 2019 +0200
@@ -2,6 +2,9 @@
 //
 // This software may be used and distributed according to the terms of the
 // GNU General Public License version 2 or any later version.
+extern crate byteorder;
+extern crate memchr;
+
 mod ancestors;
 pub mod dagops;
 pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors};
@@ -40,3 +43,29 @@
     ParentOutOfRange(Revision),
     WorkingDirectoryUnsupported,
 }
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum DirstateParseError {
+    TooLittleData,
+    Overflow,
+    CorruptedEntry(String),
+}
+
+#[derive(Debug, PartialEq)]
+pub enum DirstatePackError {
+    CorruptedEntry(String),
+    CorruptedParent,
+    BadSize(usize, usize),
+}
+
+impl From<std::io::Error> for DirstatePackError {
+    fn from(e: std::io::Error) -> Self {
+        DirstatePackError::CorruptedEntry(e.to_string())
+    }
+}
+
+impl From<std::io::Error> for DirstateParseError {
+    fn from(e: std::io::Error) -> Self {
+        DirstateParseError::CorruptedEntry(e.to_string())
+    }
+}