rust: use the bytes-cast crate to parse persistent nodemaps
This crate reinterprets byte slices as references to custom structs, with
compile-time safety checks, for easy and efficient binary data parsing.
See https://crates.io/crates/bytes-cast and
https://docs.rs/bytes-cast/0.1.0/bytes_cast/
Differential Revision: https://phab.mercurial-scm.org/D9788
--- a/rust/Cargo.lock Mon Jan 25 19:03:27 2021 -0500
+++ b/rust/Cargo.lock Fri Jan 15 16:11:54 2021 +0100
@@ -55,6 +55,24 @@
source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]]
+name = "bytes-cast"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "bytes-cast-derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
+name = "bytes-cast-derive"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)",
+ "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)",
+ "syn 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
name = "cc"
version = "1.0.66"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -277,6 +295,7 @@
version = "0.1.0"
dependencies = [
"byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)",
+ "bytes-cast 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
"clap 2.33.3 (registry+https://github.com/rust-lang/crates.io-index)",
"crossbeam-channel 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)",
"flate2 1.0.19 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -910,6 +929,8 @@
"checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
"checksum bitmaps 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "031043d04099746d8db04daf1fa424b2bc8bd69d92b25962dcde24da39ab64a2"
"checksum byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de"
+"checksum bytes-cast 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3196ba300c7bc9282a4331e878496cb3e9603a898a8f1446601317163e16ca52"
+"checksum bytes-cast-derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cb936af9de38476664d6b58e529aff30d482e4ce1c5e150293d00730b0d81fdb"
"checksum cc 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
--- a/rust/hg-core/Cargo.toml Mon Jan 25 19:03:27 2021 -0500
+++ b/rust/hg-core/Cargo.toml Fri Jan 15 16:11:54 2021 +0100
@@ -9,6 +9,7 @@
name = "hg"
[dependencies]
+bytes-cast = "0.1"
byteorder = "1.3.4"
hex = "0.4.2"
im-rc = "15.0.*"
--- a/rust/hg-core/src/revlog/nodemap.rs Mon Jan 25 19:03:27 2021 -0500
+++ b/rust/hg-core/src/revlog/nodemap.rs Fri Jan 15 16:11:54 2021 +0100
@@ -17,12 +17,12 @@
RevlogIndex, NULL_REVISION,
};
+use bytes_cast::{unaligned, BytesCast};
use std::cmp::max;
use std::fmt;
-use std::mem;
+use std::mem::{self, align_of, size_of};
use std::ops::Deref;
use std::ops::Index;
-use std::slice;
#[derive(Debug, PartialEq)]
pub enum NodeMapError {
@@ -149,7 +149,7 @@
/// Low level NodeTree [`Blocks`] elements
///
/// These are exactly as for instance on persistent storage.
-type RawElement = i32;
+type RawElement = unaligned::I32Be;
/// High level representation of values in NodeTree
/// [`Blocks`](struct.Block.html)
@@ -168,23 +168,24 @@
///
/// See [`Block`](struct.Block.html) for explanation about the encoding.
fn from(raw: RawElement) -> Element {
- if raw >= 0 {
- Element::Block(raw as usize)
- } else if raw == -1 {
+ let int = raw.get();
+ if int >= 0 {
+ Element::Block(int as usize)
+ } else if int == -1 {
Element::None
} else {
- Element::Rev(-raw - 2)
+ Element::Rev(-int - 2)
}
}
}
impl From<Element> for RawElement {
fn from(element: Element) -> RawElement {
- match element {
+ RawElement::from(match element {
Element::None => 0,
- Element::Block(i) => i as RawElement,
+ Element::Block(i) => i as i32,
Element::Rev(rev) => -rev - 2,
- }
+ })
}
}
@@ -212,42 +213,24 @@
/// represented at all, because we want an immutable empty nodetree
/// to be valid.
-#[derive(Copy, Clone)]
-pub struct Block([u8; BLOCK_SIZE]);
+const ELEMENTS_PER_BLOCK: usize = 16; // number of different values in a nybble
-/// Not derivable for arrays of length >32 until const generics are stable
-impl PartialEq for Block {
- fn eq(&self, other: &Self) -> bool {
- self.0[..] == other.0[..]
- }
-}
-
-pub const BLOCK_SIZE: usize = 64;
+#[derive(Copy, Clone, BytesCast, PartialEq)]
+#[repr(transparent)]
+pub struct Block([RawElement; ELEMENTS_PER_BLOCK]);
impl Block {
fn new() -> Self {
- // -1 in 2's complement to create an absent node
- let byte: u8 = 255;
- Block([byte; BLOCK_SIZE])
+ let absent_node = RawElement::from(-1);
+ Block([absent_node; ELEMENTS_PER_BLOCK])
}
fn get(&self, nybble: u8) -> Element {
- let index = nybble as usize * mem::size_of::<RawElement>();
- Element::from(RawElement::from_be_bytes([
- self.0[index],
- self.0[index + 1],
- self.0[index + 2],
- self.0[index + 3],
- ]))
+ self.0[nybble as usize].into()
}
fn set(&mut self, nybble: u8, element: Element) {
- let values = RawElement::to_be_bytes(element.into());
- let index = nybble as usize * mem::size_of::<RawElement>();
- self.0[index] = values[0];
- self.0[index + 1] = values[1];
- self.0[index + 2] = values[2];
- self.0[index + 3] = values[3];
+ self.0[nybble as usize] = element.into()
}
}
@@ -398,16 +381,17 @@
// Transmute the `Vec<Block>` to a `Vec<u8>`. Blocks are contiguous
// bytes, so this is perfectly safe.
let bytes = unsafe {
- // Assert that `Block` hasn't been changed and has no padding
- let _: [u8; 4 * BLOCK_SIZE] =
- std::mem::transmute([Block::new(); 4]);
+ // Check for compatible allocation layout.
+ // (Optimized away by constant-folding + dead code elimination.)
+ assert_eq!(size_of::<Block>(), 64);
+ assert_eq!(align_of::<Block>(), 1);
// /!\ Any use of `vec` after this is use-after-free.
// TODO: use `into_raw_parts` once stabilized
Vec::from_raw_parts(
vec.as_ptr() as *mut u8,
- vec.len() * BLOCK_SIZE,
- vec.capacity() * BLOCK_SIZE,
+ vec.len() * size_of::<Block>(),
+ vec.capacity() * size_of::<Block>(),
)
};
(readonly, bytes)
@@ -613,7 +597,7 @@
amount: usize,
) -> Self {
assert!(buffer.len() >= amount);
- let len_in_blocks = amount / BLOCK_SIZE;
+ let len_in_blocks = amount / size_of::<Block>();
NodeTreeBytes {
buffer,
len_in_blocks,
@@ -625,12 +609,11 @@
type Target = [Block];
fn deref(&self) -> &[Block] {
- unsafe {
- slice::from_raw_parts(
- (&self.buffer).as_ptr() as *const Block,
- self.len_in_blocks,
- )
- }
+ Block::slice_from_bytes(&self.buffer, self.len_in_blocks)
+ // `NodeTreeBytes::new` already asserted that `self.buffer` is
+ // large enough.
+ .unwrap()
+ .0
}
}
@@ -774,13 +757,13 @@
let mut raw = [255u8; 64];
let mut counter = 0;
- for val in [0, 15, -2, -1, -3].iter() {
- for byte in RawElement::to_be_bytes(*val).iter() {
+ for val in [0_i32, 15, -2, -1, -3].iter() {
+ for byte in val.to_be_bytes().iter() {
raw[counter] = *byte;
counter += 1;
}
}
- let block = Block(raw);
+ let (block, _) = Block::from_bytes(&raw).unwrap();
assert_eq!(block.get(0), Element::Block(0));
assert_eq!(block.get(1), Element::Block(15));
assert_eq!(block.get(3), Element::None);
@@ -1108,7 +1091,7 @@
let (_, bytes) = idx.nt.into_readonly_and_added_bytes();
// only the root block has been changed
- assert_eq!(bytes.len(), BLOCK_SIZE);
+ assert_eq!(bytes.len(), size_of::<Block>());
// big endian for -2
assert_eq!(&bytes[4..2 * 4], [255, 255, 255, 254]);
// big endian for -6
--- a/rust/hg-core/src/revlog/nodemap_docket.rs Mon Jan 25 19:03:27 2021 -0500
+++ b/rust/hg-core/src/revlog/nodemap_docket.rs Fri Jan 15 16:11:54 2021 +0100
@@ -1,5 +1,5 @@
+use bytes_cast::{unaligned, BytesCast};
use memmap::Mmap;
-use std::convert::TryInto;
use std::path::{Path, PathBuf};
use super::revlog::RevlogError;
@@ -13,6 +13,16 @@
// TODO: keep here more of the data from `parse()` when we need it
}
+#[derive(BytesCast)]
+#[repr(C)]
+struct DocketHeader {
+ uid_size: u8,
+ _tip_rev: unaligned::U64Be,
+ data_length: unaligned::U64Be,
+ _data_unused: unaligned::U64Be,
+ tip_node_size: unaligned::U64Be,
+}
+
impl NodeMapDocket {
/// Return `Ok(None)` when the caller should proceed without a persistent
/// nodemap:
@@ -36,25 +46,22 @@
Ok(bytes) => bytes,
};
- let mut input = if let Some((&ONDISK_VERSION, rest)) =
+ let input = if let Some((&ONDISK_VERSION, rest)) =
docket_bytes.split_first()
{
rest
} else {
return Ok(None);
};
- let input = &mut input;
- let uid_size = read_u8(input)? as usize;
- let _tip_rev = read_be_u64(input)?;
+ let (header, rest) = DocketHeader::from_bytes(input)?;
+ let uid_size = header.uid_size as usize;
// TODO: do we care about overflow for 4 GB+ nodemap files on 32-bit
// systems?
- let data_length = read_be_u64(input)? as usize;
- let _data_unused = read_be_u64(input)?;
- let tip_node_size = read_be_u64(input)? as usize;
- let uid = read_bytes(input, uid_size)?;
- let _tip_node = read_bytes(input, tip_node_size)?;
-
+ let tip_node_size = header.tip_node_size.get() as usize;
+ let data_length = header.data_length.get() as usize;
+ let (uid, rest) = u8::slice_from_bytes(rest, uid_size)?;
+ let (_tip_node, _rest) = u8::slice_from_bytes(rest, tip_node_size)?;
let uid =
std::str::from_utf8(uid).map_err(|_| RevlogError::Corrupted)?;
let docket = NodeMapDocket { data_length };
@@ -81,29 +88,6 @@
}
}
-fn read_bytes<'a>(
- input: &mut &'a [u8],
- count: usize,
-) -> Result<&'a [u8], RevlogError> {
- if let Some(start) = input.get(..count) {
- *input = &input[count..];
- Ok(start)
- } else {
- Err(RevlogError::Corrupted)
- }
-}
-
-fn read_u8<'a>(input: &mut &[u8]) -> Result<u8, RevlogError> {
- Ok(read_bytes(input, 1)?[0])
-}
-
-fn read_be_u64<'a>(input: &mut &[u8]) -> Result<u64, RevlogError> {
- let array = read_bytes(input, std::mem::size_of::<u64>())?
- .try_into()
- .unwrap();
- Ok(u64::from_be_bytes(array))
-}
-
fn rawdata_path(docket_path: &Path, uid: &str) -> PathBuf {
let docket_name = docket_path
.file_name()
--- a/rust/hg-core/src/revlog/revlog.rs Mon Jan 25 19:03:27 2021 -0500
+++ b/rust/hg-core/src/revlog/revlog.rs Fri Jan 15 16:11:54 2021 +0100
@@ -29,6 +29,12 @@
UnknowDataFormat(u8),
}
+impl From<bytes_cast::FromBytesError> for RevlogError {
+ fn from(_: bytes_cast::FromBytesError) -> Self {
+ RevlogError::Corrupted
+ }
+}
+
/// Read only implementation of revlog.
pub struct Revlog {
/// When index and data are not interleaved: bytes of the revlog index.