Mercurial > hg
changeset 46390:0800aa42bb4c
rust: use the bytes-cast crate to parse persistent nodemaps
This crate casts pointers to custom structs, with compile-time safety checks,
for easy and efficient binary data parsing.
See https://crates.io/crates/bytes-cast and
https://docs.rs/bytes-cast/0.1.0/bytes_cast/
Differential Revision: https://phab.mercurial-scm.org/D9788
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Fri, 15 Jan 2021 16:11:54 +0100 |
parents | 38b9a63d3a13 |
children | cfb6c10c08c2 |
files | rust/Cargo.lock rust/hg-core/Cargo.toml rust/hg-core/src/revlog/nodemap.rs rust/hg-core/src/revlog/nodemap_docket.rs rust/hg-core/src/revlog/revlog.rs |
diffstat | 5 files changed, 81 insertions(+), 86 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/Cargo.lock Mon Jan 25 19:03:27 2021 -0500 +++ b/rust/Cargo.lock Fri Jan 15 16:11:54 2021 +0100 @@ -55,6 +55,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] +name = "bytes-cast" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "bytes-cast-derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "bytes-cast-derive" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 1.0.7 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 1.0.54 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] name = "cc" version = "1.0.66" source = "registry+https://github.com/rust-lang/crates.io-index" @@ -277,6 +295,7 @@ version = "0.1.0" dependencies = [ "byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)", + "bytes-cast 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.3 (registry+https://github.com/rust-lang/crates.io-index)", "crossbeam-channel 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)", "flate2 1.0.19 (registry+https://github.com/rust-lang/crates.io-index)", @@ -910,6 +929,8 @@ "checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" "checksum bitmaps 2.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "031043d04099746d8db04daf1fa424b2bc8bd69d92b25962dcde24da39ab64a2" "checksum byteorder 1.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "08c48aae112d48ed9f069b33538ea9e3e90aa263cfa3d1c24309612b1f7472de" +"checksum bytes-cast 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3196ba300c7bc9282a4331e878496cb3e9603a898a8f1446601317163e16ca52" +"checksum bytes-cast-derive 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "cb936af9de38476664d6b58e529aff30d482e4ce1c5e150293d00730b0d81fdb" "checksum cc 1.0.66 (registry+https://github.com/rust-lang/crates.io-index)" = "4c0496836a84f8d0495758516b8621a622beb77c0fed418570e50764093ced48" "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" "checksum cfg-if 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
--- a/rust/hg-core/Cargo.toml Mon Jan 25 19:03:27 2021 -0500 +++ b/rust/hg-core/Cargo.toml Fri Jan 15 16:11:54 2021 +0100 @@ -9,6 +9,7 @@ name = "hg" [dependencies] +bytes-cast = "0.1" byteorder = "1.3.4" hex = "0.4.2" im-rc = "15.0.*"
--- a/rust/hg-core/src/revlog/nodemap.rs Mon Jan 25 19:03:27 2021 -0500 +++ b/rust/hg-core/src/revlog/nodemap.rs Fri Jan 15 16:11:54 2021 +0100 @@ -17,12 +17,12 @@ RevlogIndex, NULL_REVISION, }; +use bytes_cast::{unaligned, BytesCast}; use std::cmp::max; use std::fmt; -use std::mem; +use std::mem::{self, align_of, size_of}; use std::ops::Deref; use std::ops::Index; -use std::slice; #[derive(Debug, PartialEq)] pub enum NodeMapError { @@ -149,7 +149,7 @@ /// Low level NodeTree [`Blocks`] elements /// /// These are exactly as for instance on persistent storage. -type RawElement = i32; +type RawElement = unaligned::I32Be; /// High level representation of values in NodeTree /// [`Blocks`](struct.Block.html) @@ -168,23 +168,24 @@ /// /// See [`Block`](struct.Block.html) for explanation about the encoding. fn from(raw: RawElement) -> Element { - if raw >= 0 { - Element::Block(raw as usize) - } else if raw == -1 { + let int = raw.get(); + if int >= 0 { + Element::Block(int as usize) + } else if int == -1 { Element::None } else { - Element::Rev(-raw - 2) + Element::Rev(-int - 2) } } } impl From<Element> for RawElement { fn from(element: Element) -> RawElement { - match element { + RawElement::from(match element { Element::None => 0, - Element::Block(i) => i as RawElement, + Element::Block(i) => i as i32, Element::Rev(rev) => -rev - 2, - } + }) } } @@ -212,42 +213,24 @@ /// represented at all, because we want an immutable empty nodetree /// to be valid. -#[derive(Copy, Clone)] -pub struct Block([u8; BLOCK_SIZE]); +const ELEMENTS_PER_BLOCK: usize = 16; // number of different values in a nybble -/// Not derivable for arrays of length >32 until const generics are stable -impl PartialEq for Block { - fn eq(&self, other: &Self) -> bool { - self.0[..] == other.0[..] - } -} - -pub const BLOCK_SIZE: usize = 64; +#[derive(Copy, Clone, BytesCast, PartialEq)] +#[repr(transparent)] +pub struct Block([RawElement; ELEMENTS_PER_BLOCK]); impl Block { fn new() -> Self { - // -1 in 2's complement to create an absent node - let byte: u8 = 255; - Block([byte; BLOCK_SIZE]) + let absent_node = RawElement::from(-1); + Block([absent_node; ELEMENTS_PER_BLOCK]) } fn get(&self, nybble: u8) -> Element { - let index = nybble as usize * mem::size_of::<RawElement>(); - Element::from(RawElement::from_be_bytes([ - self.0[index], - self.0[index + 1], - self.0[index + 2], - self.0[index + 3], - ])) + self.0[nybble as usize].into() } fn set(&mut self, nybble: u8, element: Element) { - let values = RawElement::to_be_bytes(element.into()); - let index = nybble as usize * mem::size_of::<RawElement>(); - self.0[index] = values[0]; - self.0[index + 1] = values[1]; - self.0[index + 2] = values[2]; - self.0[index + 3] = values[3]; + self.0[nybble as usize] = element.into() } } @@ -398,16 +381,17 @@ // Transmute the `Vec<Block>` to a `Vec<u8>`. Blocks are contiguous // bytes, so this is perfectly safe. let bytes = unsafe { - // Assert that `Block` hasn't been changed and has no padding - let _: [u8; 4 * BLOCK_SIZE] = - std::mem::transmute([Block::new(); 4]); + // Check for compatible allocation layout. + // (Optimized away by constant-folding + dead code elimination.) + assert_eq!(size_of::<Block>(), 64); + assert_eq!(align_of::<Block>(), 1); // /!\ Any use of `vec` after this is use-after-free. // TODO: use `into_raw_parts` once stabilized Vec::from_raw_parts( vec.as_ptr() as *mut u8, - vec.len() * BLOCK_SIZE, - vec.capacity() * BLOCK_SIZE, + vec.len() * size_of::<Block>(), + vec.capacity() * size_of::<Block>(), ) }; (readonly, bytes) @@ -613,7 +597,7 @@ amount: usize, ) -> Self { assert!(buffer.len() >= amount); - let len_in_blocks = amount / BLOCK_SIZE; + let len_in_blocks = amount / size_of::<Block>(); NodeTreeBytes { buffer, len_in_blocks, @@ -625,12 +609,11 @@ type Target = [Block]; fn deref(&self) -> &[Block] { - unsafe { - slice::from_raw_parts( - (&self.buffer).as_ptr() as *const Block, - self.len_in_blocks, - ) - } + Block::slice_from_bytes(&self.buffer, self.len_in_blocks) + // `NodeTreeBytes::new` already asserted that `self.buffer` is + // large enough. + .unwrap() + .0 } } @@ -774,13 +757,13 @@ let mut raw = [255u8; 64]; let mut counter = 0; - for val in [0, 15, -2, -1, -3].iter() { - for byte in RawElement::to_be_bytes(*val).iter() { + for val in [0_i32, 15, -2, -1, -3].iter() { + for byte in val.to_be_bytes().iter() { raw[counter] = *byte; counter += 1; } } - let block = Block(raw); + let (block, _) = Block::from_bytes(&raw).unwrap(); assert_eq!(block.get(0), Element::Block(0)); assert_eq!(block.get(1), Element::Block(15)); assert_eq!(block.get(3), Element::None); @@ -1108,7 +1091,7 @@ let (_, bytes) = idx.nt.into_readonly_and_added_bytes(); // only the root block has been changed - assert_eq!(bytes.len(), BLOCK_SIZE); + assert_eq!(bytes.len(), size_of::<Block>()); // big endian for -2 assert_eq!(&bytes[4..2 * 4], [255, 255, 255, 254]); // big endian for -6
--- a/rust/hg-core/src/revlog/nodemap_docket.rs Mon Jan 25 19:03:27 2021 -0500 +++ b/rust/hg-core/src/revlog/nodemap_docket.rs Fri Jan 15 16:11:54 2021 +0100 @@ -1,5 +1,5 @@ +use bytes_cast::{unaligned, BytesCast}; use memmap::Mmap; -use std::convert::TryInto; use std::path::{Path, PathBuf}; use super::revlog::RevlogError; @@ -13,6 +13,16 @@ // TODO: keep here more of the data from `parse()` when we need it } +#[derive(BytesCast)] +#[repr(C)] +struct DocketHeader { + uid_size: u8, + _tip_rev: unaligned::U64Be, + data_length: unaligned::U64Be, + _data_unused: unaligned::U64Be, + tip_node_size: unaligned::U64Be, +} + impl NodeMapDocket { /// Return `Ok(None)` when the caller should proceed without a persistent /// nodemap: @@ -36,25 +46,22 @@ Ok(bytes) => bytes, }; - let mut input = if let Some((&ONDISK_VERSION, rest)) = + let input = if let Some((&ONDISK_VERSION, rest)) = docket_bytes.split_first() { rest } else { return Ok(None); }; - let input = &mut input; - let uid_size = read_u8(input)? as usize; - let _tip_rev = read_be_u64(input)?; + let (header, rest) = DocketHeader::from_bytes(input)?; + let uid_size = header.uid_size as usize; // TODO: do we care about overflow for 4 GB+ nodemap files on 32-bit // systems? - let data_length = read_be_u64(input)? as usize; - let _data_unused = read_be_u64(input)?; - let tip_node_size = read_be_u64(input)? as usize; - let uid = read_bytes(input, uid_size)?; - let _tip_node = read_bytes(input, tip_node_size)?; - + let tip_node_size = header.tip_node_size.get() as usize; + let data_length = header.data_length.get() as usize; + let (uid, rest) = u8::slice_from_bytes(rest, uid_size)?; + let (_tip_node, _rest) = u8::slice_from_bytes(rest, tip_node_size)?; let uid = std::str::from_utf8(uid).map_err(|_| RevlogError::Corrupted)?; let docket = NodeMapDocket { data_length }; @@ -81,29 +88,6 @@ } } -fn read_bytes<'a>( - input: &mut &'a [u8], - count: usize, -) -> Result<&'a [u8], RevlogError> { - if let Some(start) = input.get(..count) { - *input = &input[count..]; - Ok(start) - } else { - Err(RevlogError::Corrupted) - } -} - -fn read_u8<'a>(input: &mut &[u8]) -> Result<u8, RevlogError> { - Ok(read_bytes(input, 1)?[0]) -} - -fn read_be_u64<'a>(input: &mut &[u8]) -> Result<u64, RevlogError> { - let array = read_bytes(input, std::mem::size_of::<u64>())? - .try_into() - .unwrap(); - Ok(u64::from_be_bytes(array)) -} - fn rawdata_path(docket_path: &Path, uid: &str) -> PathBuf { let docket_name = docket_path .file_name()
--- a/rust/hg-core/src/revlog/revlog.rs Mon Jan 25 19:03:27 2021 -0500 +++ b/rust/hg-core/src/revlog/revlog.rs Fri Jan 15 16:11:54 2021 +0100 @@ -29,6 +29,12 @@ UnknowDataFormat(u8), } +impl From<bytes_cast::FromBytesError> for RevlogError { + fn from(_: bytes_cast::FromBytesError) -> Self { + RevlogError::Corrupted + } +} + /// Read only implementation of revlog. pub struct Revlog { /// When index and data are not interleaved: bytes of the revlog index.