Mercurial > hg
changeset 46090:9eb07ab3f2d4
rhg: use persistent nodemap when available
… for node ID → revision number lookups, instead on linear scan in a revlog.
Differential Revision: https://phab.mercurial-scm.org/D9520
author | Simon Sapin <simon-commits@exyr.org> |
---|---|
date | Fri, 04 Dec 2020 17:27:10 +0100 |
parents | 8ff2d8359d0f |
children | af3a6900f893 |
files | rust/hg-core/src/requirements.rs rust/hg-core/src/revlog.rs rust/hg-core/src/revlog/index.rs rust/hg-core/src/revlog/nodemap_docket.rs rust/hg-core/src/revlog/revlog.rs rust/hg-core/src/utils.rs tests/test-rhg.t |
diffstat | 7 files changed, 183 insertions(+), 6 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/requirements.rs Mon Dec 07 18:06:53 2020 +0100 +++ b/rust/hg-core/src/requirements.rs Fri Dec 04 17:27:10 2020 +0100 @@ -69,4 +69,8 @@ "revlogv1", "sparserevlog", "store", + // As of this writing everything rhg does is read-only. + // When it starts writing to the repository, it’ll need to either keep the + // persistent nodemap up to date or remove this entry: + "persistent-nodemap", ];
--- a/rust/hg-core/src/revlog.rs Mon Dec 07 18:06:53 2020 +0100 +++ b/rust/hg-core/src/revlog.rs Fri Dec 04 17:27:10 2020 +0100 @@ -7,6 +7,7 @@ pub mod node; pub mod nodemap; +mod nodemap_docket; pub mod path_encode; pub use node::{Node, NodeError, NodePrefix, NodePrefixRef}; pub mod changelog;
--- a/rust/hg-core/src/revlog/index.rs Mon Dec 07 18:06:53 2020 +0100 +++ b/rust/hg-core/src/revlog/index.rs Fri Dec 04 17:27:10 2020 +0100 @@ -132,6 +132,16 @@ } } +impl super::RevlogIndex for Index { + fn len(&self) -> usize { + self.len() + } + + fn node(&self, rev: Revision) -> Option<&Node> { + self.get_entry(rev).map(|entry| entry.hash()) + } +} + #[derive(Debug)] pub struct IndexEntry<'a> { bytes: &'a [u8], @@ -190,7 +200,7 @@ /// /// Currently, SHA-1 is used and only the first 20 bytes of this field /// are used. - pub fn hash(&self) -> &Node { + pub fn hash(&self) -> &'a Node { (&self.bytes[32..52]).try_into().unwrap() } }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/revlog/nodemap_docket.rs Fri Dec 04 17:27:10 2020 +0100 @@ -0,0 +1,119 @@ +use memmap::Mmap; +use std::convert::TryInto; +use std::path::{Path, PathBuf}; + +use super::revlog::{mmap_open, RevlogError}; +use crate::utils::strip_suffix; + +const ONDISK_VERSION: u8 = 1; + +pub(super) struct NodeMapDocket { + pub data_length: usize, + // TODO: keep here more of the data from `parse()` when we need it +} + +impl NodeMapDocket { + /// Return `Ok(None)` when the caller should proceed without a persistent + /// nodemap: + /// + /// * This revlog does not have a `.n` docket file (it is not generated for + /// small revlogs), or + /// * The docket has an unsupported version number (repositories created by + /// later hg, maybe that should be a requirement instead?), or + /// * The docket file points to a missing (likely deleted) data file (this + /// can happen in a rare race condition). + pub fn read_from_file( + index_path: &Path, + ) -> Result<Option<(Self, Mmap)>, RevlogError> { + let docket_path = index_path.with_extension("n"); + let docket_bytes = match std::fs::read(&docket_path) { + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + return Ok(None) + } + Err(e) => return Err(RevlogError::IoError(e)), + Ok(bytes) => bytes, + }; + + let mut input = if let Some((&ONDISK_VERSION, rest)) = + docket_bytes.split_first() + { + rest + } else { + return Ok(None); + }; + let input = &mut input; + + let uid_size = read_u8(input)? as usize; + let _tip_rev = read_be_u64(input)?; + // TODO: do we care about overflow for 4 GB+ nodemap files on 32-bit + // systems? + let data_length = read_be_u64(input)? as usize; + let _data_unused = read_be_u64(input)?; + let tip_node_size = read_be_u64(input)? as usize; + let uid = read_bytes(input, uid_size)?; + let _tip_node = read_bytes(input, tip_node_size)?; + + let uid = + std::str::from_utf8(uid).map_err(|_| RevlogError::Corrupted)?; + let docket = NodeMapDocket { data_length }; + + let data_path = rawdata_path(&docket_path, uid); + // TODO: use `std::fs::read` here when the `persistent-nodemap.mmap` + // config is false? + match mmap_open(&data_path) { + Ok(mmap) => { + if mmap.len() >= data_length { + Ok(Some((docket, mmap))) + } else { + Err(RevlogError::Corrupted) + } + } + Err(error) => { + if error.kind() == std::io::ErrorKind::NotFound { + Ok(None) + } else { + Err(RevlogError::IoError(error)) + } + } + } + } +} + +fn read_bytes<'a>( + input: &mut &'a [u8], + count: usize, +) -> Result<&'a [u8], RevlogError> { + if let Some(start) = input.get(..count) { + *input = &input[count..]; + Ok(start) + } else { + Err(RevlogError::Corrupted) + } +} + +fn read_u8<'a>(input: &mut &[u8]) -> Result<u8, RevlogError> { + Ok(read_bytes(input, 1)?[0]) +} + +fn read_be_u64<'a>(input: &mut &[u8]) -> Result<u64, RevlogError> { + let array = read_bytes(input, std::mem::size_of::<u64>())? + .try_into() + .unwrap(); + Ok(u64::from_be_bytes(array)) +} + +fn rawdata_path(docket_path: &Path, uid: &str) -> PathBuf { + let docket_name = docket_path + .file_name() + .expect("expected a base name") + .to_str() + .expect("expected an ASCII file name in the store"); + let prefix = strip_suffix(docket_name, ".n.a") + .or_else(|| strip_suffix(docket_name, ".n")) + .expect("expected docket path in .n or .n.a"); + let name = format!("{}-{}.nd", prefix, uid); + docket_path + .parent() + .expect("expected a non-root path") + .join(name) +}
--- a/rust/hg-core/src/revlog/revlog.rs Mon Dec 07 18:06:53 2020 +0100 +++ b/rust/hg-core/src/revlog/revlog.rs Fri Dec 04 17:27:10 2020 +0100 @@ -14,6 +14,9 @@ use super::index::Index; use super::node::{NodePrefixRef, NODE_BYTES_LENGTH, NULL_NODE}; +use super::nodemap; +use super::nodemap::NodeMap; +use super::nodemap_docket::NodeMapDocket; use super::patch; use crate::revlog::Revision; @@ -27,7 +30,7 @@ UnknowDataFormat(u8), } -fn mmap_open(path: &Path) -> Result<Mmap, std::io::Error> { +pub(super) fn mmap_open(path: &Path) -> Result<Mmap, std::io::Error> { let file = File::open(path)?; let mmap = unsafe { MmapOptions::new().map(&file) }?; Ok(mmap) @@ -41,6 +44,8 @@ index: Index, /// When index and data are not interleaved: bytes of the revlog data data_bytes: Option<Box<dyn Deref<Target = [u8]> + Send>>, + /// When present on disk: the persistent nodemap for this revlog + nodemap: Option<nodemap::NodeTree>, } impl Revlog { @@ -77,7 +82,20 @@ Some(Box::new(data_mmap)) }; - Ok(Revlog { index, data_bytes }) + let nodemap = NodeMapDocket::read_from_file(index_path)?.map( + |(docket, data)| { + nodemap::NodeTree::load_bytes( + Box::new(data), + docket.data_length, + ) + }, + ); + + Ok(Revlog { + index, + data_bytes, + nodemap, + }) } /// Return number of entries of the `Revlog`. @@ -96,8 +114,20 @@ &self, node: NodePrefixRef, ) -> Result<Revision, RevlogError> { - // This is brute force. But it is fast enough for now. - // Optimization will come later. + if let Some(nodemap) = &self.nodemap { + return nodemap + .find_bin(&self.index, node) + // TODO: propagate details of this error: + .map_err(|_| RevlogError::Corrupted)? + .ok_or(RevlogError::InvalidRevision); + } + + // Fallback to linear scan when a persistent nodemap is not present. + // This happens when the persistent-nodemap experimental feature is not + // enabled, or for small revlogs. + // + // TODO: consider building a non-persistent nodemap in memory to + // optimize these cases. let mut found_by_prefix = None; for rev in (0..self.len() as Revision).rev() { let index_entry =
--- a/rust/hg-core/src/utils.rs Mon Dec 07 18:06:53 2020 +0100 +++ b/rust/hg-core/src/utils.rs Fri Dec 04 17:27:10 2020 +0100 @@ -167,3 +167,12 @@ self.as_bytes().escaped_bytes() } } + +// TODO: use the str method when we require Rust 1.45 +pub(crate) fn strip_suffix<'a>(s: &'a str, suffix: &str) -> Option<&'a str> { + if s.ends_with(suffix) { + Some(&s[..s.len() - suffix.len()]) + } else { + None + } +}
--- a/tests/test-rhg.t Mon Dec 07 18:06:53 2020 +0100 +++ b/tests/test-rhg.t Fri Dec 04 17:27:10 2020 +0100 @@ -196,5 +196,9 @@ .hg/store/00changelog.d .hg/store/00changelog.i .hg/store/00changelog.n + +Specifying revisions by changeset ID $ rhg files -r c3ae8dec9fad - [252] + of + $ rhg cat -r c3ae8dec9fad of + r5000