Mercurial > hg
changeset 48495:e293ff808a05
rhg: Use binary search in manifest lookup
… instead of linear scan, when looking for a single entry based on its path.
Manifest entries are sorted by path, but are variable-size so we can’t use
the standard library’s `[T]::binary_search`. We can still jump to a byte
index and then look around for entry boundaries.
Differential Revision: https://phab.mercurial-scm.org/D11932
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Thu, 16 Dec 2021 17:34:51 +0100 |
parents | d3ec82016104 |
children | 30741bbea550 |
files | rust/hg-core/src/revlog/manifest.rs rust/rhg/src/commands/status.rs |
diffstat | 2 files changed, 103 insertions(+), 29 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/revlog/manifest.rs Fri Dec 17 11:46:30 2021 +0100 +++ b/rust/hg-core/src/revlog/manifest.rs Thu Dec 16 17:34:51 2021 +0100 @@ -52,6 +52,15 @@ /// `Manifestlog` entry which knows how to interpret the `manifest` data bytes. #[derive(Debug)] pub struct Manifest { + /// Format for a manifest: flat sequence of variable-size entries, + /// sorted by path, each as: + /// + /// ```text + /// <path> \0 <hex_node_id> <flags> \n + /// ``` + /// + /// The last entry is also terminated by a newline character. + /// Flags is one of `b""` (the empty string), `b"x"`, `b"l"`, or `b"t"`. bytes: Vec<u8>, } @@ -62,44 +71,84 @@ self.bytes .split(|b| b == &b'\n') .filter(|line| !line.is_empty()) - .map(|line| { - let (path, rest) = line.split_2(b'\0').ok_or_else(|| { - HgError::corrupted("manifest line should contain \\0") - })?; - let path = HgPath::new(path); - let (hex_node_id, flags) = match rest.split_last() { - Some((&b'x', rest)) => (rest, Some(b'x')), - Some((&b'l', rest)) => (rest, Some(b'l')), - Some((&b't', rest)) => (rest, Some(b't')), - _ => (rest, None), - }; - Ok(ManifestEntry { - path, - hex_node_id, - flags, - }) - }) + .map(ManifestEntry::from_raw) } /// If the given path is in this manifest, return its filelog node ID - pub fn find_file( + pub fn find_by_path( &self, path: &HgPath, ) -> Result<Option<ManifestEntry>, HgError> { - // TODO: use binary search instead of linear scan. This may involve - // building (and caching) an index of the byte indicex of each manifest - // line. + use std::cmp::Ordering::*; + let path = path.as_bytes(); + // Both boundaries of this `&[u8]` slice are always at the boundary of + // an entry + let mut bytes = &*self.bytes; - // TODO: use try_find when available (if still using linear scan) - // https://github.com/rust-lang/rust/issues/63178 - for entry in self.iter() { - let entry = entry?; - if entry.path == path { - return Ok(Some(entry)); + // Binary search algorithm derived from `[T]::binary_search_by` + // <https://github.com/rust-lang/rust/blob/1.57.0/library/core/src/slice/mod.rs#L2221> + // except we don’t have a slice of entries. Instead we jump to the + // middle of the byte slice and look around for entry delimiters + // (newlines). + while let Some(entry_range) = Self::find_entry_near_middle_of(bytes)? { + let (entry_path, rest) = + ManifestEntry::split_path(&bytes[entry_range.clone()])?; + let cmp = entry_path.cmp(path); + if cmp == Less { + let after_newline = entry_range.end + 1; + bytes = &bytes[after_newline..]; + } else if cmp == Greater { + bytes = &bytes[..entry_range.start]; + } else { + return Ok(Some(ManifestEntry::from_path_and_rest( + entry_path, rest, + ))); } } Ok(None) } + + /// If there is at least one, return the byte range of an entry *excluding* + /// the final newline. + fn find_entry_near_middle_of( + bytes: &[u8], + ) -> Result<Option<std::ops::Range<usize>>, HgError> { + let len = bytes.len(); + if len > 0 { + let middle = bytes.len() / 2; + // Integer division rounds down, so `middle < len`. + let (before, after) = bytes.split_at(middle); + let is_newline = |&byte: &u8| byte == b'\n'; + let entry_start = match before.iter().rposition(is_newline) { + Some(i) => i + 1, + None => 0, // We choose the first entry in `bytes` + }; + let entry_end = match after.iter().position(is_newline) { + Some(i) => { + // No `+ 1` here to exclude this newline from the range + middle + i + } + None => { + // In a well-formed manifest: + // + // * Since `len > 0`, `bytes` contains at least one entry + // * Every entry ends with a newline + // * Since `middle < len`, `after` contains at least the + // newline at the end of the last entry of `bytes`. + // + // We didn’t find a newline, so this manifest is not + // well-formed. + return Err(HgError::corrupted( + "manifest entry without \\n delimiter", + )); + } + }; + Ok(Some(entry_start..entry_end)) + } else { + // len == 0 + Ok(None) + } + } } /// `Manifestlog` entry which knows how to interpret the `manifest` data bytes. @@ -112,7 +161,32 @@ pub flags: Option<u8>, } -impl ManifestEntry<'_> { +impl<'a> ManifestEntry<'a> { + fn split_path(bytes: &[u8]) -> Result<(&[u8], &[u8]), HgError> { + bytes.split_2(b'\0').ok_or_else(|| { + HgError::corrupted("manifest entry without \\0 delimiter") + }) + } + + fn from_path_and_rest(path: &'a [u8], rest: &'a [u8]) -> Self { + let (hex_node_id, flags) = match rest.split_last() { + Some((&b'x', rest)) => (rest, Some(b'x')), + Some((&b'l', rest)) => (rest, Some(b'l')), + Some((&b't', rest)) => (rest, Some(b't')), + _ => (rest, None), + }; + Self { + path: HgPath::new(path), + hex_node_id, + flags, + } + } + + fn from_raw(bytes: &'a [u8]) -> Result<Self, HgError> { + let (path, rest) = Self::split_path(bytes)?; + Ok(Self::from_path_and_rest(path, rest)) + } + pub fn node_id(&self) -> Result<Node, HgError> { Node::from_hex_for_repo(self.hex_node_id) }
--- a/rust/rhg/src/commands/status.rs Fri Dec 17 11:46:30 2021 +0100 +++ b/rust/rhg/src/commands/status.rs Thu Dec 16 17:34:51 2021 +0100 @@ -473,7 +473,7 @@ }; let entry = manifest - .find_file(hg_path)? + .find_by_path(hg_path)? .expect("ambgious file not in p1"); if entry.flags != fs_flags { return Ok(true);