dirstate-v2: Store a hash of ignore patterns (.hgignore)
Later, this will help extend `read_dir` caching to directories that contain ignored
files (but no unknown files). Such a cache must be invalidated when ignore patterns
change, since a formerly-ignored file might become unknown.
This helps the default configuration of `hg status` where unknown files must
be listed, but ignored files are not.
Differential Revision: https://phab.mercurial-scm.org/D10836
--- a/rust/hg-core/src/dirstate/status.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate/status.rs Wed Jun 02 11:25:18 2021 +0200
@@ -351,7 +351,7 @@
let (ignore_fn, warnings): (IgnoreFnType, _) =
if options.list_ignored || options.list_unknown {
- get_ignore_function(ignore_files, &root_dir)?
+ get_ignore_function(ignore_files, &root_dir, &mut |_| {})?
} else {
(Box::new(|&_| true), vec![])
};
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs Wed Jun 02 11:25:18 2021 +0200
@@ -37,6 +37,9 @@
/// Number of nodes anywhere in the tree that have
/// `.copy_source.is_some()`.
pub(super) nodes_with_copy_source_count: u32,
+
+ /// See on_disk::Header
+ pub(super) ignore_patterns_hash: on_disk::IgnorePatternsHash,
}
/// Using a plain `HgPathBuf` of the full path from the repository root as a
@@ -385,6 +388,7 @@
root: ChildNodes::default(),
nodes_with_entry_count: 0,
nodes_with_copy_source_count: 0,
+ ignore_patterns_hash: [0; on_disk::IGNORE_PATTERNS_HASH_LEN],
}
}
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Wed Jun 02 11:25:18 2021 +0200
@@ -28,6 +28,9 @@
/// `.hg/requires` already governs which format should be used.
pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n";
+pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20;
+pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN];
+
#[derive(BytesCast)]
#[repr(C)]
struct Header {
@@ -40,6 +43,27 @@
root: ChildNodes,
nodes_with_entry_count: Size,
nodes_with_copy_source_count: Size,
+
+ /// If non-zero, a hash of ignore files that were used for some previous
+ /// run of the `status` algorithm.
+ ///
+ /// We define:
+ ///
+ /// * "Root" ignore files are `.hgignore` at the root of the repository if
+ /// it exists, and files from `ui.ignore.*` config. This set of files is
+ /// then sorted by the string representation of their path.
+ /// * The "expanded contents" of an ignore file is the byte string made
+ /// by concatenating its contents with the "expanded contents" of other
+ /// files included with `include:` or `subinclude:` directives, in inclusion
+ /// order. This definition is recursive, as included files can
+ /// themselves include more files.
+ ///
+ /// This hash is defined as the SHA-1 of the concatenation (in sorted
+ /// order) of the "expanded contents" of each "root" ignore file.
+ /// (Note that computing this does not require actually concatenating byte
+ /// strings into contiguous memory, instead SHA-1 hashing can be done
+ /// incrementally.)
+ ignore_patterns_hash: IgnorePatternsHash,
}
#[derive(BytesCast)]
@@ -145,7 +169,7 @@
/// Make sure that size-affecting changes are made knowingly
fn _static_assert_size_of() {
- let _ = std::mem::transmute::<Header, [u8; 72]>;
+ let _ = std::mem::transmute::<Header, [u8; 92]>;
let _ = std::mem::transmute::<Node, [u8; 57]>;
}
@@ -197,6 +221,7 @@
nodes_with_copy_source_count: header
.nodes_with_copy_source_count
.get(),
+ ignore_patterns_hash: header.ignore_patterns_hash,
};
let parents = Some(header.parents.clone());
Ok((dirstate_map, parents))
@@ -473,6 +498,7 @@
nodes_with_copy_source_count: dirstate_map
.nodes_with_copy_source_count
.into(),
+ ignore_patterns_hash: dirstate_map.ignore_patterns_hash,
};
out[..header_len].copy_from_slice(header.as_bytes());
Ok(out)
--- a/rust/hg-core/src/dirstate_tree/status.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/dirstate_tree/status.rs Wed Jun 02 11:25:18 2021 +0200
@@ -21,6 +21,7 @@
use crate::StatusOptions;
use micro_timer::timed;
use rayon::prelude::*;
+use sha1::{Digest, Sha1};
use std::borrow::Cow;
use std::io;
use std::path::Path;
@@ -45,11 +46,20 @@
ignore_files: Vec<PathBuf>,
options: StatusOptions,
) -> Result<(DirstateStatus<'on_disk>, Vec<PatternFileWarning>), StatusError> {
- let (ignore_fn, warnings): (IgnoreFnType, _) =
+ let (ignore_fn, warnings, patterns_changed): (IgnoreFnType, _, _) =
if options.list_ignored || options.list_unknown {
- get_ignore_function(ignore_files, &root_dir)?
+ let mut hasher = Sha1::new();
+ let (ignore_fn, warnings) = get_ignore_function(
+ ignore_files,
+ &root_dir,
+ &mut |pattern_bytes| hasher.update(pattern_bytes),
+ )?;
+ let new_hash = *hasher.finalize().as_ref();
+ let changed = new_hash != dmap.ignore_patterns_hash;
+ dmap.ignore_patterns_hash = new_hash;
+ (ignore_fn, warnings, Some(changed))
} else {
- (Box::new(|&_| true), vec![])
+ (Box::new(|&_| true), vec![], None)
};
let common = StatusCommon {
@@ -58,7 +68,8 @@
matcher,
ignore_fn,
outcome: Default::default(),
- cached_directory_mtimes_to_add: Default::default(),
+ ignore_patterns_have_changed: patterns_changed,
+ new_cachable_directories: Default::default(),
filesystem_time_at_status_start: filesystem_now(&root_dir).ok(),
};
let is_at_repo_root = true;
@@ -79,9 +90,12 @@
is_at_repo_root,
)?;
let mut outcome = common.outcome.into_inner().unwrap();
- let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap();
- outcome.dirty = !to_add.is_empty();
- for (path, mtime) in &to_add {
+ let new_cachable = common.new_cachable_directories.into_inner().unwrap();
+
+ outcome.dirty = common.ignore_patterns_have_changed == Some(true)
+ || !new_cachable.is_empty();
+
+ for (path, mtime) in &new_cachable {
let node = DirstateMap::get_or_insert_node(
dmap.on_disk,
&mut dmap.root,
@@ -96,6 +110,7 @@
}
}
}
+
Ok((outcome, warnings))
}
@@ -107,8 +122,13 @@
matcher: &'a (dyn Matcher + Sync),
ignore_fn: IgnoreFnType<'a>,
outcome: Mutex<DirstateStatus<'on_disk>>,
- cached_directory_mtimes_to_add:
- Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+ new_cachable_directories: Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>,
+
+ /// Whether ignore files like `.hgignore` have changed since the previous
+ /// time a `status()` call wrote their hash to the dirstate. `None` means
+ /// we don’t know as this run doesn’t list either ignored or unknown files
+ /// and therefore isn’t reading `.hgignore`.
+ ignore_patterns_have_changed: Option<bool>,
/// The current time at the start of the `status()` algorithm, as measured
/// and possibly truncated by the filesystem.
@@ -422,7 +442,7 @@
let hg_path = dirstate_node
.full_path_borrowed(self.dmap.on_disk)?
.detach_from_tree();
- self.cached_directory_mtimes_to_add
+ self.new_cachable_directories
.lock()
.unwrap()
.push((hg_path, timestamp))
--- a/rust/hg-core/src/filepatterns.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/filepatterns.rs Wed Jun 02 11:25:18 2021 +0200
@@ -17,8 +17,6 @@
};
use lazy_static::lazy_static;
use regex::bytes::{NoExpand, Regex};
-use std::fs::File;
-use std::io::Read;
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::vec::Vec;
@@ -410,24 +408,19 @@
pub fn read_pattern_file(
file_path: &Path,
warn: bool,
+ inspect_pattern_bytes: &mut impl FnMut(&[u8]),
) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
- let mut f = match File::open(file_path) {
- Ok(f) => Ok(f),
- Err(e) => match e.kind() {
- std::io::ErrorKind::NotFound => {
- return Ok((
- vec![],
- vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
- ))
- }
- _ => Err(e),
- },
- }?;
- let mut contents = Vec::new();
-
- f.read_to_end(&mut contents)?;
-
- Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
+ match std::fs::read(file_path) {
+ Ok(contents) => {
+ inspect_pattern_bytes(&contents);
+ parse_pattern_file_contents(&contents, file_path, warn)
+ }
+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok((
+ vec![],
+ vec![PatternFileWarning::NoSuchFile(file_path.to_owned())],
+ )),
+ Err(e) => Err(e.into()),
+ }
}
/// Represents an entry in an "ignore" file.
@@ -458,8 +451,10 @@
pub fn get_patterns_from_file(
pattern_file: &Path,
root_dir: &Path,
+ inspect_pattern_bytes: &mut impl FnMut(&[u8]),
) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> {
- let (patterns, mut warnings) = read_pattern_file(pattern_file, true)?;
+ let (patterns, mut warnings) =
+ read_pattern_file(pattern_file, true, inspect_pattern_bytes)?;
let patterns = patterns
.into_iter()
.flat_map(|entry| -> PatternResult<_> {
@@ -467,8 +462,11 @@
PatternSyntax::Include => {
let inner_include =
root_dir.join(get_path_from_bytes(&entry.pattern));
- let (inner_pats, inner_warnings) =
- get_patterns_from_file(&inner_include, root_dir)?;
+ let (inner_pats, inner_warnings) = get_patterns_from_file(
+ &inner_include,
+ root_dir,
+ inspect_pattern_bytes,
+ )?;
warnings.extend(inner_warnings);
inner_pats
}
@@ -482,6 +480,7 @@
get_patterns_from_file(
&sub_include.path,
&sub_include.root,
+ inspect_pattern_bytes,
)?;
sub_include.included_patterns = inner_patterns;
warnings.extend(inner_warnings);
--- a/rust/hg-core/src/matchers.rs Mon Jun 07 17:29:32 2021 +0530
+++ b/rust/hg-core/src/matchers.rs Wed Jun 02 11:25:18 2021 +0200
@@ -564,8 +564,9 @@
/// function that checks whether a given file (in the general sense) should be
/// ignored.
pub fn get_ignore_function<'a>(
- all_pattern_files: Vec<PathBuf>,
+ mut all_pattern_files: Vec<PathBuf>,
root_dir: &Path,
+ inspect_pattern_bytes: &mut impl FnMut(&[u8]),
) -> PatternResult<(
Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>,
Vec<PatternFileWarning>,
@@ -573,9 +574,20 @@
let mut all_patterns = vec![];
let mut all_warnings = vec![];
+ // Sort to make the ordering of calls to `inspect_pattern_bytes`
+ // deterministic even if the ordering of `all_pattern_files` is not (such
+ // as when the iteration order of a Python dict or Rust HashMap is involved).
+ // Sort by "string" representation instead of the default by component
+ // (with a Rust-specific definition of a component)
+ all_pattern_files
+ .sort_unstable_by(|a, b| a.as_os_str().cmp(b.as_os_str()));
+
for pattern_file in &all_pattern_files {
- let (patterns, warnings) =
- get_patterns_from_file(pattern_file, root_dir)?;
+ let (patterns, warnings) = get_patterns_from_file(
+ pattern_file,
+ root_dir,
+ inspect_pattern_bytes,
+ )?;
all_patterns.extend(patterns.to_owned());
all_warnings.extend(warnings);
--- a/tests/test-hgignore.t Mon Jun 07 17:29:32 2021 +0530
+++ b/tests/test-hgignore.t Wed Jun 02 11:25:18 2021 +0200
@@ -402,3 +402,23 @@
$ hg up -qC .
#endif
+
+#if dirstate-v2
+
+Check the hash of ignore patterns written in the dirstate at offset
+12 + 20 + 20 + 8 + 4 + 4 + 4 = 72
+
+ $ hg status > /dev/null
+ $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1
+ sha1=6e315b60f15fb5dfa02be00f3e2c8f923051f5ff
+ >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode())
+ 6e315b60f15fb5dfa02be00f3e2c8f923051f5ff
+
+ $ echo rel > .hg/testhgignorerel
+ $ hg status > /dev/null
+ $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1
+ sha1=dea19cc7119213f24b6b582a4bae7b0cb063e34e
+ >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode())
+ dea19cc7119213f24b6b582a4bae7b0cb063e34e
+
+#endif