Mercurial > hg
changeset 47409:0ef8231e413f
dirstate-v2: Store a hash of ignore patterns (.hgignore)
Later, this will help extend `read_dir` caching to directories that contain ignored
files (but no unknown files). Such a cache must be invalidated when ignore patterns
change, since a formerly-ignored file might become unknown.
This helps the default configuration of `hg status` where unknown files must
be listed, but ignored files are not.
Differential Revision: https://phab.mercurial-scm.org/D10836
author | Simon Sapin <simon.sapin@octobus.net> |
---|---|
date | Wed, 02 Jun 2021 11:25:18 +0200 |
parents | 7954ee2d7cf7 |
children | 65333ecc8b34 |
files | rust/hg-core/src/dirstate/status.rs rust/hg-core/src/dirstate_tree/dirstate_map.rs rust/hg-core/src/dirstate_tree/on_disk.rs rust/hg-core/src/dirstate_tree/status.rs rust/hg-core/src/filepatterns.rs rust/hg-core/src/matchers.rs tests/test-hgignore.t |
diffstat | 7 files changed, 118 insertions(+), 37 deletions(-) [+] |
line wrap: on
line diff
--- a/rust/hg-core/src/dirstate/status.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/dirstate/status.rs Wed Jun 02 11:25:18 2021 +0200 @@ -351,7 +351,7 @@ let (ignore_fn, warnings): (IgnoreFnType, _) = if options.list_ignored || options.list_unknown { - get_ignore_function(ignore_files, &root_dir)? + get_ignore_function(ignore_files, &root_dir, &mut |_| {})? } else { (Box::new(|&_| true), vec![]) };
--- a/rust/hg-core/src/dirstate_tree/dirstate_map.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/dirstate_tree/dirstate_map.rs Wed Jun 02 11:25:18 2021 +0200 @@ -37,6 +37,9 @@ /// Number of nodes anywhere in the tree that have /// `.copy_source.is_some()`. pub(super) nodes_with_copy_source_count: u32, + + /// See on_disk::Header + pub(super) ignore_patterns_hash: on_disk::IgnorePatternsHash, } /// Using a plain `HgPathBuf` of the full path from the repository root as a @@ -385,6 +388,7 @@ root: ChildNodes::default(), nodes_with_entry_count: 0, nodes_with_copy_source_count: 0, + ignore_patterns_hash: [0; on_disk::IGNORE_PATTERNS_HASH_LEN], } }
--- a/rust/hg-core/src/dirstate_tree/on_disk.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/dirstate_tree/on_disk.rs Wed Jun 02 11:25:18 2021 +0200 @@ -28,6 +28,9 @@ /// `.hg/requires` already governs which format should be used. pub const V2_FORMAT_MARKER: &[u8; 12] = b"dirstate-v2\n"; +pub(super) const IGNORE_PATTERNS_HASH_LEN: usize = 20; +pub(super) type IgnorePatternsHash = [u8; IGNORE_PATTERNS_HASH_LEN]; + #[derive(BytesCast)] #[repr(C)] struct Header { @@ -40,6 +43,27 @@ root: ChildNodes, nodes_with_entry_count: Size, nodes_with_copy_source_count: Size, + + /// If non-zero, a hash of ignore files that were used for some previous + /// run of the `status` algorithm. + /// + /// We define: + /// + /// * "Root" ignore files are `.hgignore` at the root of the repository if + /// it exists, and files from `ui.ignore.*` config. This set of files is + /// then sorted by the string representation of their path. + /// * The "expanded contents" of an ignore file is the byte string made + /// by concatenating its contents with the "expanded contents" of other + /// files included with `include:` or `subinclude:` patterns, in inclusion + /// order. This definition is recursive, as included files can + /// themselves include more files. + /// + /// This hash is defined as the SHA-1 of the concatenation (in sorted + /// order) of the "expanded contents" of each "root" ignore file. + /// (Note that computing this does not require actually concatenating byte + /// strings into contiguous memory, instead SHA-1 hashing can be done + /// incrementally.) 
+ ignore_patterns_hash: IgnorePatternsHash, } #[derive(BytesCast)] @@ -145,7 +169,7 @@ /// Make sure that size-affecting changes are made knowingly fn _static_assert_size_of() { - let _ = std::mem::transmute::<Header, [u8; 72]>; + let _ = std::mem::transmute::<Header, [u8; 92]>; let _ = std::mem::transmute::<Node, [u8; 57]>; } @@ -197,6 +221,7 @@ nodes_with_copy_source_count: header .nodes_with_copy_source_count .get(), + ignore_patterns_hash: header.ignore_patterns_hash, }; let parents = Some(header.parents.clone()); Ok((dirstate_map, parents)) @@ -473,6 +498,7 @@ nodes_with_copy_source_count: dirstate_map .nodes_with_copy_source_count .into(), + ignore_patterns_hash: dirstate_map.ignore_patterns_hash, }; out[..header_len].copy_from_slice(header.as_bytes()); Ok(out)
--- a/rust/hg-core/src/dirstate_tree/status.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/dirstate_tree/status.rs Wed Jun 02 11:25:18 2021 +0200 @@ -21,6 +21,7 @@ use crate::StatusOptions; use micro_timer::timed; use rayon::prelude::*; +use sha1::{Digest, Sha1}; use std::borrow::Cow; use std::io; use std::path::Path; @@ -45,11 +46,20 @@ ignore_files: Vec<PathBuf>, options: StatusOptions, ) -> Result<(DirstateStatus<'on_disk>, Vec<PatternFileWarning>), StatusError> { - let (ignore_fn, warnings): (IgnoreFnType, _) = + let (ignore_fn, warnings, patterns_changed): (IgnoreFnType, _, _) = if options.list_ignored || options.list_unknown { - get_ignore_function(ignore_files, &root_dir)? + let mut hasher = Sha1::new(); + let (ignore_fn, warnings) = get_ignore_function( + ignore_files, + &root_dir, + &mut |pattern_bytes| hasher.update(pattern_bytes), + )?; + let new_hash = *hasher.finalize().as_ref(); + let changed = new_hash != dmap.ignore_patterns_hash; + dmap.ignore_patterns_hash = new_hash; + (ignore_fn, warnings, Some(changed)) } else { - (Box::new(|&_| true), vec![]) + (Box::new(|&_| true), vec![], None) }; let common = StatusCommon { @@ -58,7 +68,8 @@ matcher, ignore_fn, outcome: Default::default(), - cached_directory_mtimes_to_add: Default::default(), + ignore_patterns_have_changed: patterns_changed, + new_cachable_directories: Default::default(), filesystem_time_at_status_start: filesystem_now(&root_dir).ok(), }; let is_at_repo_root = true; @@ -79,9 +90,12 @@ is_at_repo_root, )?; let mut outcome = common.outcome.into_inner().unwrap(); - let to_add = common.cached_directory_mtimes_to_add.into_inner().unwrap(); - outcome.dirty = !to_add.is_empty(); - for (path, mtime) in &to_add { + let new_cachable = common.new_cachable_directories.into_inner().unwrap(); + + outcome.dirty = common.ignore_patterns_have_changed == Some(true) + || !new_cachable.is_empty(); + + for (path, mtime) in &new_cachable { let node = DirstateMap::get_or_insert_node( dmap.on_disk, &mut 
dmap.root, @@ -96,6 +110,7 @@ } } } + Ok((outcome, warnings)) } @@ -107,8 +122,13 @@ matcher: &'a (dyn Matcher + Sync), ignore_fn: IgnoreFnType<'a>, outcome: Mutex<DirstateStatus<'on_disk>>, - cached_directory_mtimes_to_add: - Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>, + new_cachable_directories: Mutex<Vec<(Cow<'on_disk, HgPath>, Timestamp)>>, + + /// Whether ignore files like `.hgignore` have changed since the previous + /// time a `status()` call wrote their hash to the dirstate. `None` means + /// we don’t know as this run doesn’t list either ignored or unknown files + /// and therefore isn’t reading `.hgignore`. + ignore_patterns_have_changed: Option<bool>, /// The current time at the start of the `status()` algorithm, as measured /// and possibly truncated by the filesystem. @@ -422,7 +442,7 @@ let hg_path = dirstate_node .full_path_borrowed(self.dmap.on_disk)? .detach_from_tree(); - self.cached_directory_mtimes_to_add + self.new_cachable_directories .lock() .unwrap() .push((hg_path, timestamp))
--- a/rust/hg-core/src/filepatterns.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/filepatterns.rs Wed Jun 02 11:25:18 2021 +0200 @@ -17,8 +17,6 @@ }; use lazy_static::lazy_static; use regex::bytes::{NoExpand, Regex}; -use std::fs::File; -use std::io::Read; use std::ops::Deref; use std::path::{Path, PathBuf}; use std::vec::Vec; @@ -410,24 +408,19 @@ pub fn read_pattern_file( file_path: &Path, warn: bool, + inspect_pattern_bytes: &mut impl FnMut(&[u8]), ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> { - let mut f = match File::open(file_path) { - Ok(f) => Ok(f), - Err(e) => match e.kind() { - std::io::ErrorKind::NotFound => { - return Ok(( - vec![], - vec![PatternFileWarning::NoSuchFile(file_path.to_owned())], - )) - } - _ => Err(e), - }, - }?; - let mut contents = Vec::new(); - - f.read_to_end(&mut contents)?; - - Ok(parse_pattern_file_contents(&contents, file_path, warn)?) + match std::fs::read(file_path) { + Ok(contents) => { + inspect_pattern_bytes(&contents); + parse_pattern_file_contents(&contents, file_path, warn) + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(( + vec![], + vec![PatternFileWarning::NoSuchFile(file_path.to_owned())], + )), + Err(e) => Err(e.into()), + } } /// Represents an entry in an "ignore" file. 
@@ -458,8 +451,10 @@ pub fn get_patterns_from_file( pattern_file: &Path, root_dir: &Path, + inspect_pattern_bytes: &mut impl FnMut(&[u8]), ) -> PatternResult<(Vec<IgnorePattern>, Vec<PatternFileWarning>)> { - let (patterns, mut warnings) = read_pattern_file(pattern_file, true)?; + let (patterns, mut warnings) = + read_pattern_file(pattern_file, true, inspect_pattern_bytes)?; let patterns = patterns .into_iter() .flat_map(|entry| -> PatternResult<_> { @@ -467,8 +462,11 @@ PatternSyntax::Include => { let inner_include = root_dir.join(get_path_from_bytes(&entry.pattern)); - let (inner_pats, inner_warnings) = - get_patterns_from_file(&inner_include, root_dir)?; + let (inner_pats, inner_warnings) = get_patterns_from_file( + &inner_include, + root_dir, + inspect_pattern_bytes, + )?; warnings.extend(inner_warnings); inner_pats } @@ -482,6 +480,7 @@ get_patterns_from_file( &sub_include.path, &sub_include.root, + inspect_pattern_bytes, )?; sub_include.included_patterns = inner_patterns; warnings.extend(inner_warnings);
--- a/rust/hg-core/src/matchers.rs Mon Jun 07 17:29:32 2021 +0530 +++ b/rust/hg-core/src/matchers.rs Wed Jun 02 11:25:18 2021 +0200 @@ -564,8 +564,9 @@ /// function that checks whether a given file (in the general sense) should be /// ignored. pub fn get_ignore_function<'a>( - all_pattern_files: Vec<PathBuf>, + mut all_pattern_files: Vec<PathBuf>, root_dir: &Path, + inspect_pattern_bytes: &mut impl FnMut(&[u8]), ) -> PatternResult<( Box<dyn for<'r> Fn(&'r HgPath) -> bool + Sync + 'a>, Vec<PatternFileWarning>, @@ -573,9 +574,20 @@ let mut all_patterns = vec![]; let mut all_warnings = vec![]; + // Sort to make the ordering of calls to `inspect_pattern_bytes` + // deterministic even if the ordering of `all_pattern_files` is not (such + // as when the iteration order of a Python dict or Rust HashMap is involved). + // Sort by "string" representation instead of the default by component + // (with a Rust-specific definition of a component) + all_pattern_files + .sort_unstable_by(|a, b| a.as_os_str().cmp(b.as_os_str())); + for pattern_file in &all_pattern_files { - let (patterns, warnings) = - get_patterns_from_file(pattern_file, root_dir)?; + let (patterns, warnings) = get_patterns_from_file( + pattern_file, + root_dir, + inspect_pattern_bytes, + )?; all_patterns.extend(patterns.to_owned()); all_warnings.extend(warnings);
--- a/tests/test-hgignore.t Mon Jun 07 17:29:32 2021 +0530 +++ b/tests/test-hgignore.t Wed Jun 02 11:25:18 2021 +0200 @@ -402,3 +402,23 @@ $ hg up -qC . #endif + +#if dirstate-v2 + +Check the hash of ignore patterns written in the dirstate at offset +12 + 20 + 20 + 8 + 4 + 4 + 4 = 72 + + $ hg status > /dev/null + $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1 + sha1=6e315b60f15fb5dfa02be00f3e2c8f923051f5ff + >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode()) + 6e315b60f15fb5dfa02be00f3e2c8f923051f5ff + + $ echo rel > .hg/testhgignorerel + $ hg status > /dev/null + $ cat .hg/testhgignore .hg/testhgignorerel .hgignore dir2/.hgignore dir1/.hgignore dir1/.hgignoretwo | $TESTDIR/f --sha1 + sha1=dea19cc7119213f24b6b582a4bae7b0cb063e34e + >>> import binascii; print(binascii.hexlify(open(".hg/dirstate", "rb").read()[72:][:20]).decode()) + dea19cc7119213f24b6b582a4bae7b0cb063e34e + +#endif