comparison rust/hg-core/src/matchers.rs @ 51109:687e192dae16

rust-matchers: fix quadratic complexity in `FileMatcher` Concretely, this command: ``` $ echo hg up -r <nodeid>; time hg revert dir1 dir2 -r <othernode> --debug hg up -r <nodeid> real 0m14.690s user 0m14.766s sys 0m5.430s ``` was much slower despite using 16 cores before this change. The approach taken here is the same one used in match.py, in exactmatcher. This changeset was originally written by Valentin Gatien-Baron in a private repository. I have redacted the commit message and did a minor clean up of the code.
author Raphaël Gomès <rgomes@octobus.net>
date Wed, 18 Oct 2023 14:50:14 +0200
parents f874342fa568
children 532e74ad3ff6
comparison
equal deleted inserted replaced
51108:0250e45040f1 51109:687e192dae16
4 // 4 //
5 // This software may be used and distributed according to the terms of the 5 // This software may be used and distributed according to the terms of the
6 // GNU General Public License version 2 or any later version. 6 // GNU General Public License version 2 or any later version.
7 7
8 //! Structs and types for matching files and directories. 8 //! Structs and types for matching files and directories.
9
10 use format_bytes::format_bytes;
11 use once_cell::sync::OnceCell;
9 12
10 use crate::{ 13 use crate::{
11 dirstate::dirs_multiset::DirsChildrenMultiset, 14 dirstate::dirs_multiset::DirsChildrenMultiset,
12 filepatterns::{ 15 filepatterns::{
13 build_single_regex, filter_subincludes, get_patterns_from_file, 16 build_single_regex, filter_subincludes, get_patterns_from_file,
21 DirsMultiset, FastHashMap, IgnorePattern, PatternError, PatternSyntax, 24 DirsMultiset, FastHashMap, IgnorePattern, PatternError, PatternSyntax,
22 }; 25 };
23 26
24 use crate::dirstate::status::IgnoreFnType; 27 use crate::dirstate::status::IgnoreFnType;
25 use crate::filepatterns::normalize_path_bytes; 28 use crate::filepatterns::normalize_path_bytes;
26 use std::borrow::ToOwned;
27 use std::collections::HashSet; 29 use std::collections::HashSet;
28 use std::fmt::{Display, Error, Formatter}; 30 use std::fmt::{Display, Error, Formatter};
29 use std::ops::Deref; 31 use std::ops::Deref;
30 use std::path::{Path, PathBuf}; 32 use std::path::{Path, PathBuf};
33 use std::{borrow::ToOwned, collections::BTreeSet};
31 34
32 #[derive(Debug, PartialEq)] 35 #[derive(Debug, PartialEq)]
33 pub enum VisitChildrenSet { 36 pub enum VisitChildrenSet {
34 /// Don't visit anything 37 /// Don't visit anything
35 Empty, 38 Empty,
171 /// ``` 174 /// ```
172 #[derive(Debug)] 175 #[derive(Debug)]
173 pub struct FileMatcher { 176 pub struct FileMatcher {
174 files: HashSet<HgPathBuf>, 177 files: HashSet<HgPathBuf>,
175 dirs: DirsMultiset, 178 dirs: DirsMultiset,
179 sorted_visitchildrenset_candidates: OnceCell<BTreeSet<HgPathBuf>>,
176 } 180 }
177 181
178 impl FileMatcher { 182 impl FileMatcher {
179 pub fn new(files: Vec<HgPathBuf>) -> Result<Self, HgPathError> { 183 pub fn new(files: Vec<HgPathBuf>) -> Result<Self, HgPathError> {
180 let dirs = DirsMultiset::from_manifest(&files)?; 184 let dirs = DirsMultiset::from_manifest(&files)?;
181 Ok(Self { 185 Ok(Self {
182 files: HashSet::from_iter(files.into_iter()), 186 files: HashSet::from_iter(files.into_iter()),
183 dirs, 187 dirs,
188 sorted_visitchildrenset_candidates: OnceCell::new(),
184 }) 189 })
185 } 190 }
186 fn inner_matches(&self, filename: &HgPath) -> bool { 191 fn inner_matches(&self, filename: &HgPath) -> bool {
187 self.files.contains(filename.as_ref()) 192 self.files.contains(filename.as_ref())
188 } 193 }
197 } 202 }
198 fn matches(&self, filename: &HgPath) -> bool { 203 fn matches(&self, filename: &HgPath) -> bool {
199 self.inner_matches(filename) 204 self.inner_matches(filename)
200 } 205 }
201 fn visit_children_set(&self, directory: &HgPath) -> VisitChildrenSet { 206 fn visit_children_set(&self, directory: &HgPath) -> VisitChildrenSet {
202 if self.files.is_empty() || !self.dirs.contains(&directory) { 207 if self.files.is_empty() || !self.dirs.contains(directory) {
203 return VisitChildrenSet::Empty; 208 return VisitChildrenSet::Empty;
204 } 209 }
205 let mut candidates: HashSet<HgPathBuf> = 210
206 self.dirs.iter().cloned().collect(); 211 let compute_candidates = || -> BTreeSet<HgPathBuf> {
207 212 let mut candidates: BTreeSet<HgPathBuf> =
208 candidates.extend(self.files.iter().cloned()); 213 self.dirs.iter().cloned().collect();
209 candidates.remove(HgPath::new(b"")); 214 candidates.extend(self.files.iter().cloned());
210 215 candidates.remove(HgPath::new(b""));
211 if !directory.as_ref().is_empty() { 216 candidates
212 let directory = [directory.as_ref().as_bytes(), b"/"].concat(); 217 };
213 candidates = candidates 218 let candidates =
214 .iter() 219 if directory.as_ref().is_empty() {
215 .filter_map(|c| { 220 compute_candidates()
216 if c.as_bytes().starts_with(&directory) { 221 } else {
217 Some(HgPathBuf::from_bytes( 222 let sorted_candidates = self
218 &c.as_bytes()[directory.len()..], 223 .sorted_visitchildrenset_candidates
219 )) 224 .get_or_init(compute_candidates);
220 } else { 225 let directory_bytes = directory.as_ref().as_bytes();
221 None 226 let start: HgPathBuf =
222 } 227 format_bytes!(b"{}/", directory_bytes).into();
223 }) 228 let start_len = start.len();
224 .collect(); 229 // `0` sorts after `/`
225 } 230 let end = format_bytes!(b"{}0", directory_bytes).into();
231 BTreeSet::from_iter(sorted_candidates.range(start..end).map(
232 |c| HgPathBuf::from_bytes(&c.as_bytes()[start_len..]),
233 ))
234 };
226 235
227 // `self.dirs` includes all of the directories, recursively, so if 236 // `self.dirs` includes all of the directories, recursively, so if
228 // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo', 237 // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo',
229 // 'foo/bar' in it. Thus we can safely ignore a candidate that has a 238 // 'foo/bar' in it. Thus we can safely ignore a candidate that has a
230 // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate 239 // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate