Mercurial > hg
comparison rust/hg-core/src/matchers.rs @ 51109:687e192dae16
rust-matchers: fix quadratic complexity in `FileMatcher`
Concretely, this command:
```
$ echo hg up -r <nodeid>; time hg revert dir1 dir2 -r <othernode> --debug
hg up -r <nodeid>
real 0m14.690s
user 0m14.766s
sys 0m5.430s
```
was much slower despite using 16 cores before this change.
The approach taken here is the same one used in match.py, in exactmatcher.
This changeset was originally written by Valentin Gatien-Baron in a private
repository.
I have redacted the commit message and did a minor clean up of the code.
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Wed, 18 Oct 2023 14:50:14 +0200 |
parents | f874342fa568 |
children | 532e74ad3ff6 |
comparison
equal
deleted
inserted
replaced
51108:0250e45040f1 | 51109:687e192dae16 |
---|---|
4 // | 4 // |
5 // This software may be used and distributed according to the terms of the | 5 // This software may be used and distributed according to the terms of the |
6 // GNU General Public License version 2 or any later version. | 6 // GNU General Public License version 2 or any later version. |
7 | 7 |
8 //! Structs and types for matching files and directories. | 8 //! Structs and types for matching files and directories. |
9 | |
10 use format_bytes::format_bytes; | |
11 use once_cell::sync::OnceCell; | |
9 | 12 |
10 use crate::{ | 13 use crate::{ |
11 dirstate::dirs_multiset::DirsChildrenMultiset, | 14 dirstate::dirs_multiset::DirsChildrenMultiset, |
12 filepatterns::{ | 15 filepatterns::{ |
13 build_single_regex, filter_subincludes, get_patterns_from_file, | 16 build_single_regex, filter_subincludes, get_patterns_from_file, |
21 DirsMultiset, FastHashMap, IgnorePattern, PatternError, PatternSyntax, | 24 DirsMultiset, FastHashMap, IgnorePattern, PatternError, PatternSyntax, |
22 }; | 25 }; |
23 | 26 |
24 use crate::dirstate::status::IgnoreFnType; | 27 use crate::dirstate::status::IgnoreFnType; |
25 use crate::filepatterns::normalize_path_bytes; | 28 use crate::filepatterns::normalize_path_bytes; |
26 use std::borrow::ToOwned; | |
27 use std::collections::HashSet; | 29 use std::collections::HashSet; |
28 use std::fmt::{Display, Error, Formatter}; | 30 use std::fmt::{Display, Error, Formatter}; |
29 use std::ops::Deref; | 31 use std::ops::Deref; |
30 use std::path::{Path, PathBuf}; | 32 use std::path::{Path, PathBuf}; |
33 use std::{borrow::ToOwned, collections::BTreeSet}; | |
31 | 34 |
32 #[derive(Debug, PartialEq)] | 35 #[derive(Debug, PartialEq)] |
33 pub enum VisitChildrenSet { | 36 pub enum VisitChildrenSet { |
34 /// Don't visit anything | 37 /// Don't visit anything |
35 Empty, | 38 Empty, |
171 /// ``` | 174 /// ``` |
172 #[derive(Debug)] | 175 #[derive(Debug)] |
173 pub struct FileMatcher { | 176 pub struct FileMatcher { |
174 files: HashSet<HgPathBuf>, | 177 files: HashSet<HgPathBuf>, |
175 dirs: DirsMultiset, | 178 dirs: DirsMultiset, |
179 sorted_visitchildrenset_candidates: OnceCell<BTreeSet<HgPathBuf>>, | |
176 } | 180 } |
177 | 181 |
178 impl FileMatcher { | 182 impl FileMatcher { |
179 pub fn new(files: Vec<HgPathBuf>) -> Result<Self, HgPathError> { | 183 pub fn new(files: Vec<HgPathBuf>) -> Result<Self, HgPathError> { |
180 let dirs = DirsMultiset::from_manifest(&files)?; | 184 let dirs = DirsMultiset::from_manifest(&files)?; |
181 Ok(Self { | 185 Ok(Self { |
182 files: HashSet::from_iter(files.into_iter()), | 186 files: HashSet::from_iter(files.into_iter()), |
183 dirs, | 187 dirs, |
188 sorted_visitchildrenset_candidates: OnceCell::new(), | |
184 }) | 189 }) |
185 } | 190 } |
186 fn inner_matches(&self, filename: &HgPath) -> bool { | 191 fn inner_matches(&self, filename: &HgPath) -> bool { |
187 self.files.contains(filename.as_ref()) | 192 self.files.contains(filename.as_ref()) |
188 } | 193 } |
197 } | 202 } |
198 fn matches(&self, filename: &HgPath) -> bool { | 203 fn matches(&self, filename: &HgPath) -> bool { |
199 self.inner_matches(filename) | 204 self.inner_matches(filename) |
200 } | 205 } |
201 fn visit_children_set(&self, directory: &HgPath) -> VisitChildrenSet { | 206 fn visit_children_set(&self, directory: &HgPath) -> VisitChildrenSet { |
202 if self.files.is_empty() || !self.dirs.contains(&directory) { | 207 if self.files.is_empty() || !self.dirs.contains(directory) { |
203 return VisitChildrenSet::Empty; | 208 return VisitChildrenSet::Empty; |
204 } | 209 } |
205 let mut candidates: HashSet<HgPathBuf> = | 210 |
206 self.dirs.iter().cloned().collect(); | 211 let compute_candidates = || -> BTreeSet<HgPathBuf> { |
207 | 212 let mut candidates: BTreeSet<HgPathBuf> = |
208 candidates.extend(self.files.iter().cloned()); | 213 self.dirs.iter().cloned().collect(); |
209 candidates.remove(HgPath::new(b"")); | 214 candidates.extend(self.files.iter().cloned()); |
210 | 215 candidates.remove(HgPath::new(b"")); |
211 if !directory.as_ref().is_empty() { | 216 candidates |
212 let directory = [directory.as_ref().as_bytes(), b"/"].concat(); | 217 }; |
213 candidates = candidates | 218 let candidates = |
214 .iter() | 219 if directory.as_ref().is_empty() { |
215 .filter_map(|c| { | 220 compute_candidates() |
216 if c.as_bytes().starts_with(&directory) { | 221 } else { |
217 Some(HgPathBuf::from_bytes( | 222 let sorted_candidates = self |
218 &c.as_bytes()[directory.len()..], | 223 .sorted_visitchildrenset_candidates |
219 )) | 224 .get_or_init(compute_candidates); |
220 } else { | 225 let directory_bytes = directory.as_ref().as_bytes(); |
221 None | 226 let start: HgPathBuf = |
222 } | 227 format_bytes!(b"{}/", directory_bytes).into(); |
223 }) | 228 let start_len = start.len(); |
224 .collect(); | 229 // `0` sorts after `/` |
225 } | 230 let end = format_bytes!(b"{}0", directory_bytes).into(); |
231 BTreeSet::from_iter(sorted_candidates.range(start..end).map( | |
232 |c| HgPathBuf::from_bytes(&c.as_bytes()[start_len..]), | |
233 )) | |
234 }; | |
226 | 235 |
227 // `self.dirs` includes all of the directories, recursively, so if | 236 // `self.dirs` includes all of the directories, recursively, so if |
228 // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo', | 237 // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo', |
229 // 'foo/bar' in it. Thus we can safely ignore a candidate that has a | 238 // 'foo/bar' in it. Thus we can safely ignore a candidate that has a |
230 // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate | 239 // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate |