Mercurial > hg
comparison rust/hg-core/src/filepatterns.rs @ 44593:496868f1030c
rust-matchers: use the `regex` crate
Instead of falling back to Python when a code path with "ignore" functionality
is reached and `Re2` is not installed, the default compilation (i.e. without
the `with-re2` feature) will use the `regex` crate for all regular-expression
handling.
As with the introduction of `Re2` in a previous series, this yields a big
performance boost compared to the Python + C code in `status`, `diff`, `commit`,
`update`, and maybe others.
For now `Re2` looks to be faster at compiling the DFA (1.5ms vs 5ms for
Netbeans' `.hgignore`) and a bit faster in actual use (123ms vs 137ms for
the parallel traversal of Netbeans' clean repo). I am in talks with the author
of `regex` to see whether that performance difference is a bug, a "won't fix",
or a tuning issue.
The `regex` crate is already one of our dependencies, and using this code
requires no additional work from the end user beyond using the Rust
extensions.
Differential Revision: https://phab.mercurial-scm.org/D8323
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Tue, 24 Mar 2020 17:55:59 +0100 |
parents | 2fe89bec8011 |
children | e0414fcd35e0 |
comparison
equal
deleted
inserted
replaced
44592:7cd5c0968139 | 44593:496868f1030c |
---|---|
174 } = entry; | 174 } = entry; |
175 if pattern.is_empty() { | 175 if pattern.is_empty() { |
176 return vec![]; | 176 return vec![]; |
177 } | 177 } |
178 match syntax { | 178 match syntax { |
179 PatternSyntax::Regexp => pattern.to_owned(), | 179 // The `regex` crate adds `.*` to the start and end of expressions |
180 // if there are no anchors, so add them. | |
181 PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(), | |
180 PatternSyntax::RelRegexp => { | 182 PatternSyntax::RelRegexp => { |
181 if pattern[0] == b'^' { | 183 // The `regex` crate accepts `**` while `re2` and Python's `re` |
184 // do not. Checking for `*` correctly triggers the same error all | |
185 // engines. | |
186 if pattern[0] == b'^' || pattern[0] == b'*' { | |
182 return pattern.to_owned(); | 187 return pattern.to_owned(); |
183 } | 188 } |
184 [&b".*"[..], pattern].concat() | 189 [&b".*"[..], pattern].concat() |
185 } | 190 } |
186 PatternSyntax::Path | PatternSyntax::RelPath => { | 191 PatternSyntax::Path | PatternSyntax::RelPath => { |
189 } | 194 } |
190 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat() | 195 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat() |
191 } | 196 } |
192 PatternSyntax::RootFiles => { | 197 PatternSyntax::RootFiles => { |
193 let mut res = if pattern == b"." { | 198 let mut res = if pattern == b"." { |
194 vec![] | 199 vec![b'^'] |
195 } else { | 200 } else { |
196 // Pattern is a directory name. | 201 // Pattern is a directory name. |
197 [escape_pattern(pattern).as_slice(), b"/"].concat() | 202 [b"^", escape_pattern(pattern).as_slice(), b"/"].concat() |
198 }; | 203 }; |
199 | 204 |
200 // Anything after the pattern must be a non-directory. | 205 // Anything after the pattern must be a non-directory. |
201 res.extend(b"[^/]+$"); | 206 res.extend(b"[^/]+$"); |
207 res.push(b'$'); | |
202 res | 208 res |
203 } | 209 } |
204 PatternSyntax::RelGlob => { | 210 PatternSyntax::RelGlob => { |
205 let glob_re = glob_to_re(pattern); | 211 let glob_re = glob_to_re(pattern); |
206 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { | 212 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { |
207 [b".*", rest, GLOB_SUFFIX].concat() | 213 [b".*", rest, GLOB_SUFFIX].concat() |
208 } else { | 214 } else { |
209 [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat() | 215 [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat() |
210 } | 216 } |
211 } | 217 } |
212 PatternSyntax::Glob | PatternSyntax::RootGlob => { | 218 PatternSyntax::Glob | PatternSyntax::RootGlob => { |
213 [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat() | 219 [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat() |
214 } | 220 } |
215 PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(), | 221 PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(), |
216 } | 222 } |
217 } | 223 } |
218 | 224 |
280 _ => pattern.to_owned(), | 286 _ => pattern.to_owned(), |
281 }; | 287 }; |
282 if *syntax == PatternSyntax::RootGlob | 288 if *syntax == PatternSyntax::RootGlob |
283 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) | 289 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) |
284 { | 290 { |
285 let mut escaped = escape_pattern(&pattern); | 291 // The `regex` crate adds `.*` to the start and end of expressions |
292 // if there are no anchors, so add the start anchor. | |
293 let mut escaped = vec![b'^']; | |
294 escaped.extend(escape_pattern(&pattern)); | |
286 escaped.extend(GLOB_SUFFIX); | 295 escaped.extend(GLOB_SUFFIX); |
287 Ok(escaped) | 296 Ok(escaped) |
288 } else { | 297 } else { |
289 let mut entry = entry.clone(); | 298 let mut entry = entry.clone(); |
290 entry.pattern = pattern; | 299 entry.pattern = pattern; |
617 PatternSyntax::RelGlob, | 626 PatternSyntax::RelGlob, |
618 b"rust/target/", | 627 b"rust/target/", |
619 Path::new("") | 628 Path::new("") |
620 )) | 629 )) |
621 .unwrap(), | 630 .unwrap(), |
622 br"(?:|.*/)rust/target(?:/|$)".to_vec(), | 631 br"(?:.*/)?rust/target(?:/|$)".to_vec(), |
623 ); | 632 ); |
624 } | 633 } |
625 | 634 |
626 #[test] | 635 #[test] |
627 fn test_build_single_regex_shortcut() { | 636 fn test_build_single_regex_shortcut() { |
630 PatternSyntax::RootGlob, | 639 PatternSyntax::RootGlob, |
631 b"", | 640 b"", |
632 Path::new("") | 641 Path::new("") |
633 )) | 642 )) |
634 .unwrap(), | 643 .unwrap(), |
635 br"\.(?:/|$)".to_vec(), | 644 br"^\.(?:/|$)".to_vec(), |
636 ); | 645 ); |
637 assert_eq!( | 646 assert_eq!( |
638 build_single_regex(&IgnorePattern::new( | 647 build_single_regex(&IgnorePattern::new( |
639 PatternSyntax::RootGlob, | 648 PatternSyntax::RootGlob, |
640 b"whatever", | 649 b"whatever", |
641 Path::new("") | 650 Path::new("") |
642 )) | 651 )) |
643 .unwrap(), | 652 .unwrap(), |
644 br"whatever(?:/|$)".to_vec(), | 653 br"^whatever(?:/|$)".to_vec(), |
645 ); | 654 ); |
646 assert_eq!( | 655 assert_eq!( |
647 build_single_regex(&IgnorePattern::new( | 656 build_single_regex(&IgnorePattern::new( |
648 PatternSyntax::RootGlob, | 657 PatternSyntax::RootGlob, |
649 b"*.o", | 658 b"*.o", |
650 Path::new("") | 659 Path::new("") |
651 )) | 660 )) |
652 .unwrap(), | 661 .unwrap(), |
653 br"[^/]*\.o(?:/|$)".to_vec(), | 662 br"^[^/]*\.o(?:/|$)".to_vec(), |
654 ); | 663 ); |
655 } | 664 } |
656 } | 665 } |