comparison rust/hg-core/src/filepatterns.rs @ 44593:496868f1030c

rust-matchers: use the `regex` crate. Instead of falling back to Python when a code path with "ignore" functionality is reached and `Re2` is not installed, the default compilation (i.e. without the `with-re2` feature) will use the `regex` crate for all regular-expression handling. As with the introduction of `Re2` in a previous series, this yields a big performance boost compared to the Python + C code in `status`, `diff`, `commit`, `update`, and maybe others. For now `Re2` looks to be faster at compiling the DFA (1.5ms vs 5ms for Netbeans' `.hgignore`) and a bit faster in actual use (123ms vs 137ms for the parallel traversal of Netbeans' clean repo). I am in talks with the author of `regex` to see whether that performance difference is a bug, a "won't fix", or a tuning issue. The `regex` crate is already one of our dependencies, and using this code does not require any additional work from the end-user other than using the Rust extensions. Differential Revision: https://phab.mercurial-scm.org/D8323
author Raphaël Gomès <rgomes@octobus.net>
date Tue, 24 Mar 2020 17:55:59 +0100
parents 2fe89bec8011
children e0414fcd35e0
comparison
equal deleted inserted replaced
44592:7cd5c0968139 44593:496868f1030c
174 } = entry; 174 } = entry;
175 if pattern.is_empty() { 175 if pattern.is_empty() {
176 return vec![]; 176 return vec![];
177 } 177 }
178 match syntax { 178 match syntax {
179 PatternSyntax::Regexp => pattern.to_owned(), 179 // The `regex` crate adds `.*` to the start and end of expressions
180 // if there are no anchors, so add the anchors explicitly.
181 PatternSyntax::Regexp => [b"^", &pattern[..], b"$"].concat(),
180 PatternSyntax::RelRegexp => { 182 PatternSyntax::RelRegexp => {
181 if pattern[0] == b'^' { 183 // The `regex` crate accepts `**` while `re2` and Python's `re`
184 // do not. Checking for `*` correctly triggers the same error in all
185 // engines.
186 if pattern[0] == b'^' || pattern[0] == b'*' {
182 return pattern.to_owned(); 187 return pattern.to_owned();
183 } 188 }
184 [&b".*"[..], pattern].concat() 189 [&b".*"[..], pattern].concat()
185 } 190 }
186 PatternSyntax::Path | PatternSyntax::RelPath => { 191 PatternSyntax::Path | PatternSyntax::RelPath => {
189 } 194 }
190 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat() 195 [escape_pattern(pattern).as_slice(), b"(?:/|$)"].concat()
191 } 196 }
192 PatternSyntax::RootFiles => { 197 PatternSyntax::RootFiles => {
193 let mut res = if pattern == b"." { 198 let mut res = if pattern == b"." {
194 vec![] 199 vec![b'^']
195 } else { 200 } else {
196 // Pattern is a directory name. 201 // Pattern is a directory name.
197 [escape_pattern(pattern).as_slice(), b"/"].concat() 202 [b"^", escape_pattern(pattern).as_slice(), b"/"].concat()
198 }; 203 };
199 204
200 // Anything after the pattern must be a non-directory. 205 // Anything after the pattern must be a non-directory.
201 res.extend(b"[^/]+$"); 206 res.extend(b"[^/]+$");
207 res.push(b'$');
202 res 208 res
203 } 209 }
204 PatternSyntax::RelGlob => { 210 PatternSyntax::RelGlob => {
205 let glob_re = glob_to_re(pattern); 211 let glob_re = glob_to_re(pattern);
206 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { 212 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
207 [b".*", rest, GLOB_SUFFIX].concat() 213 [b".*", rest, GLOB_SUFFIX].concat()
208 } else { 214 } else {
209 [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat() 215 [b"(?:.*/)?", glob_re.as_slice(), GLOB_SUFFIX].concat()
210 } 216 }
211 } 217 }
212 PatternSyntax::Glob | PatternSyntax::RootGlob => { 218 PatternSyntax::Glob | PatternSyntax::RootGlob => {
213 [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat() 219 [b"^", glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
214 } 220 }
215 PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(), 221 PatternSyntax::Include | PatternSyntax::SubInclude => unreachable!(),
216 } 222 }
217 } 223 }
218 224
280 _ => pattern.to_owned(), 286 _ => pattern.to_owned(),
281 }; 287 };
282 if *syntax == PatternSyntax::RootGlob 288 if *syntax == PatternSyntax::RootGlob
283 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) 289 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
284 { 290 {
285 let mut escaped = escape_pattern(&pattern); 291 // The `regex` crate adds `.*` to the start and end of expressions
292 // if there are no anchors, so add the start anchor.
293 let mut escaped = vec![b'^'];
294 escaped.extend(escape_pattern(&pattern));
286 escaped.extend(GLOB_SUFFIX); 295 escaped.extend(GLOB_SUFFIX);
287 Ok(escaped) 296 Ok(escaped)
288 } else { 297 } else {
289 let mut entry = entry.clone(); 298 let mut entry = entry.clone();
290 entry.pattern = pattern; 299 entry.pattern = pattern;
617 PatternSyntax::RelGlob, 626 PatternSyntax::RelGlob,
618 b"rust/target/", 627 b"rust/target/",
619 Path::new("") 628 Path::new("")
620 )) 629 ))
621 .unwrap(), 630 .unwrap(),
622 br"(?:|.*/)rust/target(?:/|$)".to_vec(), 631 br"(?:.*/)?rust/target(?:/|$)".to_vec(),
623 ); 632 );
624 } 633 }
625 634
626 #[test] 635 #[test]
627 fn test_build_single_regex_shortcut() { 636 fn test_build_single_regex_shortcut() {
630 PatternSyntax::RootGlob, 639 PatternSyntax::RootGlob,
631 b"", 640 b"",
632 Path::new("") 641 Path::new("")
633 )) 642 ))
634 .unwrap(), 643 .unwrap(),
635 br"\.(?:/|$)".to_vec(), 644 br"^\.(?:/|$)".to_vec(),
636 ); 645 );
637 assert_eq!( 646 assert_eq!(
638 build_single_regex(&IgnorePattern::new( 647 build_single_regex(&IgnorePattern::new(
639 PatternSyntax::RootGlob, 648 PatternSyntax::RootGlob,
640 b"whatever", 649 b"whatever",
641 Path::new("") 650 Path::new("")
642 )) 651 ))
643 .unwrap(), 652 .unwrap(),
644 br"whatever(?:/|$)".to_vec(), 653 br"^whatever(?:/|$)".to_vec(),
645 ); 654 );
646 assert_eq!( 655 assert_eq!(
647 build_single_regex(&IgnorePattern::new( 656 build_single_regex(&IgnorePattern::new(
648 PatternSyntax::RootGlob, 657 PatternSyntax::RootGlob,
649 b"*.o", 658 b"*.o",
650 Path::new("") 659 Path::new("")
651 )) 660 ))
652 .unwrap(), 661 .unwrap(),
653 br"[^/]*\.o(?:/|$)".to_vec(), 662 br"^[^/]*\.o(?:/|$)".to_vec(),
654 ); 663 );
655 } 664 }
656 } 665 }