comparison rust/hg-core/src/matchers.rs @ 51471:5633de951d34 stable

rust-matchers: raw regular expression builder Extracting this `re_bytes_builder()` from `re_matcher()` makes it reusable in more general cases than matching `HgPath` instances and would help reduce code duplication in RHGitaly.
author Georges Racinet <georges.racinet@octobus.net>
date Mon, 11 Mar 2024 13:36:25 +0100
parents bec6e9c108fd
children f5c367dc6541
comparison
equal deleted inserted replaced
51470:406b413e3cf2 51471:5633de951d34
735 .get_or(|| self.base.clone()) 735 .get_or(|| self.base.clone())
736 .is_match(path.as_bytes()) 736 .is_match(path.as_bytes())
737 } 737 }
738 } 738 }
739 739
740 /// Return a [`regex::bytes::RegexBuilder`] for a pattern given as bytes.
741 ///
742 /// This works around the fact that even if it works on byte haystacks,
743 /// [`regex::bytes::Regex`] still uses UTF-8 patterns.
744 pub fn re_bytes_builder(pattern: &[u8]) -> regex::bytes::RegexBuilder {
745 use std::io::Write;
746
747 // The `regex` crate adds `.*` to the start and end of expressions if there
748 // are no anchors, so add the start anchor.
749 let mut escaped_bytes = vec![b'^', b'(', b'?', b':'];
750 for byte in pattern {
751 if *byte > 127 {
752 write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
753 } else {
754 escaped_bytes.push(*byte);
755 }
756 }
757 escaped_bytes.push(b')');
758
759 // Avoid the cost of UTF-8 checking
760 //
761 // # Safety
762 // This is safe because every byte above 127 was escaped to ASCII above.
763 let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
764 regex::bytes::RegexBuilder::new(&pattern_string)
765 }
766
766
740 /// Returns a function that matches an `HgPath` against the given regex 767 /// Returns a function that matches an `HgPath` against the given regex
741 /// pattern. 768 /// pattern.
742 /// 769 ///
743 /// This can fail when the pattern is invalid or not supported by the 770 /// This can fail when the pattern is invalid or not supported by the
744 /// underlying engine (the `regex` crate), for instance anything with 771 /// underlying engine (the `regex` crate), for instance anything with
745 /// back-references. 772 /// back-references.
746 #[logging_timer::time("trace")] 773 #[logging_timer::time("trace")]
747 fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> { 774 fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> {
748 use std::io::Write; 775 let re = re_bytes_builder(pattern)
749
750 // The `regex` crate adds `.*` to the start and end of expressions if there
751 // are no anchors, so add the start anchor.
752 let mut escaped_bytes = vec![b'^', b'(', b'?', b':'];
753 for byte in pattern {
754 if *byte > 127 {
755 write!(escaped_bytes, "\\x{:x}", *byte).unwrap();
756 } else {
757 escaped_bytes.push(*byte);
758 }
759 }
760 escaped_bytes.push(b')');
761
762 // Avoid the cost of UTF8 checking
763 //
764 // # Safety
765 // This is safe because we escaped all non-ASCII bytes.
766 let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) };
767 let re = regex::bytes::RegexBuilder::new(&pattern_string)
768 .unicode(false) 776 .unicode(false)
769 // Big repos with big `.hgignore` will hit the default limit and 777 // Big repos with big `.hgignore` will hit the default limit and
770 // incur a significant performance hit. One repo's `hg status` hit 778 // incur a significant performance hit. One repo's `hg status` hit
771 // multiple *minutes*. 779 // multiple *minutes*.
772 .dfa_size_limit(50 * (1 << 20)) 780 .dfa_size_limit(50 * (1 << 20))