Mercurial > hg
comparison rust/hg-core/src/matchers.rs @ 51471:5633de951d34 stable
rust-matchers: raw regular expression builder
Extracting this `re_bytes_builder()` from `re_matcher()` makes it reusable
in more general cases than matching `HgPath` instances and would
help reduce code duplication in RHGitaly.
author | Georges Racinet <georges.racinet@octobus.net> |
---|---|
date | Mon, 11 Mar 2024 13:36:25 +0100 |
parents | bec6e9c108fd |
children | f5c367dc6541 |
comparison
equal
deleted
inserted
replaced
51470:406b413e3cf2 | 51471:5633de951d34 |
---|---|
735 .get_or(|| self.base.clone()) | 735 .get_or(|| self.base.clone()) |
736 .is_match(path.as_bytes()) | 736 .is_match(path.as_bytes()) |
737 } | 737 } |
738 } | 738 } |
739 | 739 |
740 /// Return a `RegexBuilder` from a bytes pattern | |
741 /// | |
742 /// This works around the fact that even if it works on byte haystacks, | |
743 /// [`regex::bytes::Regex`] still uses UTF-8 patterns. | |
744 pub fn re_bytes_builder(pattern: &[u8]) -> regex::bytes::RegexBuilder { | |
745 use std::io::Write; | |
746 | |
747 // The `regex` crate adds `.*` to the start and end of expressions if there | |
748 // are no anchors, so add the start anchor. | |
749 let mut escaped_bytes = vec![b'^', b'(', b'?', b':']; | |
750 for byte in pattern { | |
751 if *byte > 127 { | |
752 write!(escaped_bytes, "\\x{:x}", *byte).unwrap(); | |
753 } else { | |
754 escaped_bytes.push(*byte); | |
755 } | |
756 } | |
757 escaped_bytes.push(b')'); | |
758 | |
759 // Avoid the cost of UTF8 checking | |
760 // | |
761 // # Safety | |
762 // This is safe because we escaped all non-ASCII bytes. | |
763 let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) }; | |
764 regex::bytes::RegexBuilder::new(&pattern_string) | |
765 } | |
766 | |
740 /// Returns a function that matches an `HgPath` against the given regex | 767 /// Returns a function that matches an `HgPath` against the given regex |
741 /// pattern. | 768 /// pattern. |
742 /// | 769 /// |
743 /// This can fail when the pattern is invalid or not supported by the | 770 /// This can fail when the pattern is invalid or not supported by the |
744 /// underlying engine (the `regex` crate), for instance anything with | 771 /// underlying engine (the `regex` crate), for instance anything with |
745 /// back-references. | 772 /// back-references. |
746 #[logging_timer::time("trace")] | 773 #[logging_timer::time("trace")] |
747 fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> { | 774 fn re_matcher(pattern: &[u8]) -> PatternResult<RegexMatcher> { |
748 use std::io::Write; | 775 let re = re_bytes_builder(pattern) |
749 | |
750 // The `regex` crate adds `.*` to the start and end of expressions if there | |
751 // are no anchors, so add the start anchor. | |
752 let mut escaped_bytes = vec![b'^', b'(', b'?', b':']; | |
753 for byte in pattern { | |
754 if *byte > 127 { | |
755 write!(escaped_bytes, "\\x{:x}", *byte).unwrap(); | |
756 } else { | |
757 escaped_bytes.push(*byte); | |
758 } | |
759 } | |
760 escaped_bytes.push(b')'); | |
761 | |
762 // Avoid the cost of UTF8 checking | |
763 // | |
764 // # Safety | |
765 // This is safe because we escaped all non-ASCII bytes. | |
766 let pattern_string = unsafe { String::from_utf8_unchecked(escaped_bytes) }; | |
767 let re = regex::bytes::RegexBuilder::new(&pattern_string) | |
768 .unicode(false) | 776 .unicode(false) |
769 // Big repos with big `.hgignore` will hit the default limit and | 777 // Big repos with big `.hgignore` will hit the default limit and |
770 // incur a significant performance hit. One repo's `hg status` hit | 778 // incur a significant performance hit. One repo's `hg status` hit |
771 // multiple *minutes*. | 779 // multiple *minutes*. |
772 .dfa_size_limit(50 * (1 << 20)) | 780 .dfa_size_limit(50 * (1 << 20)) |