Mercurial > hg
view rust/hg-core/src/matchers.rs @ 44521:a21881b40942
rust-matchers: add `build_regex_match` function
This function will be used to help build the upcoming `IncludeMatcher`.
Differential Revision: https://phab.mercurial-scm.org/D7924
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Fri, 17 Jan 2020 11:32:02 +0100 |
parents | d4e8cfcde012 |
children | c697638e0e91 |
line wrap: on
line source
// matchers.rs // // Copyright 2019 Raphaël Gomès <rgomes@octobus.net> // // This software may be used and distributed according to the terms of the // GNU General Public License version 2 or any later version. //! Structs and types for matching files and directories. #[cfg(feature = "with-re2")] use crate::re2::Re2; use crate::{ filepatterns::{build_single_regex, PatternResult}, utils::hg_path::{HgPath, HgPathBuf}, DirsMultiset, DirstateMapError, IgnorePattern, PatternError, PatternSyntax, }; use std::collections::HashSet; use std::iter::FromIterator; use std::ops::Deref; #[derive(Debug, PartialEq)] pub enum VisitChildrenSet<'a> { /// Don't visit anything Empty, /// Only visit this directory This, /// Visit this directory and these subdirectories /// TODO Should we implement a `NonEmptyHashSet`? Set(HashSet<&'a HgPath>), /// Visit this directory and all subdirectories Recursive, } pub trait Matcher { /// Explicitly listed files fn file_set(&self) -> Option<&HashSet<&HgPath>>; /// Returns whether `filename` is in `file_set` fn exact_match(&self, filename: impl AsRef<HgPath>) -> bool; /// Returns whether `filename` is matched by this matcher fn matches(&self, filename: impl AsRef<HgPath>) -> bool; /// Decides whether a directory should be visited based on whether it /// has potential matches in it or one of its subdirectories, and /// potentially lists which subdirectories of that directory should be /// visited. This is based on the match's primary, included, and excluded /// patterns. /// /// # Example /// /// Assume matchers `['path:foo/bar', 'rootfilesin:qux']`, we would /// return the following values (assuming the implementation of /// visit_children_set is capable of recognizing this; some implementations /// are not). /// /// ```text /// ```ignore /// '' -> {'foo', 'qux'} /// 'baz' -> set() /// 'foo' -> {'bar'} /// // Ideally this would be `Recursive`, but since the prefix nature of /// // matchers is applied to the entire matcher, we have to downgrade this /// // to `This` due to the (yet to be implemented in Rust) non-prefix /// // `RootFilesIn'-kind matcher being mixed in. /// 'foo/bar' -> 'this' /// 'qux' -> 'this' /// ``` /// # Important /// /// Most matchers do not know if they're representing files or /// directories. They see `['path:dir/f']` and don't know whether `f` is a /// file or a directory, so `visit_children_set('dir')` for most matchers /// will return `HashSet{ HgPath { "f" } }`, but if the matcher knows it's /// a file (like the yet to be implemented in Rust `ExactMatcher` does), /// it may return `VisitChildrenSet::This`. /// Do not rely on the return being a `HashSet` indicating that there are /// no files in this dir to investigate (or equivalently that if there are /// files to investigate in 'dir' that it will always return /// `VisitChildrenSet::This`). fn visit_children_set( &self, directory: impl AsRef<HgPath>, ) -> VisitChildrenSet; /// Matcher will match everything and `files_set()` will be empty: /// optimization might be possible. fn matches_everything(&self) -> bool; /// Matcher will match exactly the files in `files_set()`: optimization /// might be possible. fn is_exact(&self) -> bool; } /// Matches everything. ///``` /// use hg::{ matchers::{Matcher, AlwaysMatcher}, utils::hg_path::HgPath }; /// /// let matcher = AlwaysMatcher; /// /// assert_eq!(matcher.matches(HgPath::new(b"whatever")), true); /// assert_eq!(matcher.matches(HgPath::new(b"b.txt")), true); /// assert_eq!(matcher.matches(HgPath::new(b"main.c")), true); /// assert_eq!(matcher.matches(HgPath::new(br"re:.*\.c$")), true); /// ``` #[derive(Debug)] pub struct AlwaysMatcher; impl Matcher for AlwaysMatcher { fn file_set(&self) -> Option<&HashSet<&HgPath>> { None } fn exact_match(&self, _filename: impl AsRef<HgPath>) -> bool { false } fn matches(&self, _filename: impl AsRef<HgPath>) -> bool { true } fn visit_children_set( &self, _directory: impl AsRef<HgPath>, ) -> VisitChildrenSet { VisitChildrenSet::Recursive } fn matches_everything(&self) -> bool { true } fn is_exact(&self) -> bool { false } } /// Matches the input files exactly. They are interpreted as paths, not /// patterns. /// ///``` /// use hg::{ matchers::{Matcher, FileMatcher}, utils::hg_path::HgPath }; /// /// let files = [HgPath::new(b"a.txt"), HgPath::new(br"re:.*\.c$")]; /// let matcher = FileMatcher::new(&files).unwrap(); /// /// assert_eq!(matcher.matches(HgPath::new(b"a.txt")), true); /// assert_eq!(matcher.matches(HgPath::new(b"b.txt")), false); /// assert_eq!(matcher.matches(HgPath::new(b"main.c")), false); /// assert_eq!(matcher.matches(HgPath::new(br"re:.*\.c$")), true); /// ``` #[derive(Debug)] pub struct FileMatcher<'a> { files: HashSet<&'a HgPath>, dirs: DirsMultiset, } impl<'a> FileMatcher<'a> { pub fn new( files: &'a [impl AsRef<HgPath>], ) -> Result<Self, DirstateMapError> { Ok(Self { files: HashSet::from_iter(files.iter().map(|f| f.as_ref())), dirs: DirsMultiset::from_manifest(files)?, }) } fn inner_matches(&self, filename: impl AsRef<HgPath>) -> bool { self.files.contains(filename.as_ref()) } } impl<'a> Matcher for FileMatcher<'a> { fn file_set(&self) -> Option<&HashSet<&HgPath>> { Some(&self.files) } fn exact_match(&self, filename: impl AsRef<HgPath>) -> bool { self.inner_matches(filename) } fn matches(&self, filename: impl AsRef<HgPath>) -> bool { self.inner_matches(filename) } fn visit_children_set( &self, directory: impl AsRef<HgPath>, ) -> VisitChildrenSet { if self.files.is_empty() || !self.dirs.contains(&directory) { return VisitChildrenSet::Empty; } let dirs_as_set = self.dirs.iter().map(|k| k.deref()).collect(); let mut candidates: HashSet<&HgPath> = self.files.union(&dirs_as_set).map(|k| *k).collect(); candidates.remove(HgPath::new(b"")); if !directory.as_ref().is_empty() { let directory = [directory.as_ref().as_bytes(), b"/"].concat(); candidates = candidates .iter() .filter_map(|c| { if c.as_bytes().starts_with(&directory) { Some(HgPath::new(&c.as_bytes()[directory.len()..])) } else { None } }) .collect(); } // `self.dirs` includes all of the directories, recursively, so if // we're attempting to match 'foo/bar/baz.txt', it'll have '', 'foo', // 'foo/bar' in it. Thus we can safely ignore a candidate that has a // '/' in it, indicating it's for a subdir-of-a-subdir; the immediate // subdir will be in there without a slash. VisitChildrenSet::Set( candidates .iter() .filter_map(|c| { if c.bytes().all(|b| *b != b'/') { Some(*c) } else { None } }) .collect(), ) } fn matches_everything(&self) -> bool { false } fn is_exact(&self) -> bool { true } } #[cfg(feature = "with-re2")] /// Returns a function that matches an `HgPath` against the given regex /// pattern. /// /// This can fail when the pattern is invalid or not supported by the /// underlying engine `Re2`, for instance anything with back-references. fn re_matcher( pattern: &[u8], ) -> PatternResult<impl Fn(&HgPath) -> bool + Sync> { let regex = Re2::new(pattern); let regex = regex.map_err(|e| PatternError::UnsupportedSyntax(e))?; Ok(move |path: &HgPath| regex.is_match(path.as_bytes())) } #[cfg(not(feature = "with-re2"))] fn re_matcher(_: &[u8]) -> PatternResult<Box<dyn Fn(&HgPath) -> bool + Sync>> { Err(PatternError::Re2NotInstalled) } /// Returns the regex pattern and a function that matches an `HgPath` against /// said regex formed by the given ignore patterns. fn build_regex_match<'a>( ignore_patterns: &'a [&'a IgnorePattern], ) -> PatternResult<(Vec<u8>, Box<dyn Fn(&HgPath) -> bool + Sync>)> { let regexps: Result<Vec<_>, PatternError> = ignore_patterns .into_iter() .map(|k| build_single_regex(*k)) .collect(); let regexps = regexps?; let full_regex = regexps.join(&b'|'); let matcher = re_matcher(&full_regex)?; let func = Box::new(move |filename: &HgPath| matcher(filename)); Ok((full_regex, func)) } /// Returns roots and directories corresponding to each pattern. /// /// This calculates the roots and directories exactly matching the patterns and /// returns a tuple of (roots, dirs). It does not return other directories /// which may also need to be considered, like the parent directories. fn roots_and_dirs( ignore_patterns: &[IgnorePattern], ) -> (Vec<HgPathBuf>, Vec<HgPathBuf>) { let mut roots = Vec::new(); let mut dirs = Vec::new(); for ignore_pattern in ignore_patterns { let IgnorePattern { syntax, pattern, .. } = ignore_pattern; match syntax { PatternSyntax::RootGlob | PatternSyntax::Glob => { let mut root = vec![]; for p in pattern.split(|c| *c == b'/') { if p.iter().any(|c| match *c { b'[' | b'{' | b'*' | b'?' => true, _ => false, }) { break; } root.push(HgPathBuf::from_bytes(p)); } let buf = root.iter().fold(HgPathBuf::new(), |acc, r| acc.join(r)); roots.push(buf); } PatternSyntax::Path | PatternSyntax::RelPath => { let pat = HgPath::new(if pattern == b"." { &[] as &[u8] } else { pattern }); roots.push(pat.to_owned()); } PatternSyntax::RootFiles => { let pat = if pattern == b"." { &[] as &[u8] } else { pattern }; dirs.push(HgPathBuf::from_bytes(pat)); } _ => { roots.push(HgPathBuf::new()); } } } (roots, dirs) } /// Paths extracted from patterns #[derive(Debug, PartialEq)] struct RootsDirsAndParents { /// Directories to match recursively pub roots: HashSet<HgPathBuf>, /// Directories to match non-recursively pub dirs: HashSet<HgPathBuf>, /// Implicitly required directories to go to items in either roots or dirs pub parents: HashSet<HgPathBuf>, } /// Extract roots, dirs and parents from patterns. fn roots_dirs_and_parents( ignore_patterns: &[IgnorePattern], ) -> PatternResult<RootsDirsAndParents> { let (roots, dirs) = roots_and_dirs(ignore_patterns); let mut parents = HashSet::new(); parents.extend( DirsMultiset::from_manifest(&dirs) .map_err(|e| match e { DirstateMapError::InvalidPath(e) => e, _ => unreachable!(), })? .iter() .map(|k| k.to_owned()), ); parents.extend( DirsMultiset::from_manifest(&roots) .map_err(|e| match e { DirstateMapError::InvalidPath(e) => e, _ => unreachable!(), })? .iter() .map(|k| k.to_owned()), ); Ok(RootsDirsAndParents { roots: HashSet::from_iter(roots), dirs: HashSet::from_iter(dirs), parents, }) } #[cfg(test)] mod tests { use super::*; use pretty_assertions::assert_eq; use std::path::Path; #[test] fn test_roots_and_dirs() { let pats = vec![ IgnorePattern::new(PatternSyntax::Glob, b"g/h/*", Path::new("")), IgnorePattern::new(PatternSyntax::Glob, b"g/h", Path::new("")), IgnorePattern::new(PatternSyntax::Glob, b"g*", Path::new("")), ]; let (roots, dirs) = roots_and_dirs(&pats); assert_eq!( roots, vec!( HgPathBuf::from_bytes(b"g/h"), HgPathBuf::from_bytes(b"g/h"), HgPathBuf::new() ), ); assert_eq!(dirs, vec!()); } #[test] fn test_roots_dirs_and_parents() { let pats = vec![ IgnorePattern::new(PatternSyntax::Glob, b"g/h/*", Path::new("")), IgnorePattern::new(PatternSyntax::Glob, b"g/h", Path::new("")), IgnorePattern::new(PatternSyntax::Glob, b"g*", Path::new("")), ]; let mut roots = HashSet::new(); roots.insert(HgPathBuf::from_bytes(b"g/h")); roots.insert(HgPathBuf::new()); let dirs = HashSet::new(); let mut parents = HashSet::new(); parents.insert(HgPathBuf::new()); parents.insert(HgPathBuf::from_bytes(b"g")); assert_eq!( roots_dirs_and_parents(&pats).unwrap(), RootsDirsAndParents {roots, dirs, parents} ); } #[test] fn test_filematcher_visit_children_set() { // Visitchildrenset let files = vec![HgPath::new(b"dir/subdir/foo.txt")]; let matcher = FileMatcher::new(&files).unwrap(); let mut set = HashSet::new(); set.insert(HgPath::new(b"dir")); assert_eq!( matcher.visit_children_set(HgPath::new(b"")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"subdir")); assert_eq!( matcher.visit_children_set(HgPath::new(b"dir")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"foo.txt")); assert_eq!( matcher.visit_children_set(HgPath::new(b"dir/subdir")), VisitChildrenSet::Set(set) ); assert_eq!( matcher.visit_children_set(HgPath::new(b"dir/subdir/x")), VisitChildrenSet::Empty ); assert_eq!( matcher.visit_children_set(HgPath::new(b"dir/subdir/foo.txt")), VisitChildrenSet::Empty ); assert_eq!( matcher.visit_children_set(HgPath::new(b"folder")), VisitChildrenSet::Empty ); } #[test] fn test_filematcher_visit_children_set_files_and_dirs() { let files = vec![ HgPath::new(b"rootfile.txt"), HgPath::new(b"a/file1.txt"), HgPath::new(b"a/b/file2.txt"), // No file in a/b/c HgPath::new(b"a/b/c/d/file4.txt"), ]; let matcher = FileMatcher::new(&files).unwrap(); let mut set = HashSet::new(); set.insert(HgPath::new(b"a")); set.insert(HgPath::new(b"rootfile.txt")); assert_eq!( matcher.visit_children_set(HgPath::new(b"")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"b")); set.insert(HgPath::new(b"file1.txt")); assert_eq!( matcher.visit_children_set(HgPath::new(b"a")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"c")); set.insert(HgPath::new(b"file2.txt")); assert_eq!( matcher.visit_children_set(HgPath::new(b"a/b")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"d")); assert_eq!( matcher.visit_children_set(HgPath::new(b"a/b/c")), VisitChildrenSet::Set(set) ); let mut set = HashSet::new(); set.insert(HgPath::new(b"file4.txt")); assert_eq!( matcher.visit_children_set(HgPath::new(b"a/b/c/d")), VisitChildrenSet::Set(set) ); assert_eq!( matcher.visit_children_set(HgPath::new(b"a/b/c/d/e")), VisitChildrenSet::Empty ); assert_eq!( matcher.visit_children_set(HgPath::new(b"folder")), VisitChildrenSet::Empty ); } }