Mercurial > hg-stable
changeset 44346:d42eea9a0494
rust-filepatterns: improve API and robustness for pattern files parsing
Within the next few patches we will be using this new API.
Differential Revision: https://phab.mercurial-scm.org/D7908
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 16 Jan 2020 10:28:40 +0100 |
parents | 4caac36c66bc |
children | 2fe89bec8011 |
files | rust/hg-core/src/filepatterns.rs rust/hg-core/src/lib.rs |
diffstat | 2 files changed, 245 insertions(+), 65 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/rust/hg-core/src/filepatterns.rs Tue Jan 14 17:10:20 2020 +0100
+++ b/rust/hg-core/src/filepatterns.rs Thu Jan 16 10:28:40 2020 +0100
@@ -7,9 +7,7 @@
 
 //! Handling of Mercurial-specific patterns.
 
-use crate::{
-    utils::SliceExt, FastHashMap, LineNumber, PatternError, PatternFileError,
-};
+use crate::{utils::SliceExt, FastHashMap, PatternError};
 use lazy_static::lazy_static;
 use regex::bytes::{NoExpand, Regex};
 use std::fs::File;
@@ -32,18 +30,28 @@
 const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
     &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
 
+/// Appended to the regexp of globs
+const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
+
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum PatternSyntax {
+    /// A regular expression
     Regexp,
     /// Glob that matches at the front of the path
     RootGlob,
     /// Glob that matches at any suffix of the path (still anchored at
     /// slashes)
     Glob,
+    /// a path relative to repository root, which is matched recursively
     Path,
+    /// A path relative to cwd
     RelPath,
+    /// an unrooted glob (*.rs matches Rust files in all dirs)
     RelGlob,
+    /// A regexp that needn't match the start of a name
     RelRegexp,
+    /// A path relative to repository root, which is matched non-recursively
+    /// (will not match subdirectories)
     RootFiles,
 }
 
@@ -125,16 +133,18 @@
         .collect()
 }
 
-fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+pub fn parse_pattern_syntax(
+    kind: &[u8],
+) -> Result<PatternSyntax, PatternError> {
     match kind {
-        b"re" => Ok(PatternSyntax::Regexp),
-        b"path" => Ok(PatternSyntax::Path),
-        b"relpath" => Ok(PatternSyntax::RelPath),
-        b"rootfilesin" => Ok(PatternSyntax::RootFiles),
-        b"relglob" => Ok(PatternSyntax::RelGlob),
-        b"relre" => Ok(PatternSyntax::RelRegexp),
-        b"glob" => Ok(PatternSyntax::Glob),
-        b"rootglob" => Ok(PatternSyntax::RootGlob),
+        b"re:" => Ok(PatternSyntax::Regexp),
+        b"path:" => Ok(PatternSyntax::Path),
+        b"relpath:" => Ok(PatternSyntax::RelPath),
+        b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
+        b"relglob:" => Ok(PatternSyntax::RelGlob),
+        b"relre:" => Ok(PatternSyntax::RelRegexp),
+        b"glob:" => Ok(PatternSyntax::Glob),
+        b"rootglob:" => Ok(PatternSyntax::RootGlob),
         _ => Err(PatternError::UnsupportedSyntax(
             String::from_utf8_lossy(kind).to_string(),
         )),
@@ -144,11 +154,10 @@
 /// Builds the regex that corresponds to the given pattern.
 /// If within a `syntax: regexp` context, returns the pattern,
 /// otherwise, returns the corresponding regex.
-fn _build_single_regex(
-    syntax: PatternSyntax,
-    pattern: &[u8],
-    globsuffix: &[u8],
-) -> Vec<u8> {
+fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
+    let IgnorePattern {
+        syntax, pattern, ..
+    } = entry;
     if pattern.is_empty() {
         return vec![];
     }
@@ -158,7 +167,7 @@
             if pattern[0] == b'^' {
                 return pattern.to_owned();
             }
-            [b".*", pattern].concat()
+            [&b".*"[..], pattern].concat()
         }
         PatternSyntax::Path | PatternSyntax::RelPath => {
             if pattern == b"." {
@@ -181,13 +190,13 @@
         PatternSyntax::RelGlob => {
             let glob_re = glob_to_re(pattern);
             if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
-                [b".*", rest, globsuffix].concat()
+                [b".*", rest, GLOB_SUFFIX].concat()
             } else {
-                [b"(?:|.*/)", glob_re.as_slice(), globsuffix].concat()
+                [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
             }
         }
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
-            [glob_to_re(pattern).as_slice(), globsuffix].concat()
+            [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
     }
 }
@@ -195,22 +204,73 @@
 const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
     [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
 
+/// TODO support other platforms
+#[cfg(unix)]
+pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
+    if bytes.is_empty() {
+        return b".".to_vec();
+    }
+    let sep = b'/';
+
+    let mut initial_slashes = bytes.iter().take_while(|b| **b == sep).count();
+    if initial_slashes > 2 {
+        // POSIX allows one or two initial slashes, but treats three or more
+        // as single slash.
+        initial_slashes = 1;
+    }
+    let components = bytes
+        .split(|b| *b == sep)
+        .filter(|c| !(c.is_empty() || c == b"."))
+        .fold(vec![], |mut acc, component| {
+            if component != b".."
+                || (initial_slashes == 0 && acc.is_empty())
+                || (!acc.is_empty() && acc[acc.len() - 1] == b"..")
+            {
+                acc.push(component)
+            } else if !acc.is_empty() {
+                acc.pop();
+            }
+            acc
+        });
+    let mut new_bytes = components.join(&sep);
+
+    if initial_slashes > 0 {
+        let mut buf: Vec<_> = (0..initial_slashes).map(|_| sep).collect();
+        buf.extend(new_bytes);
+        new_bytes = buf;
+    }
+    if new_bytes.is_empty() {
+        b".".to_vec()
+    } else {
+        new_bytes
+    }
+}
+
 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
-    kind: &[u8],
-    pat: &[u8],
-    globsuffix: &[u8],
+    entry: &IgnorePattern,
 ) -> Result<Vec<u8>, PatternError> {
-    let enum_kind = parse_pattern_syntax(kind)?;
-    if enum_kind == PatternSyntax::RootGlob
-        && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+    let IgnorePattern {
+        pattern, syntax, ..
+    } = entry;
+    let pattern = match syntax {
+        PatternSyntax::RootGlob
+        | PatternSyntax::Path
+        | PatternSyntax::RelGlob
+        | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
+        _ => pattern.to_owned(),
+    };
+    if *syntax == PatternSyntax::RootGlob
+        && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        let mut escaped = escape_pattern(pat);
-        escaped.extend(b"(?:/|$)");
+        let mut escaped = escape_pattern(&pattern);
+        escaped.extend(GLOB_SUFFIX);
         Ok(escaped)
     } else {
-        Ok(_build_single_regex(enum_kind, pat, globsuffix))
+        let mut entry = entry.clone();
+        entry.pattern = pattern;
+        Ok(_build_single_regex(&entry))
     }
 }
 
@@ -222,24 +282,29 @@
         m.insert(b"regexp".as_ref(), b"relre:".as_ref());
         m.insert(b"glob".as_ref(), b"relglob:".as_ref());
         m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
-        m.insert(b"include".as_ref(), b"include".as_ref());
-        m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
+        m.insert(b"include".as_ref(), b"include:".as_ref());
+        m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
         m
     };
 }
 
-pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
-type WarningTuple = (PathBuf, Vec<u8>);
+#[derive(Debug)]
+pub enum PatternFileWarning {
+    /// (file path, syntax bytes)
+    InvalidSyntax(PathBuf, Vec<u8>),
+    /// File path
+    NoSuchFile(PathBuf),
+}
 
 pub fn parse_pattern_file_contents<P: AsRef<Path>>(
     lines: &[u8],
     file_path: P,
     warn: bool,
-) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
     let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
     let comment_escape_regex = Regex::new(r"\\#").unwrap();
-    let mut inputs: Vec<PatternTuple> = vec![];
-    let mut warnings: Vec<WarningTuple> = vec![];
+    let mut inputs: Vec<IgnorePattern> = vec![];
+    let mut warnings: Vec<PatternFileWarning> = vec![];
 
     let mut current_syntax = b"relre:".as_ref();
 
@@ -267,8 +332,10 @@
             if let Some(rel_syntax) = SYNTAXES.get(syntax) {
                 current_syntax = rel_syntax;
             } else if warn {
-                warnings
-                    .push((file_path.as_ref().to_owned(), syntax.to_owned()));
+                warnings.push(PatternFileWarning::InvalidSyntax(
+                    file_path.as_ref().to_owned(),
+                    syntax.to_owned(),
+                ));
             }
             continue;
         }
@@ -288,34 +355,82 @@
             }
         }
 
-        inputs.push((
-            [line_syntax, line].concat(),
-            line_number,
-            line.to_owned(),
+        inputs.push(IgnorePattern::new(
+            parse_pattern_syntax(&line_syntax).map_err(|e| match e {
+                PatternError::UnsupportedSyntax(syntax) => {
+                    PatternError::UnsupportedSyntaxInFile(
+                        syntax,
+                        file_path.as_ref().to_string_lossy().into(),
+                        line_number,
+                    )
+                }
+                _ => e,
+            })?,
+            &line,
+            &file_path,
         ));
     }
-    (inputs, warnings)
+    Ok((inputs, warnings))
 }
 
 pub fn read_pattern_file<P: AsRef<Path>>(
     file_path: P,
     warn: bool,
-) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
-    let mut f = File::open(file_path.as_ref())?;
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
+    let mut f = match File::open(file_path.as_ref()) {
+        Ok(f) => Ok(f),
+        Err(e) => match e.kind() {
+            std::io::ErrorKind::NotFound => {
+                return Ok((
+                    vec![],
+                    vec![PatternFileWarning::NoSuchFile(
+                        file_path.as_ref().to_owned(),
+                    )],
+                ))
+            }
+            _ => Err(e),
+        },
+    }?;
     let mut contents = Vec::new();
 
     f.read_to_end(&mut contents)?;
 
-    Ok(parse_pattern_file_contents(&contents, file_path, warn))
+    Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
+}
+
+/// Represents an entry in an "ignore" file.
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct IgnorePattern {
+    pub syntax: PatternSyntax,
+    pub pattern: Vec<u8>,
+    pub source: PathBuf,
 }
 
+impl IgnorePattern {
+    pub fn new(
+        syntax: PatternSyntax,
+        pattern: &[u8],
+        source: impl AsRef<Path>,
+    ) -> Self {
+        Self {
+            syntax,
+            pattern: pattern.to_owned(),
+            source: source.as_ref().to_owned(),
+        }
+    }
+}
+
+pub type PatternResult<T> = Result<T, PatternError>;
+
 #[cfg(test)]
 mod tests {
     use super::*;
+    use pretty_assertions::assert_eq;
 
     #[test]
     fn escape_pattern_test() {
-        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+        let untouched =
+            br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
         assert_eq!(escape_pattern(untouched), untouched.to_vec());
         // All escape codes
         assert_eq!(
@@ -342,39 +457,78 @@
         let lines = b"syntax: glob\n*.elc";
 
         assert_eq!(
-            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
+            vec![IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"*.elc",
+                Path::new("file_path")
+            )],
         );
 
         let lines = b"syntax: include\nsyntax: glob";
 
         assert_eq!(
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
             vec![]
         );
         let lines = b"glob:**.o";
         assert_eq!(
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
-            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
+            vec![IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"**.o",
+                Path::new("file_path")
+            )]
+        );
+    }
+
+    #[test]
+    fn test_build_single_regex() {
+        assert_eq!(
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"rust/target/",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"(?:|.*/)rust/target(?:/|$)".to_vec(),
         );
     }
 
     #[test]
     fn test_build_single_regex_shortcut() {
         assert_eq!(
-            br"(?:/|$)".to_vec(),
-            build_single_regex(b"rootglob", b"", b"").unwrap()
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"\.(?:/|$)".to_vec(),
         );
         assert_eq!(
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"whatever",
+                Path::new("")
+            ))
+            .unwrap(),
             br"whatever(?:/|$)".to_vec(),
-            build_single_regex(b"rootglob", b"whatever", b"").unwrap()
         );
         assert_eq!(
-            br"[^/]*\.o".to_vec(),
-            build_single_regex(b"rootglob", b"*.o", b"").unwrap()
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"*.o",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"[^/]*\.o(?:/|$)".to_vec(),
         );
     }
 }
```
```diff
--- a/rust/hg-core/src/lib.rs Tue Jan 14 17:10:20 2020 +0100
+++ b/rust/hg-core/src/lib.rs Thu Jan 16 10:28:40 2020 +0100
@@ -25,7 +25,8 @@
 use crate::utils::hg_path::{HgPathBuf, HgPathError};
 
 pub use filepatterns::{
-    build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
+    parse_pattern_syntax, read_pattern_file, IgnorePattern,
+    PatternFileWarning, PatternSyntax,
 };
 use std::collections::HashMap;
 use twox_hash::RandomXxHashBuilder64;
@@ -115,18 +116,31 @@
 
 #[derive(Debug)]
 pub enum PatternError {
+    Path(HgPathError),
     UnsupportedSyntax(String),
+    UnsupportedSyntaxInFile(String, String, usize),
+    TooLong(usize),
+    IO(std::io::Error),
 }
 
-#[derive(Debug)]
-pub enum PatternFileError {
-    IO(std::io::Error),
-    Pattern(PatternError, LineNumber),
-}
-
-impl From<std::io::Error> for PatternFileError {
-    fn from(e: std::io::Error) -> Self {
-        PatternFileError::IO(e)
+impl ToString for PatternError {
+    fn to_string(&self) -> String {
+        match self {
+            PatternError::UnsupportedSyntax(syntax) => {
+                format!("Unsupported syntax {}", syntax)
+            }
+            PatternError::UnsupportedSyntaxInFile(syntax, file_path, line) => {
+                format!(
+                    "{}:{}: unsupported syntax {}",
+                    file_path, line, syntax
+                )
+            }
+            PatternError::TooLong(size) => {
+                format!("matcher pattern is too long ({} bytes)", size)
+            }
+            PatternError::IO(e) => e.to_string(),
+            PatternError::Path(e) => e.to_string(),
+        }
     }
 }
 
@@ -141,3 +155,15 @@
         DirstateError::IO(e)
     }
 }
+
+impl From<std::io::Error> for PatternError {
+    fn from(e: std::io::Error) -> Self {
+        PatternError::IO(e)
+    }
+}
+
+impl From<HgPathError> for PatternError {
+    fn from(e: HgPathError) -> Self {
+        PatternError::Path(e)
+    }
+}
```