changeset 44346:d42eea9a0494

rust-filepatterns: improve API and robustness for pattern files parsing Within the next few patches we will be using this new API. Differential Revision: https://phab.mercurial-scm.org/D7908
author Raphaël Gomès <rgomes@octobus.net>
date Thu, 16 Jan 2020 10:28:40 +0100
parents 4caac36c66bc
children 2fe89bec8011
files rust/hg-core/src/filepatterns.rs rust/hg-core/src/lib.rs
diffstat 2 files changed, 245 insertions(+), 65 deletions(-) [+]
line wrap: on
line diff
--- a/rust/hg-core/src/filepatterns.rs	Tue Jan 14 17:10:20 2020 +0100
+++ b/rust/hg-core/src/filepatterns.rs	Thu Jan 16 10:28:40 2020 +0100
@@ -7,9 +7,7 @@
 
 //! Handling of Mercurial-specific patterns.
 
-use crate::{
-    utils::SliceExt, FastHashMap, LineNumber, PatternError, PatternFileError,
-};
+use crate::{utils::SliceExt, FastHashMap, PatternError};
 use lazy_static::lazy_static;
 use regex::bytes::{NoExpand, Regex};
 use std::fs::File;
@@ -32,18 +30,28 @@
 const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
     &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
 
+/// Appended to the regexp of globs
+const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
+
 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
 pub enum PatternSyntax {
+    /// A regular expression
     Regexp,
     /// Glob that matches at the front of the path
     RootGlob,
     /// Glob that matches at any suffix of the path (still anchored at
     /// slashes)
     Glob,
+    /// a path relative to repository root, which is matched recursively
     Path,
+    /// A path relative to cwd
     RelPath,
+    /// an unrooted glob (*.rs matches Rust files in all dirs)
     RelGlob,
+    /// A regexp that needn't match the start of a name
     RelRegexp,
+    /// A path relative to repository root, which is matched non-recursively
+    /// (will not match subdirectories)
     RootFiles,
 }
 
@@ -125,16 +133,18 @@
         .collect()
 }
 
-fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+pub fn parse_pattern_syntax(
+    kind: &[u8],
+) -> Result<PatternSyntax, PatternError> {
     match kind {
-        b"re" => Ok(PatternSyntax::Regexp),
-        b"path" => Ok(PatternSyntax::Path),
-        b"relpath" => Ok(PatternSyntax::RelPath),
-        b"rootfilesin" => Ok(PatternSyntax::RootFiles),
-        b"relglob" => Ok(PatternSyntax::RelGlob),
-        b"relre" => Ok(PatternSyntax::RelRegexp),
-        b"glob" => Ok(PatternSyntax::Glob),
-        b"rootglob" => Ok(PatternSyntax::RootGlob),
+        b"re:" => Ok(PatternSyntax::Regexp),
+        b"path:" => Ok(PatternSyntax::Path),
+        b"relpath:" => Ok(PatternSyntax::RelPath),
+        b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
+        b"relglob:" => Ok(PatternSyntax::RelGlob),
+        b"relre:" => Ok(PatternSyntax::RelRegexp),
+        b"glob:" => Ok(PatternSyntax::Glob),
+        b"rootglob:" => Ok(PatternSyntax::RootGlob),
         _ => Err(PatternError::UnsupportedSyntax(
             String::from_utf8_lossy(kind).to_string(),
         )),
@@ -144,11 +154,10 @@
 /// Builds the regex that corresponds to the given pattern.
 /// If within a `syntax: regexp` context, returns the pattern,
 /// otherwise, returns the corresponding regex.
-fn _build_single_regex(
-    syntax: PatternSyntax,
-    pattern: &[u8],
-    globsuffix: &[u8],
-) -> Vec<u8> {
+fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
+    let IgnorePattern {
+        syntax, pattern, ..
+    } = entry;
     if pattern.is_empty() {
         return vec![];
     }
@@ -158,7 +167,7 @@
             if pattern[0] == b'^' {
                 return pattern.to_owned();
             }
-            [b".*", pattern].concat()
+            [&b".*"[..], pattern].concat()
         }
         PatternSyntax::Path | PatternSyntax::RelPath => {
             if pattern == b"." {
@@ -181,13 +190,13 @@
         PatternSyntax::RelGlob => {
             let glob_re = glob_to_re(pattern);
             if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
-                [b".*", rest, globsuffix].concat()
+                [b".*", rest, GLOB_SUFFIX].concat()
             } else {
-                [b"(?:|.*/)", glob_re.as_slice(), globsuffix].concat()
+                [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
             }
         }
         PatternSyntax::Glob | PatternSyntax::RootGlob => {
-            [glob_to_re(pattern).as_slice(), globsuffix].concat()
+            [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
         }
     }
 }
@@ -195,22 +204,73 @@
 const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
     [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
 
+/// TODO support other platforms
+#[cfg(unix)]
+pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
+    if bytes.is_empty() {
+        return b".".to_vec();
+    }
+    let sep = b'/';
+
+    let mut initial_slashes = bytes.iter().take_while(|b| **b == sep).count();
+    if initial_slashes > 2 {
+        // POSIX allows one or two initial slashes, but treats three or more
+        // as single slash.
+        initial_slashes = 1;
+    }
+    let components = bytes
+        .split(|b| *b == sep)
+        .filter(|c| !(c.is_empty() || c == b"."))
+        .fold(vec![], |mut acc, component| {
+            if component != b".."
+                || (initial_slashes == 0 && acc.is_empty())
+                || (!acc.is_empty() && acc[acc.len() - 1] == b"..")
+            {
+                acc.push(component)
+            } else if !acc.is_empty() {
+                acc.pop();
+            }
+            acc
+        });
+    let mut new_bytes = components.join(&sep);
+
+    if initial_slashes > 0 {
+        let mut buf: Vec<_> = (0..initial_slashes).map(|_| sep).collect();
+        buf.extend(new_bytes);
+        new_bytes = buf;
+    }
+    if new_bytes.is_empty() {
+        b".".to_vec()
+    } else {
+        new_bytes
+    }
+}
+
 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
 /// that don't need to be transformed into a regex.
 pub fn build_single_regex(
-    kind: &[u8],
-    pat: &[u8],
-    globsuffix: &[u8],
+    entry: &IgnorePattern,
 ) -> Result<Vec<u8>, PatternError> {
-    let enum_kind = parse_pattern_syntax(kind)?;
-    if enum_kind == PatternSyntax::RootGlob
-        && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+    let IgnorePattern {
+        pattern, syntax, ..
+    } = entry;
+    let pattern = match syntax {
+        PatternSyntax::RootGlob
+        | PatternSyntax::Path
+        | PatternSyntax::RelGlob
+        | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
+        _ => pattern.to_owned(),
+    };
+    if *syntax == PatternSyntax::RootGlob
+        && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
     {
-        let mut escaped = escape_pattern(pat);
-        escaped.extend(b"(?:/|$)");
+        let mut escaped = escape_pattern(&pattern);
+        escaped.extend(GLOB_SUFFIX);
         Ok(escaped)
     } else {
-        Ok(_build_single_regex(enum_kind, pat, globsuffix))
+        let mut entry = entry.clone();
+        entry.pattern = pattern;
+        Ok(_build_single_regex(&entry))
     }
 }
 
@@ -222,24 +282,29 @@
         m.insert(b"regexp".as_ref(), b"relre:".as_ref());
         m.insert(b"glob".as_ref(), b"relglob:".as_ref());
         m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
-        m.insert(b"include".as_ref(), b"include".as_ref());
-        m.insert(b"subinclude".as_ref(), b"subinclude".as_ref());
+        m.insert(b"include".as_ref(), b"include:".as_ref());
+        m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
         m
     };
 }
 
-pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>);
-type WarningTuple = (PathBuf, Vec<u8>);
+#[derive(Debug)]
+pub enum PatternFileWarning {
+    /// (file path, syntax bytes)
+    InvalidSyntax(PathBuf, Vec<u8>),
+    /// File path
+    NoSuchFile(PathBuf),
+}
 
 pub fn parse_pattern_file_contents<P: AsRef<Path>>(
     lines: &[u8],
     file_path: P,
     warn: bool,
-) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
     let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
     let comment_escape_regex = Regex::new(r"\\#").unwrap();
-    let mut inputs: Vec<PatternTuple> = vec![];
-    let mut warnings: Vec<WarningTuple> = vec![];
+    let mut inputs: Vec<IgnorePattern> = vec![];
+    let mut warnings: Vec<PatternFileWarning> = vec![];
 
     let mut current_syntax = b"relre:".as_ref();
 
@@ -267,8 +332,10 @@
             if let Some(rel_syntax) = SYNTAXES.get(syntax) {
                 current_syntax = rel_syntax;
             } else if warn {
-                warnings
-                    .push((file_path.as_ref().to_owned(), syntax.to_owned()));
+                warnings.push(PatternFileWarning::InvalidSyntax(
+                    file_path.as_ref().to_owned(),
+                    syntax.to_owned(),
+                ));
             }
             continue;
         }
@@ -288,34 +355,82 @@
             }
         }
 
-        inputs.push((
-            [line_syntax, line].concat(),
-            line_number,
-            line.to_owned(),
+        inputs.push(IgnorePattern::new(
+            parse_pattern_syntax(&line_syntax).map_err(|e| match e {
+                PatternError::UnsupportedSyntax(syntax) => {
+                    PatternError::UnsupportedSyntaxInFile(
+                        syntax,
+                        file_path.as_ref().to_string_lossy().into(),
+                        line_number,
+                    )
+                }
+                _ => e,
+            })?,
+            &line,
+            &file_path,
         ));
     }
-    (inputs, warnings)
+    Ok((inputs, warnings))
 }
 
 pub fn read_pattern_file<P: AsRef<Path>>(
     file_path: P,
     warn: bool,
-) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
-    let mut f = File::open(file_path.as_ref())?;
+) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
+    let mut f = match File::open(file_path.as_ref()) {
+        Ok(f) => Ok(f),
+        Err(e) => match e.kind() {
+            std::io::ErrorKind::NotFound => {
+                return Ok((
+                    vec![],
+                    vec![PatternFileWarning::NoSuchFile(
+                        file_path.as_ref().to_owned(),
+                    )],
+                ))
+            }
+            _ => Err(e),
+        },
+    }?;
     let mut contents = Vec::new();
 
     f.read_to_end(&mut contents)?;
 
-    Ok(parse_pattern_file_contents(&contents, file_path, warn))
+    Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
+}
+
+/// Represents an entry in an "ignore" file.
+#[derive(Debug, Eq, PartialEq, Clone)]
+pub struct IgnorePattern {
+    pub syntax: PatternSyntax,
+    pub pattern: Vec<u8>,
+    pub source: PathBuf,
 }
 
+impl IgnorePattern {
+    pub fn new(
+        syntax: PatternSyntax,
+        pattern: &[u8],
+        source: impl AsRef<Path>,
+    ) -> Self {
+        Self {
+            syntax,
+            pattern: pattern.to_owned(),
+            source: source.as_ref().to_owned(),
+        }
+    }
+}
+
+pub type PatternResult<T> = Result<T, PatternError>;
+
 #[cfg(test)]
 mod tests {
     use super::*;
+    use pretty_assertions::assert_eq;
 
     #[test]
     fn escape_pattern_test() {
-        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+        let untouched =
+            br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
         assert_eq!(escape_pattern(untouched), untouched.to_vec());
         // All escape codes
         assert_eq!(
@@ -342,39 +457,78 @@
         let lines = b"syntax: glob\n*.elc";
 
         assert_eq!(
-            vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
+            vec![IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"*.elc",
+                Path::new("file_path")
+            )],
         );
 
         let lines = b"syntax: include\nsyntax: glob";
 
         assert_eq!(
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
             vec![]
         );
         let lines = b"glob:**.o";
         assert_eq!(
             parse_pattern_file_contents(lines, Path::new("file_path"), false)
+                .unwrap()
                 .0,
-            vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())]
+            vec![IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"**.o",
+                Path::new("file_path")
+            )]
+        );
+    }
+
+    #[test]
+    fn test_build_single_regex() {
+        assert_eq!(
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RelGlob,
+                b"rust/target/",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"(?:|.*/)rust/target(?:/|$)".to_vec(),
         );
     }
 
     #[test]
     fn test_build_single_regex_shortcut() {
         assert_eq!(
-            br"(?:/|$)".to_vec(),
-            build_single_regex(b"rootglob", b"", b"").unwrap()
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"\.(?:/|$)".to_vec(),
         );
         assert_eq!(
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"whatever",
+                Path::new("")
+            ))
+            .unwrap(),
             br"whatever(?:/|$)".to_vec(),
-            build_single_regex(b"rootglob", b"whatever", b"").unwrap()
         );
         assert_eq!(
-            br"[^/]*\.o".to_vec(),
-            build_single_regex(b"rootglob", b"*.o", b"").unwrap()
+            build_single_regex(&IgnorePattern::new(
+                PatternSyntax::RootGlob,
+                b"*.o",
+                Path::new("")
+            ))
+            .unwrap(),
+            br"[^/]*\.o(?:/|$)".to_vec(),
         );
     }
 }
--- a/rust/hg-core/src/lib.rs	Tue Jan 14 17:10:20 2020 +0100
+++ b/rust/hg-core/src/lib.rs	Thu Jan 16 10:28:40 2020 +0100
@@ -25,7 +25,8 @@
 
 use crate::utils::hg_path::{HgPathBuf, HgPathError};
 pub use filepatterns::{
-    build_single_regex, read_pattern_file, PatternSyntax, PatternTuple,
+    parse_pattern_syntax, read_pattern_file, IgnorePattern,
+    PatternFileWarning, PatternSyntax,
 };
 use std::collections::HashMap;
 use twox_hash::RandomXxHashBuilder64;
@@ -115,18 +116,31 @@
 
 #[derive(Debug)]
 pub enum PatternError {
+    Path(HgPathError),
     UnsupportedSyntax(String),
+    UnsupportedSyntaxInFile(String, String, usize),
+    TooLong(usize),
+    IO(std::io::Error),
 }
 
-#[derive(Debug)]
-pub enum PatternFileError {
-    IO(std::io::Error),
-    Pattern(PatternError, LineNumber),
-}
-
-impl From<std::io::Error> for PatternFileError {
-    fn from(e: std::io::Error) -> Self {
-        PatternFileError::IO(e)
+impl ToString for PatternError {
+    fn to_string(&self) -> String {
+        match self {
+            PatternError::UnsupportedSyntax(syntax) => {
+                format!("Unsupported syntax {}", syntax)
+            }
+            PatternError::UnsupportedSyntaxInFile(syntax, file_path, line) => {
+                format!(
+                    "{}:{}: unsupported syntax {}",
+                    file_path, line, syntax
+                )
+            }
+            PatternError::TooLong(size) => {
+                format!("matcher pattern is too long ({} bytes)", size)
+            }
+            PatternError::IO(e) => e.to_string(),
+            PatternError::Path(e) => e.to_string(),
+        }
     }
 }
 
@@ -141,3 +155,15 @@
         DirstateError::IO(e)
     }
 }
+
+impl From<std::io::Error> for PatternError {
+    fn from(e: std::io::Error) -> Self {
+        PatternError::IO(e)
+    }
+}
+
+impl From<HgPathError> for PatternError {
+    fn from(e: HgPathError) -> Self {
+        PatternError::Path(e)
+    }
+}