comparison rust/hg-core/src/filepatterns.rs @ 44303:d42eea9a0494

rust-filepatterns: improve API and robustness for pattern files parsing

Within the next few patches we will be using this new API.

Differential Revision: https://phab.mercurial-scm.org/D7908
author Raphaël Gomès <rgomes@octobus.net>
date Thu, 16 Jan 2020 10:28:40 +0100
parents 5ac243a92e37
children 2fe89bec8011
comparison
equal deleted inserted replaced
44301:4caac36c66bc 44303:d42eea9a0494
5 // This software may be used and distributed according to the terms of the 5 // This software may be used and distributed according to the terms of the
6 // GNU General Public License version 2 or any later version. 6 // GNU General Public License version 2 or any later version.
7 7
8 //! Handling of Mercurial-specific patterns. 8 //! Handling of Mercurial-specific patterns.
9 9
10 use crate::{ 10 use crate::{utils::SliceExt, FastHashMap, PatternError};
11 utils::SliceExt, FastHashMap, LineNumber, PatternError, PatternFileError,
12 };
13 use lazy_static::lazy_static; 11 use lazy_static::lazy_static;
14 use regex::bytes::{NoExpand, Regex}; 12 use regex::bytes::{NoExpand, Regex};
15 use std::fs::File; 13 use std::fs::File;
16 use std::io::Read; 14 use std::io::Read;
17 use std::path::{Path, PathBuf}; 15 use std::path::{Path, PathBuf};
30 28
31 /// These are matched in order 29 /// These are matched in order
32 const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] = 30 const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
33 &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")]; 31 &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
34 32
33 /// Appended to the regexp of globs
34 const GLOB_SUFFIX: &[u8; 7] = b"(?:/|$)";
35
35 #[derive(Debug, Copy, Clone, PartialEq, Eq)] 36 #[derive(Debug, Copy, Clone, PartialEq, Eq)]
36 pub enum PatternSyntax { 37 pub enum PatternSyntax {
38 /// A regular expression
37 Regexp, 39 Regexp,
38 /// Glob that matches at the front of the path 40 /// Glob that matches at the front of the path
39 RootGlob, 41 RootGlob,
40 /// Glob that matches at any suffix of the path (still anchored at 42 /// Glob that matches at any suffix of the path (still anchored at
41 /// slashes) 43 /// slashes)
42 Glob, 44 Glob,
45 /// a path relative to repository root, which is matched recursively
43 Path, 46 Path,
47 /// A path relative to cwd
44 RelPath, 48 RelPath,
49 /// an unrooted glob (*.rs matches Rust files in all dirs)
45 RelGlob, 50 RelGlob,
51 /// A regexp that needn't match the start of a name
46 RelRegexp, 52 RelRegexp,
53 /// A path relative to repository root, which is matched non-recursively
54 /// (will not match subdirectories)
47 RootFiles, 55 RootFiles,
48 } 56 }
49 57
50 /// Transforms a glob pattern into a regex 58 /// Transforms a glob pattern into a regex
51 fn glob_to_re(pat: &[u8]) -> Vec<u8> { 59 fn glob_to_re(pat: &[u8]) -> Vec<u8> {
123 .iter() 131 .iter()
124 .flat_map(|c| RE_ESCAPE[*c as usize].clone()) 132 .flat_map(|c| RE_ESCAPE[*c as usize].clone())
125 .collect() 133 .collect()
126 } 134 }
127 135
128 fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> { 136 pub fn parse_pattern_syntax(
137 kind: &[u8],
138 ) -> Result<PatternSyntax, PatternError> {
129 match kind { 139 match kind {
130 b"re" => Ok(PatternSyntax::Regexp), 140 b"re:" => Ok(PatternSyntax::Regexp),
131 b"path" => Ok(PatternSyntax::Path), 141 b"path:" => Ok(PatternSyntax::Path),
132 b"relpath" => Ok(PatternSyntax::RelPath), 142 b"relpath:" => Ok(PatternSyntax::RelPath),
133 b"rootfilesin" => Ok(PatternSyntax::RootFiles), 143 b"rootfilesin:" => Ok(PatternSyntax::RootFiles),
134 b"relglob" => Ok(PatternSyntax::RelGlob), 144 b"relglob:" => Ok(PatternSyntax::RelGlob),
135 b"relre" => Ok(PatternSyntax::RelRegexp), 145 b"relre:" => Ok(PatternSyntax::RelRegexp),
136 b"glob" => Ok(PatternSyntax::Glob), 146 b"glob:" => Ok(PatternSyntax::Glob),
137 b"rootglob" => Ok(PatternSyntax::RootGlob), 147 b"rootglob:" => Ok(PatternSyntax::RootGlob),
138 _ => Err(PatternError::UnsupportedSyntax( 148 _ => Err(PatternError::UnsupportedSyntax(
139 String::from_utf8_lossy(kind).to_string(), 149 String::from_utf8_lossy(kind).to_string(),
140 )), 150 )),
141 } 151 }
142 } 152 }
143 153
144 /// Builds the regex that corresponds to the given pattern. 154 /// Builds the regex that corresponds to the given pattern.
145 /// If within a `syntax: regexp` context, returns the pattern, 155 /// If within a `syntax: regexp` context, returns the pattern,
146 /// otherwise, returns the corresponding regex. 156 /// otherwise, returns the corresponding regex.
147 fn _build_single_regex( 157 fn _build_single_regex(entry: &IgnorePattern) -> Vec<u8> {
148 syntax: PatternSyntax, 158 let IgnorePattern {
149 pattern: &[u8], 159 syntax, pattern, ..
150 globsuffix: &[u8], 160 } = entry;
151 ) -> Vec<u8> {
152 if pattern.is_empty() { 161 if pattern.is_empty() {
153 return vec![]; 162 return vec![];
154 } 163 }
155 match syntax { 164 match syntax {
156 PatternSyntax::Regexp => pattern.to_owned(), 165 PatternSyntax::Regexp => pattern.to_owned(),
157 PatternSyntax::RelRegexp => { 166 PatternSyntax::RelRegexp => {
158 if pattern[0] == b'^' { 167 if pattern[0] == b'^' {
159 return pattern.to_owned(); 168 return pattern.to_owned();
160 } 169 }
161 [b".*", pattern].concat() 170 [&b".*"[..], pattern].concat()
162 } 171 }
163 PatternSyntax::Path | PatternSyntax::RelPath => { 172 PatternSyntax::Path | PatternSyntax::RelPath => {
164 if pattern == b"." { 173 if pattern == b"." {
165 return vec![]; 174 return vec![];
166 } 175 }
179 res 188 res
180 } 189 }
181 PatternSyntax::RelGlob => { 190 PatternSyntax::RelGlob => {
182 let glob_re = glob_to_re(pattern); 191 let glob_re = glob_to_re(pattern);
183 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") { 192 if let Some(rest) = glob_re.drop_prefix(b"[^/]*") {
184 [b".*", rest, globsuffix].concat() 193 [b".*", rest, GLOB_SUFFIX].concat()
185 } else { 194 } else {
186 [b"(?:|.*/)", glob_re.as_slice(), globsuffix].concat() 195 [b"(?:|.*/)", glob_re.as_slice(), GLOB_SUFFIX].concat()
187 } 196 }
188 } 197 }
189 PatternSyntax::Glob | PatternSyntax::RootGlob => { 198 PatternSyntax::Glob | PatternSyntax::RootGlob => {
190 [glob_to_re(pattern).as_slice(), globsuffix].concat() 199 [glob_to_re(pattern).as_slice(), GLOB_SUFFIX].concat()
191 } 200 }
192 } 201 }
193 } 202 }
194 203
195 const GLOB_SPECIAL_CHARACTERS: [u8; 7] = 204 const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
196 [b'*', b'?', b'[', b']', b'{', b'}', b'\\']; 205 [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
206
207 /// TODO support other platforms
208 #[cfg(unix)]
209 pub fn normalize_path_bytes(bytes: &[u8]) -> Vec<u8> {
210 if bytes.is_empty() {
211 return b".".to_vec();
212 }
213 let sep = b'/';
214
215 let mut initial_slashes = bytes.iter().take_while(|b| **b == sep).count();
216 if initial_slashes > 2 {
217 // POSIX allows one or two initial slashes, but treats three or more
218 // as single slash.
219 initial_slashes = 1;
220 }
221 let components = bytes
222 .split(|b| *b == sep)
223 .filter(|c| !(c.is_empty() || c == b"."))
224 .fold(vec![], |mut acc, component| {
225 if component != b".."
226 || (initial_slashes == 0 && acc.is_empty())
227 || (!acc.is_empty() && acc[acc.len() - 1] == b"..")
228 {
229 acc.push(component)
230 } else if !acc.is_empty() {
231 acc.pop();
232 }
233 acc
234 });
235 let mut new_bytes = components.join(&sep);
236
237 if initial_slashes > 0 {
238 let mut buf: Vec<_> = (0..initial_slashes).map(|_| sep).collect();
239 buf.extend(new_bytes);
240 new_bytes = buf;
241 }
242 if new_bytes.is_empty() {
243 b".".to_vec()
244 } else {
245 new_bytes
246 }
247 }
197 248
198 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs 249 /// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
199 /// that don't need to be transformed into a regex. 250 /// that don't need to be transformed into a regex.
200 pub fn build_single_regex( 251 pub fn build_single_regex(
201 kind: &[u8], 252 entry: &IgnorePattern,
202 pat: &[u8],
203 globsuffix: &[u8],
204 ) -> Result<Vec<u8>, PatternError> { 253 ) -> Result<Vec<u8>, PatternError> {
205 let enum_kind = parse_pattern_syntax(kind)?; 254 let IgnorePattern {
206 if enum_kind == PatternSyntax::RootGlob 255 pattern, syntax, ..
207 && !pat.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b)) 256 } = entry;
257 let pattern = match syntax {
258 PatternSyntax::RootGlob
259 | PatternSyntax::Path
260 | PatternSyntax::RelGlob
261 | PatternSyntax::RootFiles => normalize_path_bytes(&pattern),
262 _ => pattern.to_owned(),
263 };
264 if *syntax == PatternSyntax::RootGlob
265 && !pattern.iter().any(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
208 { 266 {
209 let mut escaped = escape_pattern(pat); 267 let mut escaped = escape_pattern(&pattern);
210 escaped.extend(b"(?:/|$)"); 268 escaped.extend(GLOB_SUFFIX);
211 Ok(escaped) 269 Ok(escaped)
212 } else { 270 } else {
213 Ok(_build_single_regex(enum_kind, pat, globsuffix)) 271 let mut entry = entry.clone();
272 entry.pattern = pattern;
273 Ok(_build_single_regex(&entry))
214 } 274 }
215 } 275 }
216 276
217 lazy_static! { 277 lazy_static! {
218 static ref SYNTAXES: FastHashMap<&'static [u8], &'static [u8]> = { 278 static ref SYNTAXES: FastHashMap<&'static [u8], &'static [u8]> = {
220 280
221 m.insert(b"re".as_ref(), b"relre:".as_ref()); 281 m.insert(b"re".as_ref(), b"relre:".as_ref());
222 m.insert(b"regexp".as_ref(), b"relre:".as_ref()); 282 m.insert(b"regexp".as_ref(), b"relre:".as_ref());
223 m.insert(b"glob".as_ref(), b"relglob:".as_ref()); 283 m.insert(b"glob".as_ref(), b"relglob:".as_ref());
224 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref()); 284 m.insert(b"rootglob".as_ref(), b"rootglob:".as_ref());
225 m.insert(b"include".as_ref(), b"include".as_ref()); 285 m.insert(b"include".as_ref(), b"include:".as_ref());
226 m.insert(b"subinclude".as_ref(), b"subinclude".as_ref()); 286 m.insert(b"subinclude".as_ref(), b"subinclude:".as_ref());
227 m 287 m
228 }; 288 };
229 } 289 }
230 290
231 pub type PatternTuple = (Vec<u8>, LineNumber, Vec<u8>); 291 #[derive(Debug)]
232 type WarningTuple = (PathBuf, Vec<u8>); 292 pub enum PatternFileWarning {
293 /// (file path, syntax bytes)
294 InvalidSyntax(PathBuf, Vec<u8>),
295 /// File path
296 NoSuchFile(PathBuf),
297 }
233 298
234 pub fn parse_pattern_file_contents<P: AsRef<Path>>( 299 pub fn parse_pattern_file_contents<P: AsRef<Path>>(
235 lines: &[u8], 300 lines: &[u8],
236 file_path: P, 301 file_path: P,
237 warn: bool, 302 warn: bool,
238 ) -> (Vec<PatternTuple>, Vec<WarningTuple>) { 303 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
239 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap(); 304 let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
240 let comment_escape_regex = Regex::new(r"\\#").unwrap(); 305 let comment_escape_regex = Regex::new(r"\\#").unwrap();
241 let mut inputs: Vec<PatternTuple> = vec![]; 306 let mut inputs: Vec<IgnorePattern> = vec![];
242 let mut warnings: Vec<WarningTuple> = vec![]; 307 let mut warnings: Vec<PatternFileWarning> = vec![];
243 308
244 let mut current_syntax = b"relre:".as_ref(); 309 let mut current_syntax = b"relre:".as_ref();
245 310
246 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() { 311 for (line_number, mut line) in lines.split(|c| *c == b'\n').enumerate() {
247 let line_number = line_number + 1; 312 let line_number = line_number + 1;
265 let syntax = syntax.trim(); 330 let syntax = syntax.trim();
266 331
267 if let Some(rel_syntax) = SYNTAXES.get(syntax) { 332 if let Some(rel_syntax) = SYNTAXES.get(syntax) {
268 current_syntax = rel_syntax; 333 current_syntax = rel_syntax;
269 } else if warn { 334 } else if warn {
270 warnings 335 warnings.push(PatternFileWarning::InvalidSyntax(
271 .push((file_path.as_ref().to_owned(), syntax.to_owned())); 336 file_path.as_ref().to_owned(),
337 syntax.to_owned(),
338 ));
272 } 339 }
273 continue; 340 continue;
274 } 341 }
275 342
276 let mut line_syntax: &[u8] = &current_syntax; 343 let mut line_syntax: &[u8] = &current_syntax;
286 line = rest; 353 line = rest;
287 break; 354 break;
288 } 355 }
289 } 356 }
290 357
291 inputs.push(( 358 inputs.push(IgnorePattern::new(
292 [line_syntax, line].concat(), 359 parse_pattern_syntax(&line_syntax).map_err(|e| match e {
293 line_number, 360 PatternError::UnsupportedSyntax(syntax) => {
294 line.to_owned(), 361 PatternError::UnsupportedSyntaxInFile(
362 syntax,
363 file_path.as_ref().to_string_lossy().into(),
364 line_number,
365 )
366 }
367 _ => e,
368 })?,
369 &line,
370 &file_path,
295 )); 371 ));
296 } 372 }
297 (inputs, warnings) 373 Ok((inputs, warnings))
298 } 374 }
299 375
300 pub fn read_pattern_file<P: AsRef<Path>>( 376 pub fn read_pattern_file<P: AsRef<Path>>(
301 file_path: P, 377 file_path: P,
302 warn: bool, 378 warn: bool,
303 ) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> { 379 ) -> Result<(Vec<IgnorePattern>, Vec<PatternFileWarning>), PatternError> {
304 let mut f = File::open(file_path.as_ref())?; 380 let mut f = match File::open(file_path.as_ref()) {
381 Ok(f) => Ok(f),
382 Err(e) => match e.kind() {
383 std::io::ErrorKind::NotFound => {
384 return Ok((
385 vec![],
386 vec![PatternFileWarning::NoSuchFile(
387 file_path.as_ref().to_owned(),
388 )],
389 ))
390 }
391 _ => Err(e),
392 },
393 }?;
305 let mut contents = Vec::new(); 394 let mut contents = Vec::new();
306 395
307 f.read_to_end(&mut contents)?; 396 f.read_to_end(&mut contents)?;
308 397
309 Ok(parse_pattern_file_contents(&contents, file_path, warn)) 398 Ok(parse_pattern_file_contents(&contents, file_path, warn)?)
310 } 399 }
400
401 /// Represents an entry in an "ignore" file.
402 #[derive(Debug, Eq, PartialEq, Clone)]
403 pub struct IgnorePattern {
404 pub syntax: PatternSyntax,
405 pub pattern: Vec<u8>,
406 pub source: PathBuf,
407 }
408
409 impl IgnorePattern {
410 pub fn new(
411 syntax: PatternSyntax,
412 pattern: &[u8],
413 source: impl AsRef<Path>,
414 ) -> Self {
415 Self {
416 syntax,
417 pattern: pattern.to_owned(),
418 source: source.as_ref().to_owned(),
419 }
420 }
421 }
422
423 pub type PatternResult<T> = Result<T, PatternError>;
311 424
312 #[cfg(test)] 425 #[cfg(test)]
313 mod tests { 426 mod tests {
314 use super::*; 427 use super::*;
428 use pretty_assertions::assert_eq;
315 429
316 #[test] 430 #[test]
317 fn escape_pattern_test() { 431 fn escape_pattern_test() {
318 let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#; 432 let untouched =
433 br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
319 assert_eq!(escape_pattern(untouched), untouched.to_vec()); 434 assert_eq!(escape_pattern(untouched), untouched.to_vec());
320 // All escape codes 435 // All escape codes
321 assert_eq!( 436 assert_eq!(
322 escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#), 437 escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
323 br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"# 438 br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
340 #[test] 455 #[test]
341 fn test_parse_pattern_file_contents() { 456 fn test_parse_pattern_file_contents() {
342 let lines = b"syntax: glob\n*.elc"; 457 let lines = b"syntax: glob\n*.elc";
343 458
344 assert_eq!( 459 assert_eq!(
345 vec![(b"relglob:*.elc".to_vec(), 2, b"*.elc".to_vec())],
346 parse_pattern_file_contents(lines, Path::new("file_path"), false) 460 parse_pattern_file_contents(lines, Path::new("file_path"), false)
461 .unwrap()
347 .0, 462 .0,
463 vec![IgnorePattern::new(
464 PatternSyntax::RelGlob,
465 b"*.elc",
466 Path::new("file_path")
467 )],
348 ); 468 );
349 469
350 let lines = b"syntax: include\nsyntax: glob"; 470 let lines = b"syntax: include\nsyntax: glob";
351 471
352 assert_eq!( 472 assert_eq!(
353 parse_pattern_file_contents(lines, Path::new("file_path"), false) 473 parse_pattern_file_contents(lines, Path::new("file_path"), false)
474 .unwrap()
354 .0, 475 .0,
355 vec![] 476 vec![]
356 ); 477 );
357 let lines = b"glob:**.o"; 478 let lines = b"glob:**.o";
358 assert_eq!( 479 assert_eq!(
359 parse_pattern_file_contents(lines, Path::new("file_path"), false) 480 parse_pattern_file_contents(lines, Path::new("file_path"), false)
481 .unwrap()
360 .0, 482 .0,
361 vec![(b"relglob:**.o".to_vec(), 1, b"**.o".to_vec())] 483 vec![IgnorePattern::new(
484 PatternSyntax::RelGlob,
485 b"**.o",
486 Path::new("file_path")
487 )]
488 );
489 }
490
491 #[test]
492 fn test_build_single_regex() {
493 assert_eq!(
494 build_single_regex(&IgnorePattern::new(
495 PatternSyntax::RelGlob,
496 b"rust/target/",
497 Path::new("")
498 ))
499 .unwrap(),
500 br"(?:|.*/)rust/target(?:/|$)".to_vec(),
362 ); 501 );
363 } 502 }
364 503
365 #[test] 504 #[test]
366 fn test_build_single_regex_shortcut() { 505 fn test_build_single_regex_shortcut() {
367 assert_eq!( 506 assert_eq!(
368 br"(?:/|$)".to_vec(), 507 build_single_regex(&IgnorePattern::new(
369 build_single_regex(b"rootglob", b"", b"").unwrap() 508 PatternSyntax::RootGlob,
370 ); 509 b"",
371 assert_eq!( 510 Path::new("")
511 ))
512 .unwrap(),
513 br"\.(?:/|$)".to_vec(),
514 );
515 assert_eq!(
516 build_single_regex(&IgnorePattern::new(
517 PatternSyntax::RootGlob,
518 b"whatever",
519 Path::new("")
520 ))
521 .unwrap(),
372 br"whatever(?:/|$)".to_vec(), 522 br"whatever(?:/|$)".to_vec(),
373 build_single_regex(b"rootglob", b"whatever", b"").unwrap() 523 );
374 ); 524 assert_eq!(
375 assert_eq!( 525 build_single_regex(&IgnorePattern::new(
376 br"[^/]*\.o".to_vec(), 526 PatternSyntax::RootGlob,
377 build_single_regex(b"rootglob", b"*.o", b"").unwrap() 527 b"*.o",
378 ); 528 Path::new("")
379 } 529 ))
380 } 530 .unwrap(),
531 br"[^/]*\.o(?:/|$)".to_vec(),
532 );
533 }
534 }