comparison mercurial/match.py @ 31013:693a5bb47854

match: making visitdir() deal with non-recursive entries Primarily as an optimization to avoid recursing into directories that will never have a match inside, this classifies each matcher pattern's root as recursive or non-recursive (erring on the side of keeping it recursive, which may lead to wasteful directory or manifest walks that yield no matches). I measured the performance of "rootfilesin" in two repos: - The Firefox repo with tree manifests, with "hg files -r . -I rootfilesin:browser". The browser directory contains about 3K files across 249 subdirectories. - A specific Google-internal directory which contains 75K files across 19K subdirectories, with "hg files -r . -I rootfilesin:REDACTED". I tested with both cold and warm disk caches. Cold cache was produced by running "sync; echo 3 > /proc/sys/vm/drop_caches". Warm cache was produced by re-running the same command a few times. These were the results: Cold cache Warm cache Before After Before After firefox 0m5.1s 0m2.18s 0m0.22s 0m0.14s google3 dir 2m3.9s 0m1.57s 0m8.12s 0m0.16s Certain extensions, notably narrowhg, can depend on this for correctness (not trying to recurse into directories for which it has no information).
author Rodrigo Damazio Bovendorp <rdamazio@google.com>
date Mon, 13 Feb 2017 17:03:14 -0800
parents 88358446da16
children 6168d4b93634
comparison
equal deleted inserted replaced
31012:88358446da16 31013:693a5bb47854
123 self._files = [] # exact files and roots of patterns 123 self._files = [] # exact files and roots of patterns
124 self._anypats = bool(include or exclude) 124 self._anypats = bool(include or exclude)
125 self._always = False 125 self._always = False
126 self._pathrestricted = bool(include or exclude or patterns) 126 self._pathrestricted = bool(include or exclude or patterns)
127 self._warn = warn 127 self._warn = warn
128
129 # roots are directories which are recursively included/excluded.
128 self._includeroots = set() 130 self._includeroots = set()
131 self._excluderoots = set()
132 # dirs are directories which are non-recursively included.
129 self._includedirs = set(['.']) 133 self._includedirs = set(['.'])
130 self._excluderoots = set()
131 134
132 if badfn is not None: 135 if badfn is not None:
133 self.bad = badfn 136 self.bad = badfn
134 137
135 matchfns = [] 138 matchfns = []
136 if include: 139 if include:
137 kindpats = self._normalize(include, 'glob', root, cwd, auditor) 140 kindpats = self._normalize(include, 'glob', root, cwd, auditor)
138 self.includepat, im = _buildmatch(ctx, kindpats, '(?:/|$)', 141 self.includepat, im = _buildmatch(ctx, kindpats, '(?:/|$)',
139 listsubrepos, root) 142 listsubrepos, root)
140 self._includeroots.update(_roots(kindpats)) 143 roots, dirs = _rootsanddirs(kindpats)
141 self._includedirs.update(util.dirs(self._includeroots)) 144 self._includeroots.update(roots)
145 self._includedirs.update(dirs)
142 matchfns.append(im) 146 matchfns.append(im)
143 if exclude: 147 if exclude:
144 kindpats = self._normalize(exclude, 'glob', root, cwd, auditor) 148 kindpats = self._normalize(exclude, 'glob', root, cwd, auditor)
145 self.excludepat, em = _buildmatch(ctx, kindpats, '(?:/|$)', 149 self.excludepat, em = _buildmatch(ctx, kindpats, '(?:/|$)',
146 listsubrepos, root) 150 listsubrepos, root)
147 if not _anypats(kindpats): 151 if not _anypats(kindpats):
152 # Only consider recursive excludes as such - if a non-recursive
153 # exclude is used, we must still recurse into the excluded
154 # directory, at least to find subdirectories. In such a case,
155 # the regex still won't match the non-recursively-excluded
156 # files.
148 self._excluderoots.update(_roots(kindpats)) 157 self._excluderoots.update(_roots(kindpats))
149 matchfns.append(lambda f: not em(f)) 158 matchfns.append(lambda f: not em(f))
150 if exact: 159 if exact:
151 if isinstance(patterns, list): 160 if isinstance(patterns, list):
152 self._files = patterns 161 self._files = patterns
239 ''' 248 '''
240 if self.prefix() and dir in self._fileroots: 249 if self.prefix() and dir in self._fileroots:
241 return 'all' 250 return 'all'
242 if dir in self._excluderoots: 251 if dir in self._excluderoots:
243 return False 252 return False
244 if (self._includeroots and 253 if ((self._includeroots or self._includedirs != set(['.'])) and
245 '.' not in self._includeroots and 254 '.' not in self._includeroots and
246 dir not in self._includeroots and 255 dir not in self._includeroots and
247 dir not in self._includedirs and 256 dir not in self._includedirs and
248 not any(parent in self._includeroots 257 not any(parent in self._includeroots
249 for parent in util.finddirs(dir))): 258 for parent in util.finddirs(dir))):
420 ctx=ctx, listsubrepos=listsubrepos, badfn=badfn) 429 ctx=ctx, listsubrepos=listsubrepos, badfn=badfn)
421 430
422 # m.exact(file) must be based off of the actual user input, otherwise 431 # m.exact(file) must be based off of the actual user input, otherwise
423 # inexact case matches are treated as exact, and not noted without -v. 432 # inexact case matches are treated as exact, and not noted without -v.
424 if self._files: 433 if self._files:
425 self._fileroots = set(_roots(self._kp)) 434 roots, dirs = _rootsanddirs(self._kp)
435 self._fileroots = set(roots)
436 self._fileroots.update(dirs)
426 437
427 def _normalize(self, patterns, default, root, cwd, auditor): 438 def _normalize(self, patterns, default, root, cwd, auditor):
428 self._kp = super(icasefsmatcher, self)._normalize(patterns, default, 439 self._kp = super(icasefsmatcher, self)._normalize(patterns, default,
429 root, cwd, auditor) 440 root, cwd, auditor)
430 kindpats = [] 441 kindpats = []
619 (s, k, p)) 630 (s, k, p))
620 else: 631 else:
621 raise error.Abort(_("invalid pattern (%s): %s") % (k, p)) 632 raise error.Abort(_("invalid pattern (%s): %s") % (k, p))
622 raise error.Abort(_("invalid pattern")) 633 raise error.Abort(_("invalid pattern"))
623 634
624 def _roots(kindpats): 635 def _patternrootsanddirs(kindpats):
625 '''return roots and exact explicitly listed files from patterns 636 '''Returns roots and directories corresponding to each pattern.
626 637
627 >>> _roots([('glob', 'g/*', ''), ('glob', 'g', ''), ('glob', 'g*', '')]) 638 This calculates the roots and directories exactly matching the patterns and
628 ['g', 'g', '.'] 639 returns a tuple of (roots, dirs) for each. It does not return other
629 >>> _roots([('rootfilesin', 'g', ''), ('rootfilesin', '', '')]) 640 directories which may also need to be considered, like the parent
630 ['g', '.'] 641 directories.
631 >>> _roots([('relpath', 'r', ''), ('path', 'p/p', ''), ('path', '', '')])
632 ['r', 'p/p', '.']
633 >>> _roots([('relglob', 'rg*', ''), ('re', 're/', ''), ('relre', 'rr', '')])
634 ['.', '.', '.']
635 ''' 642 '''
636 r = [] 643 r = []
644 d = []
637 for kind, pat, source in kindpats: 645 for kind, pat, source in kindpats:
638 if kind == 'glob': # find the non-glob prefix 646 if kind == 'glob': # find the non-glob prefix
639 root = [] 647 root = []
640 for p in pat.split('/'): 648 for p in pat.split('/'):
641 if '[' in p or '{' in p or '*' in p or '?' in p: 649 if '[' in p or '{' in p or '*' in p or '?' in p:
642 break 650 break
643 root.append(p) 651 root.append(p)
644 r.append('/'.join(root) or '.') 652 r.append('/'.join(root) or '.')
645 elif kind in ('relpath', 'path', 'rootfilesin'): 653 elif kind in ('relpath', 'path'):
646 r.append(pat or '.') 654 r.append(pat or '.')
655 elif kind in ('rootfilesin',):
656 d.append(pat or '.')
647 else: # relglob, re, relre 657 else: # relglob, re, relre
648 r.append('.') 658 r.append('.')
649 return r 659 return r, d
660
661 def _roots(kindpats):
662 '''Returns root directories to match recursively from the given patterns.'''
663 roots, dirs = _patternrootsanddirs(kindpats)
664 return roots
665
666 def _rootsanddirs(kindpats):
667 '''Returns roots and exact directories from patterns.
668
669 roots are directories to match recursively, whereas exact directories should
670 be matched non-recursively. The returned (roots, dirs) tuple will also
671 include directories that need to be implicitly considered as either, such as
672 parent directories.
673
674 >>> _rootsanddirs(\
675 [('glob', 'g/h/*', ''), ('glob', 'g/h', ''), ('glob', 'g*', '')])
676 (['g/h', 'g/h', '.'], ['g'])
677 >>> _rootsanddirs(\
678 [('rootfilesin', 'g/h', ''), ('rootfilesin', '', '')])
679 ([], ['g/h', '.', 'g'])
680 >>> _rootsanddirs(\
681 [('relpath', 'r', ''), ('path', 'p/p', ''), ('path', '', '')])
682 (['r', 'p/p', '.'], ['p'])
683 >>> _rootsanddirs(\
684 [('relglob', 'rg*', ''), ('re', 're/', ''), ('relre', 'rr', '')])
685 (['.', '.', '.'], [])
686 '''
687 r, d = _patternrootsanddirs(kindpats)
688
689 # Append the parents as non-recursive/exact directories, since they must be
690 # scanned to get to either the roots or the other exact directories.
691 d.extend(util.dirs(d))
692 d.extend(util.dirs(r))
693
694 return r, d
650 695
651 def _explicitfiles(kindpats): 696 def _explicitfiles(kindpats):
652 '''Returns the potential explicit filenames from the patterns. 697 '''Returns the potential explicit filenames from the patterns.
653 698
654 >>> _explicitfiles([('path', 'foo/bar', '')]) 699 >>> _explicitfiles([('path', 'foo/bar', '')])