match: making visitdir() deal with non-recursive entries
authorRodrigo Damazio Bovendorp <rdamazio@google.com>
Mon, 13 Feb 2017 17:03:14 -0800
changeset 31013 693a5bb47854
parent 31012 88358446da16
child 31014 219629d42786
match: making visitdir() deal with non-recursive entries Primarily as an optimization to avoid recursing into directories that will never have a match inside, this classifies each matcher pattern's root as recursive or non-recursive (erring on the side of keeping it recursive, which may lead to wasteful directory or manifest walks that yield no matches). I measured the performance of "rootfilesin" in two repos: - The Firefox repo with tree manifests, with "hg files -r . -I rootfilesin:browser". The browser directory contains about 3K files across 249 subdirectories. - A specific Google-internal directory which contains 75K files across 19K subdirectories, with "hg files -r . -I rootfilesin:REDACTED". I tested with both cold and warm disk caches. Cold cache was produced by running "sync; echo 3 > /proc/sys/vm/drop_caches". Warm cache was produced by re-running the same command a few times. These were the results: Cold cache Warm cache Before After Before After firefox 0m5.1s 0m2.18s 0m0.22s 0m0.14s google3 dir 2m3.9s 0m1.57s 0m8.12s 0m0.16s Certain extensions, notably narrowhg, can depend on this for correctness (not trying to recurse into directories for which it has no information).
mercurial/match.py
--- a/mercurial/match.py	Mon Feb 13 15:39:29 2017 -0800
+++ b/mercurial/match.py	Mon Feb 13 17:03:14 2017 -0800
@@ -125,9 +125,12 @@
         self._always = False
         self._pathrestricted = bool(include or exclude or patterns)
         self._warn = warn
+
+        # roots are directories which are recursively included/excluded.
         self._includeroots = set()
+        self._excluderoots = set()
+        # dirs are directories which are non-recursively included.
         self._includedirs = set(['.'])
-        self._excluderoots = set()
 
         if badfn is not None:
             self.bad = badfn
@@ -137,14 +140,20 @@
             kindpats = self._normalize(include, 'glob', root, cwd, auditor)
             self.includepat, im = _buildmatch(ctx, kindpats, '(?:/|$)',
                                               listsubrepos, root)
-            self._includeroots.update(_roots(kindpats))
-            self._includedirs.update(util.dirs(self._includeroots))
+            roots, dirs = _rootsanddirs(kindpats)
+            self._includeroots.update(roots)
+            self._includedirs.update(dirs)
             matchfns.append(im)
         if exclude:
             kindpats = self._normalize(exclude, 'glob', root, cwd, auditor)
             self.excludepat, em = _buildmatch(ctx, kindpats, '(?:/|$)',
                                               listsubrepos, root)
             if not _anypats(kindpats):
+                # Only consider recursive excludes as such - if a non-recursive
+                # exclude is used, we must still recurse into the excluded
+                # directory, at least to find subdirectories. In such a case,
+                # the regex still won't match the non-recursively-excluded
+                # files.
                 self._excluderoots.update(_roots(kindpats))
             matchfns.append(lambda f: not em(f))
         if exact:
@@ -241,7 +250,7 @@
             return 'all'
         if dir in self._excluderoots:
             return False
-        if (self._includeroots and
+        if ((self._includeroots or self._includedirs != set(['.'])) and
             '.' not in self._includeroots and
             dir not in self._includeroots and
             dir not in self._includedirs and
@@ -422,7 +431,9 @@
         # m.exact(file) must be based off of the actual user input, otherwise
         # inexact case matches are treated as exact, and not noted without -v.
         if self._files:
-            self._fileroots = set(_roots(self._kp))
+            roots, dirs = _rootsanddirs(self._kp)
+            self._fileroots = set(roots)
+            self._fileroots.update(dirs)
 
     def _normalize(self, patterns, default, root, cwd, auditor):
         self._kp = super(icasefsmatcher, self)._normalize(patterns, default,
@@ -621,19 +632,16 @@
                     raise error.Abort(_("invalid pattern (%s): %s") % (k, p))
         raise error.Abort(_("invalid pattern"))
 
-def _roots(kindpats):
-    '''return roots and exact explicitly listed files from patterns
+def _patternrootsanddirs(kindpats):
+    '''Returns roots and directories corresponding to each pattern.
 
-    >>> _roots([('glob', 'g/*', ''), ('glob', 'g', ''), ('glob', 'g*', '')])
-    ['g', 'g', '.']
-    >>> _roots([('rootfilesin', 'g', ''), ('rootfilesin', '', '')])
-    ['g', '.']
-    >>> _roots([('relpath', 'r', ''), ('path', 'p/p', ''), ('path', '', '')])
-    ['r', 'p/p', '.']
-    >>> _roots([('relglob', 'rg*', ''), ('re', 're/', ''), ('relre', 'rr', '')])
-    ['.', '.', '.']
+    This calculates the roots and directories exactly matching the patterns and
+    returns a tuple of (roots, dirs) for each. It does not return other
+    directories which may also need to be considered, like the parent
+    directories.
     '''
     r = []
+    d = []
     for kind, pat, source in kindpats:
         if kind == 'glob': # find the non-glob prefix
             root = []
@@ -642,11 +650,48 @@
                     break
                 root.append(p)
             r.append('/'.join(root) or '.')
-        elif kind in ('relpath', 'path', 'rootfilesin'):
+        elif kind in ('relpath', 'path'):
             r.append(pat or '.')
+        elif kind in ('rootfilesin',):
+            d.append(pat or '.')
         else: # relglob, re, relre
             r.append('.')
-    return r
+    return r, d
+
+def _roots(kindpats):
+    '''Returns root directories to match recursively from the given patterns.'''
+    roots, dirs = _patternrootsanddirs(kindpats)
+    return roots
+
+def _rootsanddirs(kindpats):
+    '''Returns roots and exact directories from patterns.
+
+    roots are directories to match recursively, whereas exact directories should
+    be matched non-recursively. The returned (roots, dirs) tuple will also
+    include directories that need to be implicitly considered as either, such as
+    parent directories.
+
+    >>> _rootsanddirs(\
+        [('glob', 'g/h/*', ''), ('glob', 'g/h', ''), ('glob', 'g*', '')])
+    (['g/h', 'g/h', '.'], ['g'])
+    >>> _rootsanddirs(\
+        [('rootfilesin', 'g/h', ''), ('rootfilesin', '', '')])
+    ([], ['g/h', '.', 'g'])
+    >>> _rootsanddirs(\
+        [('relpath', 'r', ''), ('path', 'p/p', ''), ('path', '', '')])
+    (['r', 'p/p', '.'], ['p'])
+    >>> _rootsanddirs(\
+        [('relglob', 'rg*', ''), ('re', 're/', ''), ('relre', 'rr', '')])
+    (['.', '.', '.'], [])
+    '''
+    r, d = _patternrootsanddirs(kindpats)
+
+    # Append the parents as non-recursive/exact directories, since they must be
+    # scanned to get to either the roots or the other exact directories.
+    d.extend(util.dirs(d))
+    d.extend(util.dirs(r))
+
+    return r, d
 
 def _explicitfiles(kindpats):
     '''Returns the potential explicit filenames from the patterns.