match: skip walking up the directory hierarchy if the number of pats are small
authorKyle Lippincott <spectral@google.com>
Mon, 30 Nov 2020 12:30:58 -0800
changeset 45999 c4c7a6b61146
parent 45998 cb623dedb17c
child 46000 c1bb02738f96
match: skip walking up the directory hierarchy if the number of pats are small Previously, we would receive a path like abc/def/ghi and "walk up" the directory hierarchy, checking abc/def, abc, and `b''` to see if they were in the set of prefixes that this matcher covered. We did this indiscriminately - we generated all of these paths even if the set of prefixes the matcher covered was completely empty, which is the case for a lot of repos at my company (the narrow matcher we use is usually non-recursive). This brings the time for a rebase in one of my repos from 12.20s to 10.87s. In this particular repo, this is entirely due to the `len(prefix_set) == 0` check, as I do not have any recursive patterns in the narrowspec. Differential Revision: https://phab.mercurial-scm.org/D9488
mercurial/match.py
--- a/mercurial/match.py	Tue Dec 01 22:19:01 2020 +0100
+++ b/mercurial/match.py	Mon Nov 30 12:30:58 2020 -0800
@@ -563,6 +563,36 @@
         return b'<predicatenmatcher pred=%s>' % s
 
 
+def path_or_parents_in_set(path, prefix_set):
+    """Returns True if `path` (or any parent of `path`) is in `prefix_set`."""
+    l = len(prefix_set)
+    if l == 0:
+        return False
+    if path in prefix_set:
+        return True
+    # If there's more than 5 paths in prefix_set, it's *probably* quicker to
+    # "walk up" the directory hierarchy instead, with the assumption that most
+    # directory hierarchies are relatively shallow and hash lookup is cheap.
+    if l > 5:
+        return any(
+                parentdir in prefix_set for parentdir in pathutil.finddirs(path)
+        )
+
+    # FIXME: Ideally we'd never get to this point if this is the case - we'd
+    # recognize ourselves as an 'always' matcher and skip this.
+    if b'' in prefix_set:
+        return True
+
+    if pycompat.ispy3:
+        sl = ord(b'/')
+    else:
+        sl = '/'
+
+    # We already checked that path isn't in prefix_set exactly, so
+    # `path[len(pf)] should never raise IndexError.
+    return any(path.startswith(pf) and path[len(pf)] == sl for pf in prefix_set)
+
+
 class patternmatcher(basematcher):
     r"""Matches a set of (kind, pat, source) against a 'root' directory.
 
@@ -611,12 +641,8 @@
         if self._prefix and dir in self._fileset:
             return b'all'
         return (
-            dir in self._fileset
-            or dir in self._dirs
-            or any(
-                parentdir in self._fileset
-                for parentdir in pathutil.finddirs(dir)
-            )
+            dir in self._dirs
+            or path_or_parents_in_set(dir, self._fileset)
         )
 
     def visitchildrenset(self, dir):
@@ -698,12 +724,9 @@
         if self._prefix and dir in self._roots:
             return b'all'
         return (
-            dir in self._roots
-            or dir in self._dirs
+            dir in self._dirs
             or dir in self._parents
-            or any(
-                parentdir in self._roots for parentdir in pathutil.finddirs(dir)
-            )
+            or path_or_parents_in_set(dir, self._roots)
         )
 
     @propertycache
@@ -726,11 +749,8 @@
         # visitdir, that's handled below.
         if (
             b'' in self._roots
-            or dir in self._roots
             or dir in self._dirs
-            or any(
-                parentdir in self._roots for parentdir in pathutil.finddirs(dir)
-            )
+            or path_or_parents_in_set(dir, self._roots)
         ):
             return b'this'