matcher: use re2 bindings if available
author Bryan O'Sullivan <bryano@fb.com>
Fri, 01 Jun 2012 15:26:20 -0700
changeset 16943 8d08a28aa63e
parent 16942 87882c8753d4
child 16944 5d3d77b3c512
matcher: use re2 bindings if available. There are two sets of Python re2 bindings available on the internet; this code works with both. Using re2 can greatly improve "hg status" performance when a .hgignore file becomes even modestly complex. Example: "hg status" on a clean tree with 134K files, where "hg debugignore" reports a regexp 4256 bytes in size. Timings: no .hgignore: 1.76 sec; Python re: 2.79 sec; re2: 1.82 sec. The overhead of regexp matching drops from 1.03 seconds with stock re to 0.06 seconds with re2. (For comparison, a git repo with the same contents and .gitignore file runs "git status -s" in 1.71 seconds, i.e. only slightly faster than hg with re2.)
mercurial/match.py
mercurial/util.py
--- a/mercurial/match.py	Thu Jun 07 01:42:50 2012 +0200
+++ b/mercurial/match.py	Fri Jun 01 15:26:20 2012 -0700
@@ -9,6 +9,14 @@
 import scmutil, util, fileset
 from i18n import _
 
+def _rematcher(pat):  # compile pat and return its bound match callable
+    m = util.compilere(pat)
+    try:
+        # slightly faster, provided by facebook's re2 bindings
+        return m.test_match
+    except AttributeError:  # compiled object has no test_match attribute
+        return m.match
+
 def _expandsets(pats, ctx):
     '''convert set: patterns into a list of files in the given context'''
     fset = set()
@@ -280,7 +288,7 @@
         pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
         if len(pat) > 20000:
             raise OverflowError
-        return pat, re.compile(pat).match
+        return pat, _rematcher(pat)
     except OverflowError:
         # We're using a Python with a tiny regex engine and we
         # made it explode, so we'll divide the pattern list in two
@@ -294,7 +302,7 @@
     except re.error:
         for k, p in pats:
             try:
-                re.compile('(?:%s)' % _regex(k, p, tail))
+                _rematcher('(?:%s)' % _regex(k, p, tail))
             except re.error:
                 raise util.Abort(_("invalid pattern (%s): %s") % (k, p))
         raise util.Abort(_("invalid pattern"))
--- a/mercurial/util.py	Thu Jun 07 01:42:50 2012 +0200
+++ b/mercurial/util.py	Fri Jun 01 15:26:20 2012 -0700
@@ -629,6 +629,30 @@
     except OSError:
         return True
 
+try:
+    import re2
+    _re2 = None  # None: not yet probed; compilere() sets True or False
+except ImportError:
+    _re2 = False  # re2 could not be imported; compilere() uses re only
+
+def compilere(pat):
+    '''Compile a regular expression, using re2 if possible
+
+    For best performance, use only re2-compatible regexp features.'''
+    global _re2
+    if _re2 is None:  # import statement succeeded, usability not yet checked
+        try:
+            re2.compile  # probe only; the looked-up value is discarded
+            _re2 = True
+        except ImportError:  # NOTE(review): deferred import? confirm
+            _re2 = False
+    if _re2:
+        try:
+            return re2.compile(pat)
+        except re2.error:  # re2 rejected pat; fall through to stock re
+            pass
+    return re.compile(pat)
+
 _fspathcache = {}
 def fspath(name, root):
     '''Get name in the case stored in the filesystem