matcher: use re2 bindings if available
There are two sets of Python re2 bindings available on the internet;
this code works with both.
Using re2 can greatly improve "hg status" performance when a .hgignore
file becomes even modestly complex.
Example: "hg status" on a clean tree with 134K files, where "hg
debugignore" reports a regexp 4256 bytes in size.
no .hgignore: 1.76 sec
Python re: 2.79 sec
re2: 1.82 sec
The overhead of regexp matching (time above the no-.hgignore baseline)
drops from 1.03 seconds with stock re to 0.06 seconds with re2.
(For comparison, a git repo with the same contents and .gitignore
file runs "git status -s" in 1.71 seconds, i.e. only slightly faster
than hg with re2.)
--- a/mercurial/match.py Thu Jun 07 01:42:50 2012 +0200
+++ b/mercurial/match.py Fri Jun 01 15:26:20 2012 -0700
@@ -9,6 +9,14 @@
import scmutil, util, fileset
from i18n import _
+def _rematcher(pat):
+ '''Compile pat via util.compilere and return its matching method
+
+ Returns the compiled object's test_match method when it has one,
+ otherwise its match method.'''
+ m = util.compilere(pat)
+ try:
+ # slightly faster, provided by facebook's re2 bindings
+ return m.test_match
+ except AttributeError:
+ # the compiled object has no test_match attribute; use match
+ return m.match
+
def _expandsets(pats, ctx):
'''convert set: patterns into a list of files in the given context'''
fset = set()
@@ -280,7 +288,7 @@
pat = '(?:%s)' % '|'.join([_regex(k, p, tail) for (k, p) in pats])
if len(pat) > 20000:
raise OverflowError
- return pat, re.compile(pat).match
+ return pat, _rematcher(pat)
except OverflowError:
# We're using a Python with a tiny regex engine and we
# made it explode, so we'll divide the pattern list in two
@@ -294,7 +302,7 @@
except re.error:
for k, p in pats:
try:
- re.compile('(?:%s)' % _regex(k, p, tail))
+ _rematcher('(?:%s)' % _regex(k, p, tail))
except re.error:
raise util.Abort(_("invalid pattern (%s): %s") % (k, p))
raise util.Abort(_("invalid pattern"))
--- a/mercurial/util.py Thu Jun 07 01:42:50 2012 +0200
+++ b/mercurial/util.py Fri Jun 01 15:26:20 2012 -0700
@@ -629,6 +629,30 @@
except OSError:
return True
+# _re2 records whether re2 can be used by compilere:
+# None - "import re2" succeeded, usability not yet checked
+# False - re2 is not available
+# True - re2.compile was accessed successfully (set in compilere)
+try:
+ import re2
+ _re2 = None
+except ImportError:
+ _re2 = False
+
+def compilere(pat):
+ '''Compile a regular expression, using re2 if possible
+
+ For best performance, use only re2-compatible regexp features.
+
+ Returns re2.compile(pat) when re2 is usable and accepts the
+ pattern; otherwise returns re.compile(pat). Errors raised by
+ re.compile are not caught here.'''
+ global _re2
+ if _re2 is None:
+ # only reached while the module-level flag is still undecided;
+ # this branch settles it to True or False
+ try:
+ # bare attribute access used only as a probe
+ # NOTE(review): catching ImportError here suggests re2 may be
+ # imported lazily, so the real import happens on this access -
+ # confirm
+ re2.compile
+ _re2 = True
+ except ImportError:
+ _re2 = False
+ if _re2:
+ try:
+ return re2.compile(pat)
+ except re2.error:
+ # re2 rejected the pattern; fall through to the stock re module
+ pass
+ return re.compile(pat)
+
_fspathcache = {}
def fspath(name, root):
'''Get name in the case stored in the filesystem