fileset: combine union of basic patterns into single matcher
This appears to improve query performance in a big repository than I thought.
Writing less Python in a hot loop, faster computation we gain.
$ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*'
(orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000)
(new) time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)
--- a/mercurial/fileset.py Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/fileset.py Sat Jul 21 17:19:12 2018 +0900
@@ -50,6 +50,12 @@
return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds,
_("pattern must be a string")))
+def patternsmatch(mctx, *xs):
+ allkinds = matchmod.allpatternkinds
+ patterns = [getpattern(x, allkinds, _("pattern must be a string"))
+ for x in xs]
+ return mctx.matcher(patterns)
+
def andmatch(mctx, x, y):
xm = getmatch(mctx, x)
ym = getmatch(mctx, y)
@@ -436,6 +442,7 @@
'string': stringmatch,
'symbol': stringmatch,
'kindpat': kindpatmatch,
+ 'patterns': patternsmatch,
'and': andmatch,
'or': ormatch,
'minus': minusmatch,
--- a/mercurial/filesetlang.py Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/filesetlang.py Sat Jul 21 17:19:12 2018 +0900
@@ -185,6 +185,21 @@
return ('minus', ta, tb[1])
return (op, ta, tb)
+def _optimizeunion(xs):
+ # collect string patterns so they can be compiled into a single regexp
+ ws, ts, ss = [], [], []
+ for x in xs:
+ w, t = _optimize(x)
+ if t is not None and t[0] in {'string', 'symbol', 'kindpat'}:
+ ss.append(t)
+ continue
+ ws.append(w)
+ ts.append(t)
+ if ss:
+ ws.append(WEIGHT_CHECK_FILENAME)
+ ts.append(('patterns',) + tuple(ss))
+ return ws, ts
+
def _optimize(x):
if x is None:
return 0, x
@@ -206,7 +221,9 @@
else:
return wb, _optimizeandops(op, tb, ta)
if op == 'or':
- ws, ts = zip(*(_optimize(y) for y in x[1:]))
+ ws, ts = _optimizeunion(x[1:])
+ if len(ts) == 1:
+ return ws[0], ts[0] # 'or' operation is fully optimized out
ts = tuple(it[1] for it in sorted(enumerate(ts),
key=lambda it: ws[it[0]]))
return max(ws), (op,) + ts
--- a/mercurial/minifileset.py Sat Jul 21 17:13:34 2018 +0900
+++ b/mercurial/minifileset.py Sat Jul 21 17:19:12 2018 +0900
@@ -40,7 +40,7 @@
return f
raise error.ParseError(_("unsupported file pattern: %s") % name,
hint=_('paths must be prefixed with "path:"'))
- elif op == 'or':
+ elif op in {'or', 'patterns'}:
funcs = [_compile(x) for x in tree[1:]]
return lambda n, s: any(f(n, s) for f in funcs)
elif op == 'and':
--- a/tests/test-fileset.t Sat Jul 21 17:13:34 2018 +0900
+++ b/tests/test-fileset.t Sat Jul 21 17:19:12 2018 +0900
@@ -53,9 +53,7 @@
(symbol 'glob')
(symbol 'b?')))
* matcher:
- <unionmatcher matchers=[
- <patternmatcher patterns='(?:a1(?:/|$))'>,
- <patternmatcher patterns='(?:b.$)'>]>
+ <patternmatcher patterns='(?:a1(?:/|$)|b.$)'>
a1
b1
b2
@@ -182,8 +180,9 @@
None)))
* optimized:
(or
- (symbol 'a1')
- (symbol 'a2')
+ (patterns
+ (symbol 'a1')
+ (symbol 'a2'))
(and
(func
(symbol 'clean')
@@ -193,8 +192,7 @@
(string 'b'))))
* matcher:
<unionmatcher matchers=[
- <patternmatcher patterns='(?:a1$)'>,
- <patternmatcher patterns='(?:a2$)'>,
+ <patternmatcher patterns='(?:a1$|a2$)'>,
<intersectionmatcher
m1=<predicatenmatcher pred=clean>,
m2=<predicatenmatcher pred=grep('b')>>]>
@@ -203,13 +201,30 @@
b1
b2
+Union of basic patterns:
+
+ $ fileset -p optimized -s -r. 'a1 or a2 or path:b1'
+ * optimized:
+ (patterns
+ (symbol 'a1')
+ (symbol 'a2')
+ (kindpat
+ (symbol 'path')
+ (symbol 'b1')))
+ * matcher:
+ <patternmatcher patterns='(?:a1$|a2$|b1(?:/|$))'>
+ a1
+ a2
+ b1
+
OR expression should be reordered by weight:
$ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2'
* optimized:
(or
- (symbol 'a1')
- (symbol 'b2')
+ (patterns
+ (symbol 'a1')
+ (symbol 'b2'))
(func
(symbol 'grep')
(string 'a'))
@@ -218,8 +233,7 @@
(string 'b')))
* matcher:
<unionmatcher matchers=[
- <patternmatcher patterns='(?:a1$)'>,
- <patternmatcher patterns='(?:b2$)'>,
+ <patternmatcher patterns='(?:a1$|b2$)'>,
<predicatenmatcher pred=grep('a')>,
<predicatenmatcher pred=grep('b')>]>
a1