# HG changeset patch # User Yuya Nishihara # Date 1532161152 -32400 # Node ID 899b4c74209c2a0c4786a1ce5beb1fdf0cc98801 # Parent 73731fa8d1bdeec5a0caa1662b56db851ad70a60 fileset: combine union of basic patterns into single matcher This appears to improve query performance in a big repository than I thought. Writing less Python in a hot loop, faster computation we gain. $ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*' (orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000) (new) time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000) diff -r 73731fa8d1bd -r 899b4c74209c mercurial/fileset.py --- a/mercurial/fileset.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/fileset.py Sat Jul 21 17:19:12 2018 +0900 @@ -50,6 +50,12 @@ return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds, _("pattern must be a string"))) +def patternsmatch(mctx, *xs): + allkinds = matchmod.allpatternkinds + patterns = [getpattern(x, allkinds, _("pattern must be a string")) + for x in xs] + return mctx.matcher(patterns) + def andmatch(mctx, x, y): xm = getmatch(mctx, x) ym = getmatch(mctx, y) @@ -436,6 +442,7 @@ 'string': stringmatch, 'symbol': stringmatch, 'kindpat': kindpatmatch, + 'patterns': patternsmatch, 'and': andmatch, 'or': ormatch, 'minus': minusmatch, diff -r 73731fa8d1bd -r 899b4c74209c mercurial/filesetlang.py --- a/mercurial/filesetlang.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/filesetlang.py Sat Jul 21 17:19:12 2018 +0900 @@ -185,6 +185,21 @@ return ('minus', ta, tb[1]) return (op, ta, tb) +def _optimizeunion(xs): + # collect string patterns so they can be compiled into a single regexp + ws, ts, ss = [], [], [] + for x in xs: + w, t = _optimize(x) + if t is not None and t[0] in {'string', 'symbol', 'kindpat'}: + ss.append(t) + continue + ws.append(w) + ts.append(t) + if ss: + ws.append(WEIGHT_CHECK_FILENAME) + ts.append(('patterns',) + tuple(ss)) + return ws, ts + def _optimize(x): if x is None: return 0, x @@ -206,7 +221,9 @@ else: return wb, _optimizeandops(op, tb, ta) if op == 'or': - ws, ts = zip(*(_optimize(y) for y in x[1:])) + ws, ts = _optimizeunion(x[1:]) + if len(ts) == 1: + return ws[0], ts[0] # 'or' operation is fully optimized out ts = tuple(it[1] for it in sorted(enumerate(ts), key=lambda it: ws[it[0]])) return max(ws), (op,) + ts diff -r 73731fa8d1bd -r 899b4c74209c mercurial/minifileset.py --- a/mercurial/minifileset.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/minifileset.py Sat Jul 21 17:19:12 2018 +0900 @@ -40,7 +40,7 @@ return f raise error.ParseError(_("unsupported file pattern: %s") % name, hint=_('paths must be prefixed with "path:"')) - elif op == 'or': + elif op in {'or', 'patterns'}: funcs = [_compile(x) for x in tree[1:]] return lambda n, s: any(f(n, s) for f in funcs) elif op == 'and': diff -r 73731fa8d1bd -r 899b4c74209c tests/test-fileset.t --- a/tests/test-fileset.t Sat Jul 21 17:13:34 2018 +0900 +++ b/tests/test-fileset.t Sat Jul 21 17:19:12 2018 +0900 @@ -53,9 +53,7 @@ (symbol 'glob') (symbol 'b?'))) * matcher: - , - ]> + a1 b1 b2 @@ -182,8 +180,9 @@ None))) * optimized: (or - (symbol 'a1') - (symbol 'a2') + (patterns + (symbol 'a1') + (symbol 'a2')) (and (func (symbol 'clean') @@ -193,8 +192,7 @@ (string 'b')))) * matcher: , - , + , , m2=>]> @@ -203,13 +201,30 @@ b1 b2 +Union of basic patterns: + + $ fileset -p optimized -s -r. 'a1 or a2 or path:b1' + * optimized: + (patterns + (symbol 'a1') + (symbol 'a2') + (kindpat + (symbol 'path') + (symbol 'b1'))) + * matcher: + + a1 + a2 + b1 + OR expression should be reordered by weight: $ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2' * optimized: (or - (symbol 'a1') - (symbol 'b2') + (patterns + (symbol 'a1') + (symbol 'b2')) (func (symbol 'grep') (string 'a')) @@ -218,8 +233,7 @@ (string 'b'))) * matcher: , - , + , , ]> a1