Mercurial > hg
changeset 38865:899b4c74209c
fileset: combine union of basic patterns into single matcher
This appears to improve query performance in a big repository than I thought.
Writing less Python in a hot loop, faster computation we gain.
$ hg files --cwd mozilla-central --time 'set:a* + b* + c* + d* + e*'
(orig) time: real 0.670 secs (user 0.640+0.000 sys 0.030+0.000)
(new) time: real 0.210 secs (user 0.180+0.000 sys 0.020+0.000)
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sat, 21 Jul 2018 17:19:12 +0900 |
parents | 73731fa8d1bd |
children | aa33988ad8ab |
files | mercurial/fileset.py mercurial/filesetlang.py mercurial/minifileset.py tests/test-fileset.t |
diffstat | 4 files changed, 51 insertions(+), 13 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/fileset.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/fileset.py Sat Jul 21 17:19:12 2018 +0900 @@ -50,6 +50,12 @@ return stringmatch(mctx, _getkindpat(x, y, matchmod.allpatternkinds, _("pattern must be a string"))) +def patternsmatch(mctx, *xs): + allkinds = matchmod.allpatternkinds + patterns = [getpattern(x, allkinds, _("pattern must be a string")) + for x in xs] + return mctx.matcher(patterns) + def andmatch(mctx, x, y): xm = getmatch(mctx, x) ym = getmatch(mctx, y) @@ -436,6 +442,7 @@ 'string': stringmatch, 'symbol': stringmatch, 'kindpat': kindpatmatch, + 'patterns': patternsmatch, 'and': andmatch, 'or': ormatch, 'minus': minusmatch,
--- a/mercurial/filesetlang.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/filesetlang.py Sat Jul 21 17:19:12 2018 +0900 @@ -185,6 +185,21 @@ return ('minus', ta, tb[1]) return (op, ta, tb) +def _optimizeunion(xs): + # collect string patterns so they can be compiled into a single regexp + ws, ts, ss = [], [], [] + for x in xs: + w, t = _optimize(x) + if t is not None and t[0] in {'string', 'symbol', 'kindpat'}: + ss.append(t) + continue + ws.append(w) + ts.append(t) + if ss: + ws.append(WEIGHT_CHECK_FILENAME) + ts.append(('patterns',) + tuple(ss)) + return ws, ts + def _optimize(x): if x is None: return 0, x @@ -206,7 +221,9 @@ else: return wb, _optimizeandops(op, tb, ta) if op == 'or': - ws, ts = zip(*(_optimize(y) for y in x[1:])) + ws, ts = _optimizeunion(x[1:]) + if len(ts) == 1: + return ws[0], ts[0] # 'or' operation is fully optimized out ts = tuple(it[1] for it in sorted(enumerate(ts), key=lambda it: ws[it[0]])) return max(ws), (op,) + ts
--- a/mercurial/minifileset.py Sat Jul 21 17:13:34 2018 +0900 +++ b/mercurial/minifileset.py Sat Jul 21 17:19:12 2018 +0900 @@ -40,7 +40,7 @@ return f raise error.ParseError(_("unsupported file pattern: %s") % name, hint=_('paths must be prefixed with "path:"')) - elif op == 'or': + elif op in {'or', 'patterns'}: funcs = [_compile(x) for x in tree[1:]] return lambda n, s: any(f(n, s) for f in funcs) elif op == 'and':
--- a/tests/test-fileset.t Sat Jul 21 17:13:34 2018 +0900 +++ b/tests/test-fileset.t Sat Jul 21 17:19:12 2018 +0900 @@ -53,9 +53,7 @@ (symbol 'glob') (symbol 'b?'))) * matcher: - <unionmatcher matchers=[ - <patternmatcher patterns='(?:a1(?:/|$))'>, - <patternmatcher patterns='(?:b.$)'>]> + <patternmatcher patterns='(?:a1(?:/|$)|b.$)'> a1 b1 b2 @@ -182,8 +180,9 @@ None))) * optimized: (or - (symbol 'a1') - (symbol 'a2') + (patterns + (symbol 'a1') + (symbol 'a2')) (and (func (symbol 'clean') @@ -193,8 +192,7 @@ (string 'b')))) * matcher: <unionmatcher matchers=[ - <patternmatcher patterns='(?:a1$)'>, - <patternmatcher patterns='(?:a2$)'>, + <patternmatcher patterns='(?:a1$|a2$)'>, <intersectionmatcher m1=<predicatenmatcher pred=clean>, m2=<predicatenmatcher pred=grep('b')>>]> @@ -203,13 +201,30 @@ b1 b2 +Union of basic patterns: + + $ fileset -p optimized -s -r. 'a1 or a2 or path:b1' + * optimized: + (patterns + (symbol 'a1') + (symbol 'a2') + (kindpat + (symbol 'path') + (symbol 'b1'))) + * matcher: + <patternmatcher patterns='(?:a1$|a2$|b1(?:/|$))'> + a1 + a2 + b1 + OR expression should be reordered by weight: $ fileset -p optimized -s -r. 'grep("a") or a1 or grep("b") or b2' * optimized: (or - (symbol 'a1') - (symbol 'b2') + (patterns + (symbol 'a1') + (symbol 'b2')) (func (symbol 'grep') (string 'a')) @@ -218,8 +233,7 @@ (string 'b'))) * matcher: <unionmatcher matchers=[ - <patternmatcher patterns='(?:a1$)'>, - <patternmatcher patterns='(?:b2$)'>, + <patternmatcher patterns='(?:a1$|b2$)'>, <predicatenmatcher pred=grep('a')>, <predicatenmatcher pred=grep('b')>]> a1