Mercurial > hg-stable
changeset 39477:35ecaa999a12
match: improve includematcher.visitchildrenset to be much faster and cached
This improves the speed of visitchildrenset considerably, especially when there
are complicated matchers involved that may have many entries in _dirs or
_parents.
Unfortunately the benchmark isn't easily upstreamed due to its reliance on
https://github.com/vstinner/perf (primarily due to the conflict when importing
it if I were to contribute the benchmark as contrib/matcherbenchmarks.py)
instead of asv or some other perf measurement system.
To describe the benchmark briefly: I generated an includematcher of either 5 or
3500 "rootfilesin:prefix1/prefix2/prefix3/<randomsubdirs, 1-8 levels deep>"
items in the 'setup' function, and then called
`im.visitchildrenset('prefix1/prefix2')` in the 'stmt' function in perf.timeit.
For the set of 5:
- before: 15.3 us +- 2.9 us
- after: 1.59 us +- 0.02 us
For the set of 3500:
- before: 3.90 ms +- 0.10 ms
- after: 3.15 us +- 0.09 us (note the m->u change)
Differential Revision: https://phab.mercurial-scm.org/D4351
author | Kyle Lippincott <spectral@google.com> |
---|---|
date | Fri, 17 Aug 2018 19:18:53 -0700 |
parents | bc3b99d5627e |
children | 7df9ae38c75c |
files | mercurial/match.py |
diffstat | 1 files changed, 58 insertions(+), 23 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/match.py Thu Sep 06 03:21:05 2018 +0530 +++ b/mercurial/match.py Fri Aug 17 19:18:53 2018 -0700 @@ -496,6 +496,46 @@ def __repr__(self): return ('<patternmatcher patterns=%r>' % pycompat.bytestr(self._pats)) +# This is basically a reimplementation of util.dirs that stores the children +# instead of just a count of them, plus a small optional optimization to avoid +# some directories we don't need. +class _dirchildren(object): + def __init__(self, paths, onlyinclude=None): + self._dirs = {} + self._onlyinclude = onlyinclude or [] + addpath = self.addpath + for f in paths: + addpath(f) + + def addpath(self, path): + if path == '.': + return + dirs = self._dirs + findsplitdirs = _dirchildren._findsplitdirs + for d, b in findsplitdirs(path): + if d not in self._onlyinclude: + continue + dirs.setdefault(d, set()).add(b) + + @staticmethod + def _findsplitdirs(path): + # yields (dirname, basename) tuples, walking back to the root. This is + # very similar to util.finddirs, except: + # - produces a (dirname, basename) tuple, not just 'dirname' + # - includes root dir + # Unlike manifest._splittopdir, this does not suffix `dirname` with a + # slash, and produces '.' for the root instead of ''. + oldpos = len(path) + pos = path.rfind('/') + while pos != -1: + yield path[:pos], path[pos + 1:oldpos] + oldpos = pos + pos = path.rfind('/', 0, pos) + yield '.', path[:oldpos] + + def get(self, path): + return self._dirs.get(path, set()) + class includematcher(basematcher): def __init__(self, root, cwd, kindpats, listsubrepos=False, badfn=None): @@ -523,6 +563,18 @@ any(parentdir in self._roots for parentdir in util.finddirs(dir))) + @propertycache + def _allparentschildren(self): + # It may seem odd that we add dirs, roots, and parents, and then + # restrict to only parents. This is to catch the case of: + # dirs = ['foo/bar'] + # parents = ['foo'] + # if we asked for the children of 'foo', but had only added + # self._parents, we wouldn't be able to respond ['bar']. + return _dirchildren( + itertools.chain(self._dirs, self._roots, self._parents), + onlyinclude=self._parents) + def visitchildrenset(self, dir): if self._prefix and dir in self._roots: return 'all' @@ -535,30 +587,9 @@ for parentdir in util.finddirs(dir))): return 'this' - ret = set() if dir in self._parents: - # We add a '/' on to `dir` so that we don't return items that are - # prefixed by `dir` but are actually siblings of `dir`. - suffixeddir = dir + '/' if dir != '.' else '' - # Look in all _roots, _dirs, and _parents for things that start with - # 'suffixeddir'. - for d in [q for q in - itertools.chain(self._roots, self._dirs, self._parents) if - q.startswith(suffixeddir)]: - # Don't emit '.' in the response for the root directory - if not suffixeddir and d == '.': - continue - - # We return the item name without the `suffixeddir` prefix or a - # slash suffix - d = d[len(suffixeddir):] - if '/' in d: - # This is a subdirectory-of-a-subdirectory, i.e. - # suffixeddir='foo/', d was 'foo/bar/baz' before removing - # 'foo/'. - d = d[:d.index('/')] - ret.add(d) - return ret + return self._allparentschildren.get(dir) or set() + return set() @encoding.strmethod def __repr__(self): @@ -1238,6 +1269,10 @@ # util.dirs() does not include the root directory, so add it manually p.append('.') + # FIXME: all uses of this function convert these to sets, do so before + # returning. + # FIXME: all uses of this function do not need anything in 'roots' and + # 'dirs' to also be in 'parents', consider removing them before returning. return r, d, p def _explicitfiles(kindpats):