changegroup: avoid iterating the whole manifest
The old code gathered the set of all files that changed anywhere in
history and then walked the entirety of each manifest to be sent in
order to collect the changed file nodes. That's going to be unfortunate
for narrowhg, and it's already inefficient for medium-to-large
repositories. Instead, record the set of changed files per manifest
node and look up only those files in each manifest.
Timings for bundle --all on my hg repo, tested with hgperf:
Before:
! wall 23.442445 comb 23.440000 user 23.250000 sys 0.190000 (best of 3)
After:
! wall 20.272187 comb 20.270000 user 20.190000 sys 0.080000 (best of 3)
--- a/mercurial/changegroup.py Fri Dec 04 15:59:46 2015 -0500
+++ b/mercurial/changegroup.py Fri Dec 04 10:34:58 2015 -0500
@@ -613,7 +613,8 @@
clrevorder = {}
mfs = {} # needed manifests
fnodes = {} # needed file nodes
- changedfiles = set()
+ # maps manifest node id -> set(changed files)
+ mfchangedfiles = {}
# Callback for the changelog, used to collect changed files and manifest
# nodes.
@@ -621,9 +622,12 @@
def lookupcl(x):
c = cl.read(x)
clrevorder[x] = len(clrevorder)
- changedfiles.update(c[3])
+ n = c[0]
# record the first changeset introducing this manifest version
- mfs.setdefault(c[0], x)
+ mfs.setdefault(n, x)
+ # Record a complete list of potentially-changed files in
+ # this manifest.
+ mfchangedfiles.setdefault(n, set()).update(c[3])
return x
self._verbosenote(_('uncompressed size of bundle content:\n'))
@@ -668,8 +672,12 @@
clnode = mfs[x]
if not fastpathlinkrev:
mdata = ml.readfast(x)
- for f, n in mdata.iteritems():
- if f in changedfiles:
+ for f in mfchangedfiles[x]:
+ if True:
+ try:
+ n = mdata[f]
+ except KeyError:
+ continue
# record the first changeset introducing this filelog
# version
fclnodes = fnodes.setdefault(f, {})
@@ -696,6 +704,9 @@
return dict(genfilenodes())
return fnodes.get(fname, {})
+ changedfiles = set()
+ for x in mfchangedfiles.itervalues():
+ changedfiles.update(x)
for chunk in self.generatefiles(changedfiles, linknodes, commonrevs,
source):
yield chunk