# HG changeset patch # User Siddharth Agarwal # Date 1416127229 28800 # Node ID f8a2647fe020da7bf68e12083f10bef9183e7ee3 # Parent f710644e1ce99a9f422b5a3a7b53d3dae2af36eb setdiscovery: avoid a full changelog graph traversal We were definitely being suboptimal here: we were constructing two full sets, one with the full set of common nodes (i.e. a graph traversal) and one with all nodes. Then we subtract one set from the other. This whole process is O(commits) and causes discovery to be significantly slower than it should be. Instead, keep track of common incrementally and keep undecided as small as possible. This makes discovery massively faster on large repos: on one such repo, 'hg debugdiscovery' over SSH with one commit missing on the client and five on the server went from 4.5 seconds to 1.5. (An 'hg debugdiscovery' with no commits missing on the client, i.e. connection startup time, was 1.2 seconds.) diff -r f710644e1ce9 -r f8a2647fe020 mercurial/setdiscovery.py --- a/mercurial/setdiscovery.py Fri Nov 14 19:40:30 2014 -0800 +++ b/mercurial/setdiscovery.py Sun Nov 16 00:40:29 2014 -0800 @@ -40,7 +40,7 @@ classified with it (since all ancestors or descendants will be marked as well). """ -from node import nullid +from node import nullid, nullrev from i18n import _ import random import util, dagutil @@ -177,27 +177,23 @@ # own nodes where I don't know if remote knows them undecided = dag.nodeset() # own nodes I know we both know - common = set() + # treat remote heads (and maybe own heads) as a first implicit sample + # response + common = cl.incrementalmissingrevs(srvheads) + commoninsample = set(n for i, n in enumerate(sample) if yesno[i]) + common.addbases(commoninsample) + undecided = set(common.missingancestors(ownheads)) # own nodes I know remote lacks missing = set() - # treat remote heads (and maybe own heads) as a first implicit sample - # response - common.update(dag.ancestorset(srvheads)) - undecided.difference_update(common) - full = False while undecided: if sample: - commoninsample = set(n for i, n in enumerate(sample) if yesno[i]) - common.update(dag.ancestorset(commoninsample, common)) - missinginsample = [n for i, n in enumerate(sample) if not yesno[i]] missing.update(dag.descendantset(missinginsample, missing)) undecided.difference_update(missing) - undecided.difference_update(common) if not undecided: break @@ -206,7 +202,7 @@ ui.note(_("sampling from both directions\n")) sample = _takefullsample(dag, undecided, size=fullsamplesize) targetsize = fullsamplesize - elif common: + elif common.hasbases(): # use cheapish initial sample ui.debug("taking initial sample\n") sample = _takefullsample(dag, undecided, size=fullsamplesize) @@ -228,7 +224,17 @@ yesno = remote.known(dag.externalizeall(sample)) full = True - result = dag.headsetofconnecteds(common) + if sample: + commoninsample = set(n for i, n in enumerate(sample) if yesno[i]) + common.addbases(commoninsample) + common.removeancestorsfrom(undecided) + + # heads(common) == heads(common.bases) since common represents common.bases + # and all its ancestors + result = dag.headsetofconnecteds(common.bases) + # common.bases can include nullrev, but our contract requires us to not + # return any heads in that case, so discard that + result.discard(nullrev) ui.progress(_('searching'), None) ui.debug("%d total queries\n" % roundtrips)