# HG changeset patch
# User Pierre-Yves David <pierre-yves.david@octobus.net>
# Date 1537892626 -7200
# Node ID 683ceec8d37e4b51c59f089b12f0087993d2de2b
# Parent  1421ff5c5c96455caa47bfc669cf877729888740
pullbundle: add a command to generate cache hit statistics

This new command should help check that the stable ranges produce reusable
bundles.

diff -r 1421ff5c5c96 -r 683ceec8d37e hgext3rd/pullbundle.py
--- a/hgext3rd/pullbundle.py	Tue Sep 25 13:44:32 2018 +0200
+++ b/hgext3rd/pullbundle.py	Tue Sep 25 18:23:46 2018 +0200
@@ -72,16 +72,21 @@
 extensions next to it. As soon as stable range have been upstreamed, we won't
 need the dependency to the evolve extension anymore.
 """
+
+import collections
 import errno
+import random
 import os
 
 from mercurial import (
     changegroup,
     discovery,
+    error,
     exchange,
     narrowspec,
     node as nodemod,
     registrar,
+    scmutil,
     util,
 )
 
@@ -92,6 +97,9 @@
 # minimumhgversion = ''
 buglink = 'https://bz.mercurial-scm.org/'
 
+cmdtable = {}
+command = registrar.command(cmdtable)
+
 configtable = {}
 configitem = registrar.configitem(configtable)
 
@@ -437,3 +445,141 @@
             pversion = version
         partdata = (cachedata, nbchanges, pversion)
     return _makepartfromstream(newpart, repo, *partdata)
+
+@command('^debugpullbundlecacheoverlap',
+         [('', 'count', 100, _('of "client" pulling')),
+         ],
+         _('hg debugpullbundlecacheoverlap [--count 100] REVSET'))
+def debugpullbundlecacheoverlap(ui, repo, *revs, **opts):
+    '''Display statistics on bundle cache hits
+
+    This command "simulates" pulls from multiple clients, each using a random
+    subset of the revisions defined by REVSET, and displays statistics about
+    the overlap in the bundles necessary to serve them.
+    '''
+    actionrevs = scmutil.revrange(repo, revs)
+    # Test the resolved revisions rather than the raw arguments: a REVSET
+    # matching nothing is as unusable as no REVSET at all.
+    if not actionrevs:
+        raise error.Abort('No revision selected')
+    count = opts['count']
+    if count < 1:
+        # without any sample there is nothing to compute statistics from
+        raise error.Abort('--count must be a positive number')
+
+    # rangeid -> number of simulated pulls that needed that range so far
+    bundlehits = collections.defaultdict(int)
+    # one tuple per simulated pull, see `stats` below for the fields
+    pullstats = []
+
+    rlen = lambda rangeid: repo.stablerange.rangelength(repo, rangeid)
+
+    repo.ui.write("gathering %d sample pulls within %d revisions\n"
+                  % (count, len(actionrevs)))
+    for i in xrange(count):
+        repo.ui.progress('gathering data', i, total=count)
+        outgoing = takeonesample(repo, actionrevs)
+        ranges = sliceoutgoing(repo, outgoing)
+        hitranges = 0
+        hitchanges = 0
+        pullchanges = 0
+        for rangeid, __ in ranges:
+            length = rlen(rangeid)
+            pullchanges += length
+            # a range already needed by an earlier pull counts as a cache hit
+            if bundlehits[rangeid]:
+                hitranges += 1
+                hitchanges += length
+            bundlehits[rangeid] += 1
+        stats = (len(outgoing.missing),  # changesets pulled
+                 pullchanges,            # changesets in the ranges used
+                 hitchanges,             # ... of which in cached ranges
+                 len(ranges),            # ranges (bundles) used
+                 hitranges,              # ... of which already cached
+                 )
+        pullstats.append(stats)
+    repo.ui.progress('gathering data', None)
+
+    sizes = []
+    changesmissing = []
+    totalchanges = 0
+    totalcached = 0
+    changesratio = []
+    rangesratio = []
+    bundlecount = []
+    for size, changes, cached, nbranges, nbhits in pullstats:
+        sizes.append(size)
+        changesmissing.append(changes - cached)
+        # guard the ratios: a pull served without any range has no hit rate
+        changesratio.append(cached / float(changes) if changes else 0.0)
+        rangesratio.append(nbhits / float(nbranges) if nbranges else 0.0)
+        bundlecount.append(nbranges)
+        totalchanges += changes
+        totalcached += cached
+    cachedpercent = totalcached * 100 // totalchanges if totalchanges else 0
+
+    sizesdist = distribution(sizes)
+    repo.ui.write(fmtdist('pull size', sizesdist))
+    changesmissingdist = distribution(changesmissing)
+    repo.ui.write(fmtdist('non-cached changesets', changesmissingdist))
+    changesratiodist = distribution(changesratio)
+    repo.ui.write(fmtdist('ratio of cached changesets', changesratiodist))
+    bundlecountdist = distribution(bundlecount)
+    repo.ui.write(fmtdist('bundle count', bundlecountdist))
+    rangesratiodist = distribution(rangesratio)
+    repo.ui.write(fmtdist('ratio of cached bundles', rangesratiodist))
+    repo.ui.write('changesets served:\n')
+    repo.ui.write('  total:      %7d\n' % totalchanges)
+    repo.ui.write('  from cache: %7d (%2d%%)\n'
+                  % (totalcached, cachedpercent))
+    repo.ui.write('  bundle:     %7d\n' % sum(bundlecount))
+
+def takeonesample(repo, revs):
+    """Return the outgoing data of one simulated pull within ``revs``
+
+    A random subset of ``revs`` is picked (one revision out of 1000, but at
+    least 4 when that many exist) and closed with the ``%ld::%ld`` revset.
+    The resulting nodes are handed over to ``outgoingfromnodes``.
+    """
+    node = repo.changelog.node
+    # random.sample() raises when asked for more items than available
+    samplesize = min(len(revs), max(4, len(revs) // 1000))
+    pulled = random.sample(revs, samplesize)
+    pulled = repo.revs('%ld::%ld', pulled, pulled)
+    nodes = [node(r) for r in pulled]
+    return outgoingfromnodes(repo, nodes)
+
+def distribution(data):
+    """Return min, max and a few percentiles of ``data`` as a dict
+
+    ``data`` must be a non-empty iterable of sortable values. It is not
+    modified. The keys match the fields of ``STATSFORMAT``.
+    """
+    data = sorted(data)
+    length = len(data)
+    # Multiply before dividing: (length // 10) * 9 is 0 for any sample shorter
+    # than 10 items, which reported the minimum as 90th percentile.
+    return {
+        'min': data[0],
+        '10%': data[length // 10],
+        '25%': data[length // 4],
+        '50%': data[length // 2],
+        '75%': data[length * 3 // 4],
+        '90%': data[length * 9 // 10],
+        'max': data[-1],
+    }
+
+# template of one statistic block, filled by fmtdist()
+STATSFORMAT = """{name}:
+  min: {min}
+  10%: {10%}
+  25%: {25%}
+  50%: {50%}
+  75%: {75%}
+  90%: {90%}
+  max: {max}
+"""
+
+def fmtdist(name, data):
+    """Render a ``distribution()`` result, titled ``name``, for display"""
+    return STATSFORMAT.format(name=name, **data)