contrib: add a commit synthesizer for reproducing scaling problems
authorBryan O'Sullivan <bryano@fb.com>
Mon, 08 Oct 2012 15:57:21 -0700
changeset 17734 619068c280fd
parent 17733 3c775c5a6c03
child 17736 1c523c4008f7
child 17748 49f759636aaf
contrib: add a commit synthesizer for reproducing scaling problems

This adds two new commands:

- analyze examines an existing repo and writes out a statistical
  description of its properties that contains no identifying
  information.

- synthesize creates new commits based on the description generated
  by analyze.

The intention is that a repo constructed using synthesize will have
properties that are vaguely statistically similar to the originating
repo, but entirely random content. This can be useful for forecasting
performance as a repo grows, and for developers who want to find
bottlenecks in proprietary repos to which they do not have access.
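A sketch of the intended workflow (the paths, model file name, and
commit count here are illustrative only, and both commands assume the
extension has been enabled in hgrc):

  $ hg -R ~/src/real-repo analyze --output model.json
  $ hg init ~/src/synthetic-repo
  $ hg -R ~/src/synthetic-repo synthesize --count 1000 model.json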
contrib/synthrepo.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/contrib/synthrepo.py	Mon Oct 08 15:57:21 2012 -0700
@@ -0,0 +1,406 @@
+# synthrepo.py - repo synthesis
+#
+# Copyright 2012 Facebook
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+'''synthesize structurally interesting change history
+
+This extension is useful for creating a repository with properties
+that are statistically similar to an existing repository. During
+analysis, a simple probability table is constructed from the history
+of an existing repository.  During synthesis, these properties are
+reconstructed.
+
+Properties that are analyzed and synthesized include the following:
+
+- Lines added or removed when an existing file is modified
+- Number and sizes of files added
+- Number of files removed
+- Line lengths
+- Topological distance to parent changeset(s)
+- Probability of a commit being a merge
+- Probability of a newly added file being added to a new directory
+- Interarrival time, and time zone, of commits
+
+A few obvious properties are not currently handled realistically:
+
+- Merges are treated as regular commits with two parents, which is not
+  realistic
+- Modifications are not treated as operations on hunks of lines, but
+  as insertions and deletions of randomly chosen single lines
+- Committer IDs are always random
+- Executable bits on files are not reproduced
+- Symlinks and binary files are ignored
+'''
+
+import bisect, collections, json, os, random, sys, time
+from mercurial import cmdutil, context, patch, scmutil, url, util
+from mercurial.i18n import _
+from mercurial.node import nullrev, nullid
+
+testedwith = 'internal'
+
+cmdtable = {}
+command = cmdutil.command(cmdtable)
+
+newfile = set(('new fi', 'rename', 'copy f', 'copy t'))
+
+def zerodict():
+    return collections.defaultdict(lambda: 0)
+
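+# Round x to the nearest multiple of k once it exceeds 2 * k; smaller
+# values are kept exact. This buckets large counts so histograms in the
+# model stay compact.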
+def roundto(x, k):
+    if x > k * 2:
+        return int(round(x / float(k)) * k)
+    return int(round(x))
+
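+# Parse a git-format diff, yielding one (filename, add/modify/remove
+# flag, histogram of added-line lengths, removed-line count, is-binary)
+# tuple per file touched.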
+def parsegitdiff(lines):
+    filename, mar, lineadd, lineremove = None, None, zerodict(), 0
+    binary = False
+    for line in lines:
+        start = line[:6]
+        if start == 'diff -':
+            if filename:
+                yield filename, mar, lineadd, lineremove, binary
+            mar, lineadd, lineremove, binary = 'm', zerodict(), 0, False
+            filename = patch.gitre.match(line).group(1)
+        elif start in newfile:
+            mar = 'a'
+        elif start == 'GIT bi':
+            binary = True
+        elif start == 'delete':
+            mar = 'r'
+        elif start:
+            s = start[0]
+            if s == '-' and not line.startswith('--- '):
+                lineremove += 1
+            elif s == '+' and not line.startswith('+++ '):
+                lineadd[roundto(len(line) - 1, 5)] += 1
+    if filename:
+        yield filename, mar, lineadd, lineremove, binary
+
+@command('analyze',
+         [('o', 'output', '', _('write output to given file'), _('FILE')),
+          ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
+         _('hg analyze [OPTION]... [REV]...'))
+def analyze(ui, repo, *revs, **opts):
+    '''create a simple model of a repository to use for later synthesis
+
+    This command examines every changeset in the given range (or all
+    of history if none are specified) and creates a simple statistical
+    model of the history of the repository.
+
+    The model is written out to a JSON file, and can be used by
+    :hg:`synthesize` to create or augment a repository with synthetic
+    commits that have a structure that is statistically similar to the
+    analyzed repository.
+    '''
+
+    revs = list(revs)
+    revs.extend(opts['rev'])
+    if not revs:
+        revs = [':']
+
+    output = opts['output']
+    if not output:
+        output = os.path.basename(repo.root) + '.json'
+
+    if output == '-':
+        fp = sys.stdout
+    else:
+        fp = open(output, 'w')
+
+    revs = scmutil.revrange(repo, revs)
+    revs.sort()
+
+    lineschanged = zerodict()
+    children = zerodict()
+    p1distance = zerodict()
+    p2distance = zerodict()
+    linesinfilesadded = zerodict()
+    fileschanged = zerodict()
+    filesadded = zerodict()
+    filesremoved = zerodict()
+    linelengths = zerodict()
+    interarrival = zerodict()
+    parents = zerodict()
+    dirsadded = zerodict()
+    tzoffset = zerodict()
+
+    progress = ui.progress
+    _analyzing = _('analyzing')
+    _changesets = _('changesets')
+    _total = len(revs)
+
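+    # Walk the selected revisions, accumulating histograms of the
+    # structural properties listed in the module docstring.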
+    for i, rev in enumerate(revs):
+        progress(_analyzing, i, unit=_changesets, total=_total)
+        ctx = repo[rev]
+        pl = ctx.parents()
+        pctx = pl[0]
+        prev = pctx.rev()
+        children[prev] += 1
+        p1distance[rev - prev] += 1
+        parents[len(pl)] += 1
+        tzoffset[ctx.date()[1]] += 1
+        if len(pl) > 1:
+            p2distance[rev - pl[1].rev()] += 1
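+        # Record the interarrival time against the previous revision,
+        # bucketed to 300-second granularity.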
+        if prev == rev - 1:
+            lastctx = pctx
+        else:
+            lastctx = repo[rev - 1]
+        if lastctx.rev() != nullrev:
+            interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
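+        # Flatten the git diff against p1 into lines and tally per-file
+        # statistics for added, modified and removed files.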
+        diff = sum((d.splitlines()
+                    for d in ctx.diff(pctx, opts=dict(git=True))), [])
+        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
+        for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
+            if binary:
+                continue
+            added = sum(lineadd.itervalues(), 0)
+            if mar == 'm':
+                if added and lineremove:
+                    lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
+                    filechanges += 1
+            elif mar == 'a':
+                fileadds += 1
+                if '/' in filename:
+                    filedir = filename.rsplit('/', 1)[0]
+                    if filedir not in pctx.dirs():
+                        diradds += 1
+                linesinfilesadded[roundto(added, 5)] += 1
+            elif mar == 'r':
+                fileremoves += 1
+            for length, count in lineadd.iteritems():
+                linelengths[length] += count
+        fileschanged[filechanges] += 1
+        filesadded[fileadds] += 1
+        dirsadded[diradds] += 1
+        filesremoved[fileremoves] += 1
+
+    invchildren = zerodict()
+
+    for rev, count in children.iteritems():
+        invchildren[count] += 1
+
+    if output != '-':
+        ui.status(_('writing output to %s\n') % output)
+
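+    # Sort a histogram's entries by descending frequency for the model.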
+    def pronk(d):
+        return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
+
+    json.dump(dict(revs=len(revs),
+                   lineschanged=pronk(lineschanged),
+                   children=pronk(invchildren),
+                   fileschanged=pronk(fileschanged),
+                   filesadded=pronk(filesadded),
+                   linesinfilesadded=pronk(linesinfilesadded),
+                   dirsadded=pronk(dirsadded),
+                   filesremoved=pronk(filesremoved),
+                   linelengths=pronk(linelengths),
+                   parents=pronk(parents),
+                   p1distance=pronk(p1distance),
+                   p2distance=pronk(p2distance),
+                   interarrival=pronk(interarrival),
+                   tzoffset=pronk(tzoffset),
+                   ),
+              fp)
+    if fp != sys.stdout:
+        fp.close()
+
+@command('synthesize',
+         [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
+          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
+         _('hg synthesize [OPTION]... DESCFILE'))
+def synthesize(ui, repo, descpath, **opts):
+    '''synthesize commits based on a model of an existing repository
+
+    The model must have been generated by :hg:`analyze`. Commits will
+    be generated randomly according to the probabilities described in
+    the model.
+
+    When synthesizing new content, commit descriptions, and user
+    names, words are chosen randomly from a dictionary containing one
+    word per line (/usr/share/dict/words by default). Use --dict to
+    specify the path to an alternate dictionary.
+    '''
+    try:
+        fp = url.open(ui, descpath)
+    except Exception, err:
+        raise util.Abort('%s: %s' % (descpath, err))
+    desc = json.load(fp)
+    fp.close()
+
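+    # Convert a [value, count] histogram from the model into a cumulative
+    # distribution: a list of values and a parallel list of cumulative
+    # probabilities, ready for weighted random sampling.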
+    def cdf(l):
+        vals, probs = zip(*sorted(l, key=lambda x: x[1], reverse=True))
+        t = float(sum(probs, 0))
+        s, cdfs = 0, []
+        for v in probs:
+            s += v
+            cdfs.append(s / t)
+        return vals, cdfs
+
+    lineschanged = cdf(desc['lineschanged'])
+    fileschanged = cdf(desc['fileschanged'])
+    filesadded = cdf(desc['filesadded'])
+    dirsadded = cdf(desc['dirsadded'])
+    filesremoved = cdf(desc['filesremoved'])
+    linelengths = cdf(desc['linelengths'])
+    parents = cdf(desc['parents'])
+    p1distance = cdf(desc['p1distance'])
+    p2distance = cdf(desc['p2distance'])
+    interarrival = cdf(desc['interarrival'])
+    linesinfilesadded = cdf(desc['linesinfilesadded'])
+    tzoffset = cdf(desc['tzoffset'])
+
+    dictfile = opts.get('dict') or '/usr/share/dict/words'
+    try:
+        fp = open(dictfile, 'rU')
+    except IOError, err:
+        raise util.Abort('%s: %s' % (dictfile, err.strerror))
+    words = fp.read().splitlines()
+    fp.close()
+
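+    # Draw a weighted random sample: locate a uniform variate in the
+    # cumulative distribution and return the corresponding value.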
+    def pick(cdf):
+        return cdf[0][bisect.bisect_left(cdf[1], random.random())]
+
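+    # Build a line of random dictionary words at least as long as a
+    # length sampled from the line-length distribution (or `minimum`).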
+    def makeline(minimum=0):
+        total = max(minimum, pick(linelengths))
+        c, l = 0, []
+        while c < total:
+            w = random.choice(words)
+            c += len(w) + 1
+            l.append(w)
+        return ' '.join(l)
+
+    wlock = repo.wlock()
+    lock = repo.lock()
+
+    nevertouch = set(('.hgsub', '.hgignore', '.hgtags'))
+
+    progress = ui.progress
+    _synthesizing = _('synthesizing')
+    _changesets = _('changesets')
+
+    count = int(opts['count'])
+    heads = set(map(repo.changelog.rev, repo.heads()))
+    for i in xrange(count):
+        progress(_synthesizing, i, unit=_changesets, total=count)
+
+        node = repo.changelog.node
+        revs = len(repo)
+
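+        # Sample a topological distance back from the current tip and
+        # return the nearest existing head at or after that revision.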
+        def pickhead(heads, distance):
+            if heads:
+                lheads = sorted(heads)
+                rev = revs - min(pick(distance), revs)
+                if rev < lheads[-1]:
+                    rev = lheads[bisect.bisect_left(lheads, rev)]
+                else:
+                    rev = lheads[-1]
+                return rev, node(rev)
+            return nullrev, nullid
+
+        r1 = revs - min(pick(p1distance), revs)
+        p1 = node(r1)
+
+        # the number of heads will grow without bound if we use a pure
+        # model, so artificially constrain their proliferation
+        if pick(parents) == 2 or len(heads) > random.randint(1, 20):
+            r2, p2 = pickhead(heads.difference([r1]), p2distance)
+        else:
+            r2, p2 = nullrev, nullid
+
+        pl = [p1, p2]
+        pctx = repo[r1]
+        mf = pctx.manifest()
+        mfk = mf.keys()
+        changes = {}
+        if mfk:
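+            # Modify a sampled number of existing files, deleting and
+            # inserting randomly chosen lines per the model.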
+            for __ in xrange(pick(fileschanged)):
+                for __ in xrange(10):
+                    fctx = pctx.filectx(random.choice(mfk))
+                    path = fctx.path()
+                    if not (path in nevertouch or fctx.isbinary() or
+                            'l' in fctx.flags()):
+                        break
+                lines = fctx.data().splitlines()
+                add, remove = pick(lineschanged)
+                for __ in xrange(remove):
+                    if not lines:
+                        break
+                    del lines[random.randrange(0, len(lines))]
+                for __ in xrange(add):
+                    lines.insert(random.randint(0, len(lines)), makeline())
+                changes[path] = context.memfilectx(path,
+                                                   '\n'.join(lines) + '\n')
+            for __ in xrange(pick(filesremoved)):
+                for __ in xrange(10):
+                    path = random.choice(mfk)
+                    if path not in changes:
+                        changes[path] = None
+                        break
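+        # Create new files, sometimes in new directories, with sizes
+        # drawn from the linesinfilesadded distribution.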
+        if filesadded:
+            dirs = list(pctx.dirs())
+            dirs.append('')
+        for __ in xrange(pick(filesadded)):
+            path = [random.choice(dirs)]
+            if pick(dirsadded):
+                path.append(random.choice(words))
+            path.append(random.choice(words))
+            path = '/'.join(filter(None, path))
+            data = '\n'.join(makeline()
+                             for __ in xrange(pick(linesinfilesadded))) + '\n'
+            changes[path] = context.memfilectx(path, data)
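+        # memctx callback: a None entry in changes marks a removed file,
+        # which is signaled to memctx by raising IOError.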
+        def filectxfn(repo, memctx, path):
+            data = changes[path]
+            if data is None:
+                raise IOError
+            return data
+        if not changes:
+            continue
+        if revs:
+            date = repo['tip'].date()[0] + pick(interarrival)
+        else:
+            date = time.time() - (86400 * count)
+        user = random.choice(words) + '@' + random.choice(words)
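+        # Assemble the synthetic changeset in memory and commit it, then
+        # update the working set of heads.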
+        mc = context.memctx(repo, pl, makeline(minimum=2),
+                            sorted(changes.iterkeys()),
+                            filectxfn, user, '%d %d' % (date, pick(tzoffset)))
+        newnode = mc.commit()
+        heads.add(repo.changelog.rev(newnode))
+        heads.discard(r1)
+        heads.discard(r2)
+
+    lock.release()
+    wlock.release()