contrib/synthrepo.py
changeset 22709 889789a2ca9f
parent 22708 4c66e70c3488
child 23234 944d6cfbe166
--- a/contrib/synthrepo.py	Fri Sep 12 22:04:29 2014 -0400
+++ b/contrib/synthrepo.py	Fri Sep 12 22:07:23 2014 -0400
@@ -23,6 +23,7 @@
 - Probability of a commit being a merge
 - Probability of a newly added file being added to a new directory
 - Interarrival time, and time zone, of commits
+- Number of files in each directory
 
 A few obvious properties that are not currently handled realistically:
 
@@ -81,21 +82,25 @@
         yield filename, mar, lineadd, lineremove, binary
 
 @command('analyze',
-         [('o', 'output', [], _('write output to given file'), _('FILE')),
+         [('o', 'output', '', _('write output to given file'), _('FILE')),
           ('r', 'rev', [], _('analyze specified revisions'), _('REV'))],
-         _('hg analyze'))
+         _('hg analyze'), optionalrepo=True)
 def analyze(ui, repo, *revs, **opts):
     '''create a simple model of a repository to use for later synthesis
 
     This command examines every changeset in the given range (or all
     of history if none are specified) and creates a simple statistical
-    model of the history of the repository.
+    model of the history of the repository. It also measures the directory
+    structure of the repository as checked out.
 
     The model is written out to a JSON file, and can be used by
     :hg:`synthesize` to create or augment a repository with synthetic
     commits that have a structure that is statistically similar to the
     analyzed repository.
     '''
+    root = repo.root if repo else os.getcwd()
+    if not root.endswith(os.path.sep):
+        root += os.path.sep
 
     revs = list(revs)
     revs.extend(opts['rev'])
@@ -104,15 +109,24 @@
 
     output = opts['output']
     if not output:
-        output = os.path.basename(repo.root) + '.json'
+        output = os.path.basename(root.rstrip(os.path.sep)) + '.json'
 
     if output == '-':
         fp = sys.stdout
     else:
         fp = open(output, 'w')
 
-    revs = scmutil.revrange(repo, revs)
-    revs.sort()
+    # Always obtain file counts of each directory in the given root directory.
+    def onerror(e):
+        ui.warn(_('error walking directory structure: %s\n') % e)
+
+    dirs = {}
+    rootprefixlen = len(root)
+    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
+        dirpathfromroot = dirpath[rootprefixlen:]
+        dirs[dirpathfromroot] = len(filenames)
+        if '.hg' in dirnames:
+            dirnames.remove('.hg')
 
     lineschanged = zerodict()
     children = zerodict()
@@ -128,54 +142,61 @@
     dirsadded = zerodict()
     tzoffset = zerodict()
 
-    progress = ui.progress
-    _analyzing = _('analyzing')
-    _changesets = _('changesets')
-    _total = len(revs)
+    # If a mercurial repo is available, also model the commit history.
+    if repo:
+        revs = scmutil.revrange(repo, revs)
+        revs.sort()
+
+        progress = ui.progress
+        _analyzing = _('analyzing')
+        _changesets = _('changesets')
+        _total = len(revs)
 
-    for i, rev in enumerate(revs):
-        progress(_analyzing, i, unit=_changesets, total=_total)
-        ctx = repo[rev]
-        pl = ctx.parents()
-        pctx = pl[0]
-        prev = pctx.rev()
-        children[prev] += 1
-        p1distance[rev - prev] += 1
-        parents[len(pl)] += 1
-        tzoffset[ctx.date()[1]] += 1
-        if len(pl) > 1:
-            p2distance[rev - pl[1].rev()] += 1
-        if prev == rev - 1:
-            lastctx = pctx
-        else:
-            lastctx = repo[rev - 1]
-        if lastctx.rev() != nullrev:
-            interarrival[roundto(ctx.date()[0] - lastctx.date()[0], 300)] += 1
-        diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
-        fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
-        for filename, mar, lineadd, lineremove, binary in parsegitdiff(diff):
-            if binary:
-                continue
-            added = sum(lineadd.itervalues(), 0)
-            if mar == 'm':
-                if added and lineremove:
-                    lineschanged[roundto(added, 5), roundto(lineremove, 5)] += 1
-                    filechanges += 1
-            elif mar == 'a':
-                fileadds += 1
-                if '/' in filename:
-                    filedir = filename.rsplit('/', 1)[0]
-                    if filedir not in pctx.dirs():
-                        diradds += 1
-                linesinfilesadded[roundto(added, 5)] += 1
-            elif mar == 'r':
-                fileremoves += 1
-            for length, count in lineadd.iteritems():
-                linelengths[length] += count
-        fileschanged[filechanges] += 1
-        filesadded[fileadds] += 1
-        dirsadded[diradds] += 1
-        filesremoved[fileremoves] += 1
+        for i, rev in enumerate(revs):
+            progress(_analyzing, i, unit=_changesets, total=_total)
+            ctx = repo[rev]
+            pl = ctx.parents()
+            pctx = pl[0]
+            prev = pctx.rev()
+            children[prev] += 1
+            p1distance[rev - prev] += 1
+            parents[len(pl)] += 1
+            tzoffset[ctx.date()[1]] += 1
+            if len(pl) > 1:
+                p2distance[rev - pl[1].rev()] += 1
+            if prev == rev - 1:
+                lastctx = pctx
+            else:
+                lastctx = repo[rev - 1]
+            if lastctx.rev() != nullrev:
+                timedelta = ctx.date()[0] - lastctx.date()[0]
+                interarrival[roundto(timedelta, 300)] += 1
+            diff = sum((d.splitlines() for d in ctx.diff(pctx, git=True)), [])
+            fileadds, diradds, fileremoves, filechanges = 0, 0, 0, 0
+            for filename, mar, lineadd, lineremove, isbin in parsegitdiff(diff):
+                if isbin:
+                    continue
+                added = sum(lineadd.itervalues(), 0)
+                if mar == 'm':
+                    if added and lineremove:
+                        lineschanged[roundto(added, 5),
+                                     roundto(lineremove, 5)] += 1
+                        filechanges += 1
+                elif mar == 'a':
+                    fileadds += 1
+                    if '/' in filename:
+                        filedir = filename.rsplit('/', 1)[0]
+                        if filedir not in pctx.dirs():
+                            diradds += 1
+                    linesinfilesadded[roundto(added, 5)] += 1
+                elif mar == 'r':
+                    fileremoves += 1
+                for length, count in lineadd.iteritems():
+                    linelengths[length] += count
+            fileschanged[filechanges] += 1
+            filesadded[fileadds] += 1
+            dirsadded[diradds] += 1
+            filesremoved[fileremoves] += 1
 
     invchildren = zerodict()
 
@@ -189,6 +210,7 @@
         return sorted(d.iteritems(), key=lambda x: x[1], reverse=True)
 
     json.dump({'revs': len(revs),
+               'initdirs': pronk(dirs),
                'lineschanged': pronk(lineschanged),
                'children': pronk(invchildren),
                'fileschanged': pronk(fileschanged),