changeset 9515:f7d85980261c

Add script to rewrite revlog to workaround lack of parent deltas. Defaults to rewriting the manifest in the current repository. Based on a patch to rewrite-log by Benoit Boissinot that I found here: http://article.gmane.org/gmane.comp.version-control.mercurial.general/11908
author Greg Ward <greg-hg@gerg.ca>
date Thu, 27 Aug 2009 10:21:32 -0400
parents 7c01599dd340
children f8048c334066
files contrib/shrink-revlog.py
diffstat 1 files changed, 218 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/contrib/shrink-revlog.py	Thu Aug 27 10:21:32 2009 -0400
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+
+"""\
+Reorder a revlog (by default the manifest file in the current
+repository) to save space.  Specifically, this topologically sorts the
+revisions in the revlog so that revisions on the same branch are adjacent
+as much as possible.  This is a workaround for the fact that Mercurial
+computes deltas relative to the previous revision rather than relative to a
+parent revision.  This is *not* safe to run on a changelog.
+"""
+
+# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
+# as a patch to rewrite-log.  Cleaned up, refactored, documented, and
+# renamed by Greg Ward <greg at gerg.ca>.
+
+# XXX would be nice to have a way to verify the repository after shrinking,
+# e.g. by comparing "before" and "after" states of random changesets
+# (maybe: export before, shrink, export after, diff).
+
+import sys, os, tempfile
+import optparse
+from mercurial import ui as ui_, hg, revlog, transaction, node, util
+
+def toposort(rl):
+    """Return the revs of rl, parents first, keeping branches adjacent.
+
+    A rev is emitted only after all of its parents, and its ready children
+    are visited before anything else pending.  Progress goes to stdout.
+    """
+    emit = sys.stdout.write
+    kids, pending = {}, []
+    # map every rev to its children; parentless revs seed the work list
+    emit('reading %d revs ' % len(rl))
+    try:
+        for rev in rl:
+            kids[rev] = []
+            real = [p for p in rl.parentrevs(rev) if p != node.nullrev]
+            # p1 == p2: register this rev under that parent only once
+            if len(real) == 2 and real[0] == real[1]:
+                real.pop()
+            if not real:
+                pending.append(rev)
+            for parent in real:
+                assert parent in kids
+                kids[parent].append(rev)
+            if rev % 1000 == 0:
+                emit('.')
+    finally:
+        emit('\n')
+
+    # XXX this reimplements the 'branchsort' topo sort algorithm in
+    # hgext.convert.convcmd... would be nice not to duplicate it
+    emit('sorting ...')
+    ordered = []
+    while pending:
+        rev = pending.pop(0)
+        ordered.append(rev)
+        if rev not in kids:
+            # This only happens if some node's p1 == p2, which can
+            # happen in the manifest in certain circumstances.
+            continue
+        ready = []
+        for child in kids.pop(rev):
+            for p in rl.parentrevs(child):
+                if p != node.nullrev and p in kids:
+                    break   # still waiting on another parent
+            else:
+                ready.append(child)
+        pending[:0] = ready     # children jump the queue
+    emit('\n')
+    return ordered
+
+def writerevs(r1, r2, order, tr):
+    """Copy the revs listed in order from revlog r1 into revlog r2, keeping
+    each rev's parents and linkrev and adding it under transaction tr.
+    A progress dot is written to stdout every 1000 revs.
+    """
+    sys.stdout.write('writing %d revs ' % len(order))
+    try:
+        for done, rev in enumerate(order):
+            nodeid = r1.node(rev)
+            p1, p2 = r1.parents(nodeid)
+            linkrev = r1.linkrev(rev)
+            text = r1.revision(nodeid)
+            r2.addrevision(text, tr, linkrev, p1, p2)
+            if done % 1000 == 0:
+                sys.stdout.write('.')
+    finally:
+        sys.stdout.write('\n')
+    
+def report(olddatafn, newdatafn):
+    """Write the old and new data file sizes and the shrinkage to stdout.
+
+    olddatafn and newdatafn are the paths of the revlog data files before
+    and after rewriting; both must exist, since they are os.stat()ed.
+    """
+    oldsize = float(os.stat(olddatafn).st_size)
+    newsize = float(os.stat(newdatafn).st_size)
+
+    # argh: have to pass an int to %d, because a float >= 2^32
+    # blows up under Python 2.5 or earlier
+    sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n'
+                     % (int(oldsize), oldsize/1024/1024))
+    sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n'
+                     % (int(newsize), newsize/1024/1024))
+
+    if not oldsize or not newsize:
+        # A zero-byte file would make one of the ratios below divide by
+        # zero; the report is purely informational, so don't raise here.
+        sys.stdout.write('shrinkage: n/a (empty data file)\n')
+        return
+
+    shrink_percent = (oldsize - newsize) / oldsize * 100
+    shrink_factor = oldsize / newsize
+    sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n'
+                     % (shrink_percent, shrink_factor))
+
+def main():
+    """Command-line entry point: shrink one revlog of a local repository.
+
+    Parses -R/--repository and --revlog, refuses the changelog, revlogs
+    without a separate .d file and leftovers of a previous run, then
+    rewrites the revlog into temporary files in toposort() order and
+    swaps them into place, keeping the originals as *.old hard links.
+    Usage errors exit through parser.error().
+    """
+
+    # Unbuffer stdout for nice progress output.
+    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
+    write = sys.stdout.write
+
+    parser = optparse.OptionParser(description=__doc__)
+    parser.add_option('-R', '--repository',
+                      default=os.path.curdir,
+                      metavar='REPO',
+                      help='repository root directory [default: current dir]')
+    parser.add_option('--revlog',
+                      metavar='FILE',
+                      help='shrink FILE [default: REPO/.hg/store/00manifest.i]')
+    (options, args) = parser.parse_args()
+    if args:
+        parser.error('too many arguments')
+
+    # Open the specified repository.
+    ui = ui_.ui()
+    repo = hg.repository(ui, options.repository)
+    if not repo.local():
+        parser.error('not a local repository: %s' % options.repository)
+
+    if options.revlog is None:
+        indexfn = repo.sjoin('00manifest.i')
+    else:
+        if not options.revlog.endswith('.i'):
+            parser.error('--revlog option must specify the revlog index file '
+                         '(*.i), not %s' % options.revlog)
+
+        indexfn = os.path.realpath(options.revlog)
+        store = repo.sjoin('')
+        if not indexfn.startswith(store):
+            parser.error('--revlog option must specify a revlog in %s, not %s'
+                         % (store, indexfn))
+
+    datafn = indexfn[:-2] + '.d'
+    if not os.path.exists(indexfn):
+        parser.error('no such file: %s' % indexfn)
+    if '00changelog' in indexfn:
+        parser.error('shrinking the changelog will corrupt your repository')
+    if not os.path.exists(datafn):
+        # This is just a lazy shortcut because I can't be bothered to
+        # handle all the special cases that entail from no .d file.
+        parser.error('%s does not exist: revlog not big enough '
+                     'to be worth shrinking' % datafn)
+
+    oldindexfn = indexfn + '.old'
+    olddatafn = datafn + '.old'
+    if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
+        parser.error('one or both of\n'
+                     '  %s\n'
+                     '  %s\n'
+                     'exists from a previous run; please clean up before '
+                     'running again'
+                     % (oldindexfn, olddatafn))
+
+    # Take the repository lock before reading the old revlog or creating
+    # any file, and keep it until the new files have been moved into
+    # place: the copy must not be made from a revlog that a lock-holding
+    # writer could still change, and a failed lock leaves nothing behind.
+    lock = repo.lock(wait=False)
+    try:
+        write('shrinking %s\n' % indexfn)
+        prefix = os.path.basename(indexfn)[:-1]
+        (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn),
+                                               prefix=prefix,
+                                               suffix='.i')
+        tmpdatafn = tmpindexfn[:-2] + '.d'
+        os.close(tmpfd)
+
+        r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn)
+        r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn)
+
+        # Don't use repo.transaction(), because then things get hairy with
+        # paths: some need to be relative to .hg, and some need to be
+        # absolute.  Doing it this way keeps things simple: everything is
+        # an absolute path.
+        tr = transaction.transaction(sys.stderr.write,
+                                     open,
+                                     repo.sjoin('journal'))
+
+        try:
+            order = toposort(r1)
+            writerevs(r1, r2, order, tr)
+            report(datafn, tmpdatafn)
+            tr.close()
+        except:
+            # Deliberately a bare except: clean up after any failure,
+            # including KeyboardInterrupt, and then re-raise it.
+            # Abort transaction first, so we truncate the files before
+            # deleting them.
+            tr.abort()
+            if os.path.exists(tmpindexfn):
+                os.unlink(tmpindexfn)
+            if os.path.exists(tmpdatafn):
+                os.unlink(tmpdatafn)
+            raise
+
+        # Still under the lock: keep the old files reachable as *.old,
+        # then move the rewritten files over the original names.
+        os.link(indexfn, oldindexfn)
+        os.link(datafn, olddatafn)
+        os.rename(tmpindexfn, indexfn)
+        os.rename(tmpdatafn, datafn)
+    finally:
+        lock.release()
+
+    write('note: old revlog saved in:\n'
+          '  %s\n'
+          '  %s\n'
+          '(You can delete those files when you are satisfied that your\n'
+          'repository is still sane.  '
+          'Running \'hg verify\' is strongly recommended.)\n'
+          % (oldindexfn, olddatafn))
+
+# Run the tool when this file is executed; Ctrl-C ends the run with the
+# message "interrupted" instead of a traceback.
+try:
+    main()
+except KeyboardInterrupt:
+    sys.exit("interrupted")