Mercurial > hg
changeset 9515:f7d85980261c
Add script to rewrite revlog to workaround lack of parent deltas.
Defaults to rewriting the manifest in the current repository.
Based on a patch to rewrite-log by Benoit Boissinot that I found here:
http://article.gmane.org/gmane.comp.version-control.mercurial.general/11908
author | Greg Ward <greg-hg@gerg.ca> |
---|---|
date | Thu, 27 Aug 2009 10:21:32 -0400 |
parents | 7c01599dd340 |
children | f8048c334066 |
files | contrib/shrink-revlog.py |
diffstat | 1 files changed, 218 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/contrib/shrink-revlog.py Thu Aug 27 10:21:32 2009 -0400 @@ -0,0 +1,218 @@ +#!/usr/bin/env python + +"""\ +Reorder a revlog (by default the the manifest file in the current +repository) to save space. Specifically, this topologically sorts the +revisions in the revlog so that revisions on the same branch are adjacent +as much as possible. This is a workaround for the fact that Mercurial +computes deltas relative to the previous revision rather than relative to a +parent revision. This is *not* safe to run on a changelog. +""" + +# Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org> +# as a patch to rewrite-log. Cleaned up, refactored, documented, and +# renamed by Greg Ward <greg at gerg.ca>. + +# XXX would be nice to have a way to verify the repository after shrinking, +# e.g. by comparing "before" and "after" states of random changesets +# (maybe: export before, shrink, export after, diff). + +import sys, os, tempfile +import optparse +from mercurial import ui as ui_, hg, revlog, transaction, node, util + +def toposort(rl): + write = sys.stdout.write + + children = {} + root = [] + # build children and roots + write('reading %d revs ' % len(rl)) + try: + for i in rl: + children[i] = [] + parents = [p for p in rl.parentrevs(i) if p != node.nullrev] + # in case of duplicate parents + if len(parents) == 2 and parents[0] == parents[1]: + del parents[1] + for p in parents: + assert p in children + children[p].append(i) + + if len(parents) == 0: + root.append(i) + + if i % 1000 == 0: + write('.') + finally: + write('\n') + + # XXX this is a reimplementation of the 'branchsort' topo sort + # algorithm in hgext.convert.convcmd... would be nice not to duplicate + # the algorithm + write('sorting ...') + visit = root + ret = [] + while visit: + i = visit.pop(0) + ret.append(i) + if i not in children: + # This only happens if some node's p1 == p2, which can + # happen in the manifest in certain circumstances. + continue + next = [] + for c in children.pop(i): + parents_unseen = [p for p in rl.parentrevs(c) + if p != node.nullrev and p in children] + if len(parents_unseen) == 0: + next.append(c) + visit = next + visit + write('\n') + return ret + +def writerevs(r1, r2, order, tr): + write = sys.stdout.write + write('writing %d revs ' % len(order)) + try: + count = 0 + for rev in order: + n = r1.node(rev) + p1, p2 = r1.parents(n) + l = r1.linkrev(rev) + t = r1.revision(n) + n2 = r2.addrevision(t, tr, l, p1, p2) + + if count % 1000 == 0: + write('.') + count += 1 + finally: + write('\n') + +def report(olddatafn, newdatafn): + oldsize = float(os.stat(olddatafn).st_size) + newsize = float(os.stat(newdatafn).st_size) + + # argh: have to pass an int to %d, because a float >= 2^32 + # blows up under Python 2.5 or earlier + sys.stdout.write('old file size: %12d bytes (%6.1f MiB)\n' + % (int(oldsize), oldsize/1024/1024)) + sys.stdout.write('new file size: %12d bytes (%6.1f MiB)\n' + % (int(newsize), newsize/1024/1024)) + + shrink_percent = (oldsize - newsize) / oldsize * 100 + shrink_factor = oldsize / newsize + sys.stdout.write('shrinkage: %.1f%% (%.1fx)\n' + % (shrink_percent, shrink_factor)) + +def main(): + + # Unbuffer stdout for nice progress output. + sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) + write = sys.stdout.write + + parser = optparse.OptionParser(description=__doc__) + parser.add_option('-R', '--repository', + default=os.path.curdir, + metavar='REPO', + help='repository root directory [default: current dir]') + parser.add_option('--revlog', + metavar='FILE', + help='shrink FILE [default: REPO/hg/store/00manifest.i]') + (options, args) = parser.parse_args() + if args: + parser.error('too many arguments') + + # Open the specified repository. + ui = ui_.ui() + repo = hg.repository(ui, options.repository) + if not repo.local(): + parser.error('not a local repository: %s' % options.repository) + + if options.revlog is None: + indexfn = repo.sjoin('00manifest.i') + else: + if not options.revlog.endswith('.i'): + parser.error('--revlog option must specify the revlog index file ' + '(*.i), not %s' % options.revlog) + + indexfn = os.path.realpath(options.revlog) + store = repo.sjoin('') + if not indexfn.startswith(store): + parser.error('--revlog option must specify a revlog in %s, not %s' + % (store, indexfn)) + + datafn = indexfn[:-2] + '.d' + if not os.path.exists(indexfn): + parser.error('no such file: %s' % indexfn) + if '00changelog' in indexfn: + parser.error('shrinking the changelog will corrupt your repository') + if not os.path.exists(datafn): + # This is just a lazy shortcut because I can't be bothered to + # handle all the special cases that entail from no .d file. + parser.error('%s does not exist: revlog not big enough ' + 'to be worth shrinking' % datafn) + + oldindexfn = indexfn + '.old' + olddatafn = datafn + '.old' + if os.path.exists(oldindexfn) or os.path.exists(olddatafn): + parser.error('one or both of\n' + ' %s\n' + ' %s\n' + 'exists from a previous run; please clean up before ' + 'running again' + % (oldindexfn, olddatafn)) + + write('shrinking %s\n' % indexfn) + prefix = os.path.basename(indexfn)[:-1] + (tmpfd, tmpindexfn) = tempfile.mkstemp(dir=os.path.dirname(indexfn), + prefix=prefix, + suffix='.i') + tmpdatafn = tmpindexfn[:-2] + '.d' + os.close(tmpfd) + + r1 = revlog.revlog(util.opener(os.getcwd(), audit=False), indexfn) + r2 = revlog.revlog(util.opener(os.getcwd(), audit=False), tmpindexfn) + + # Don't use repo.transaction(), because then things get hairy with + # paths: some need to be relative to .hg, and some need to be + # absolute. Doing it this way keeps things simple: everything is an + # absolute path. + lock = repo.lock(wait=False) + tr = transaction.transaction(sys.stderr.write, + open, + repo.sjoin('journal')) + + try: + try: + order = toposort(r1) + writerevs(r1, r2, order, tr) + report(datafn, tmpdatafn) + tr.close() + except: + # Abort transaction first, so we truncate the files before + # deleting them. + tr.abort() + if os.path.exists(tmpindexfn): + os.unlink(tmpindexfn) + if os.path.exists(tmpdatafn): + os.unlink(tmpdatafn) + raise + finally: + lock.release() + + os.link(indexfn, oldindexfn) + os.link(datafn, olddatafn) + os.rename(tmpindexfn, indexfn) + os.rename(tmpdatafn, datafn) + write('note: old revlog saved in:\n' + ' %s\n' + ' %s\n' + '(You can delete those files when you are satisfied that your\n' + 'repository is still sane. ' + 'Running \'hg verify\' is strongly recommended.)\n' + % (oldindexfn, olddatafn)) + +try: + main() +except KeyboardInterrupt: + sys.exit("interrupted")