contrib/dumprevlog
author Matt Mackall <mpm@selenic.com>
Thu, 07 Jan 2016 14:57:57 -0600
changeset 27699 c8d3392f76e1
parent 14233 659f34b833b9
child 29165 a212ca70205c
permissions -rwxr-xr-x
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)

Default builds of Python have a Unicode type that isn't actually full Unicode but UTF-16, which encodes non-BMP codepoints to a pair of BMP codepoints with surrogate escaping. Since our UTF-8b hack escaping uses a plane that overlaps with the UTF-16 escaping system, this gets extra complicated. In addition, unichr() for codepoints greater than U+FFFF may not work either. This changes the code to reuse getutf8char to walk the byte string, so we only rely on Python for unpacking our U+DCxx characters.

#!/usr/bin/env python
# Dump revlogs as raw data stream
# $ find .hg/store/ -name "*.i" | xargs dumprevlog > repo.dump

import sys
from mercurial import revlog, node, util

# Hand each of the three standard streams to util.setbinary() before any
# output is produced.
# NOTE(review): presumably this disables newline translation on platforms
# that distinguish text from binary streams -- confirm against util.
for stream in sys.stdin, sys.stdout, sys.stderr:
    util.setbinary(stream)

def binopen(fn, mode='rb'):
    """Open ``fn`` in binary mode; passed to revlog.revlog() as its opener.

    ``mode`` is optional and defaults to read-only binary, so a call with
    just a filename behaves as before.  If a mode is supplied, a ``b`` is
    appended when missing so the file is never opened in text mode.
    """
    if 'b' not in mode:
        mode += 'b'
    return open(fn, mode)


# Dump every revlog named on the command line to stdout.  For each file a
# "file:" header is written, followed by one record per revision holding
# its node, linkrev, both parents, the data length, and the revision data
# itself between "-start-" and "-end-" marker lines.
write = sys.stdout.write
for f in sys.argv[1:]:
    r = revlog.revlog(binopen, f)
    write("file: %s\n" % (f,))
    for i in r:
        n = r.node(i)
        p = r.parents(n)
        d = r.revision(n)
        write("node: %s\n" % (node.hex(n),))
        write("linkrev: %s\n" % (r.linkrev(i),))
        write("parents: %s %s\n" % (node.hex(p[0]), node.hex(p[1])))
        write("length: %s\n" % (len(d),))
        write("-start-\n")
        # The revision data goes out verbatim followed by a single newline.
        # It is written separately from the newline so that a potentially
        # large payload is not copied just to append one character.
        write(d)
        write("\n")
        write("-end-\n")