Mercurial > hg
view contrib/dumprevlog @ 50400:95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Apparently the code uses "+=" with a bytes object, which is linear-time, so the
whole encoding is quadratic-time. This patch makes us use a bytearray object,
instead, which has a(n amortized-)constant-time append operation.
The encoding is still not particularly fast, but at least a 10MB file
takes tens of seconds, not many hours to encode.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Mon, 06 Mar 2023 11:27:57 +0000 |
parents | 6000f5b25c9b |
children |
line wrap: on
line source
#!/usr/bin/env python3 # Dump revlogs as raw data stream # $ find .hg/store/ -name "*.i" | xargs dumprevlog > repo.dump import sys from mercurial.node import hex from mercurial import ( encoding, pycompat, revlog, ) from mercurial.utils import procutil from mercurial.revlogutils import ( constants as revlog_constants, ) for fp in (sys.stdin, sys.stdout, sys.stderr): procutil.setbinary(fp) def binopen(path, mode=b'rb'): if b'b' not in mode: mode = mode + b'b' return open(path, pycompat.sysstr(mode)) binopen.options = {} def printb(data, end=b'\n'): sys.stdout.flush() procutil.stdout.write(data + end) for f in sys.argv[1:]: localf = encoding.strtolocal(f) if not localf.endswith(b'.i'): print("file:", f, file=sys.stderr) print(" invalid filename", file=sys.stderr) r = revlog.revlog( binopen, target=(revlog_constants.KIND_OTHER, b'dump-revlog'), radix=localf[:-2], ) print("file:", f) for i in r: n = r.node(i) p = r.parents(n) d = r.revision(n) printb(b"node: %s" % hex(n)) printb(b"linkrev: %d" % r.linkrev(i)) printb(b"parents: %s %s" % (hex(p[0]), hex(p[1]))) printb(b"length: %d" % len(d)) printb(b"-start-") printb(d) printb(b"-end-")