view contrib/dumprevlog @ 50400:95acba2c29f6

encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings Apparently the code uses "+=" with a bytes object, which is linear-time, so the whole encoding is quadratic-time. This patch makes us use a bytearray object, instead, which has a(n amortized-)constant-time append operation. The encoding is still not particularly fast, but at least a 10MB file takes tens of seconds, not many hours to encode.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Mon, 06 Mar 2023 11:27:57 +0000
parents 6000f5b25c9b
children
line wrap: on
line source

#!/usr/bin/env python3
# Dump revlogs as raw data stream
# $ find .hg/store/ -name "*.i" | xargs dumprevlog > repo.dump


import sys
from mercurial.node import hex
from mercurial import (
    encoding,
    pycompat,
    revlog,
)
from mercurial.utils import procutil

from mercurial.revlogutils import (
    constants as revlog_constants,
)

for fp in (sys.stdin, sys.stdout, sys.stderr):
    procutil.setbinary(fp)


def binopen(path, mode=b'rb'):
    if b'b' not in mode:
        mode = mode + b'b'
    return open(path, pycompat.sysstr(mode))


binopen.options = {}


def printb(data, end=b'\n'):
    sys.stdout.flush()
    procutil.stdout.write(data + end)


for f in sys.argv[1:]:
    localf = encoding.strtolocal(f)
    if not localf.endswith(b'.i'):
        print("file:", f, file=sys.stderr)
        print("  invalid filename", file=sys.stderr)

    r = revlog.revlog(
        binopen,
        target=(revlog_constants.KIND_OTHER, b'dump-revlog'),
        radix=localf[:-2],
    )
    print("file:", f)
    for i in r:
        n = r.node(i)
        p = r.parents(n)
        d = r.revision(n)
        printb(b"node: %s" % hex(n))
        printb(b"linkrev: %d" % r.linkrev(i))
        printb(b"parents: %s %s" % (hex(p[0]), hex(p[1])))
        printb(b"length: %d" % len(d))
        printb(b"-start-")
        printb(d)
        printb(b"-end-")