annotate contrib/dumprevlog @ 50400:95acba2c29f6

encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings Apparently the code uses "+=" with a bytes object, which is linear-time, so the whole encoding is quadratic-time. This patch makes us use a bytearray object, instead, which has a(n amortized-)constant-time append operation. The encoding is still not particularly fast, but at least a 10MB file takes tens of seconds, not many hours to encode.
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Mon, 06 Mar 2023 11:27:57 +0000
parents 6000f5b25c9b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
45830
c102b704edb5 global: use python3 in shebangs
Gregory Szorc <gregory.szorc@gmail.com>
parents: 45055
diff changeset
1 #!/usr/bin/env python3
6433
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
2 # Dump revlogs as raw data stream
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
3 # $ find .hg/store/ -name "*.i" | xargs dumprevlog > repo.dump
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
4
29165
a212ca70205c py3: make contrib/dumprevlog use absolute_import
Pulkit Goyal <7895pulkit@gmail.com>
parents: 14233
diff changeset
5
6433
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
6 import sys
46113
59fa3890d40a node: import symbols explicitly
Joerg Sonnenberger <joerg@bec.de>
parents: 45830
diff changeset
7 from mercurial.node import hex
29165
a212ca70205c py3: make contrib/dumprevlog use absolute_import
Pulkit Goyal <7895pulkit@gmail.com>
parents: 14233
diff changeset
8 from mercurial import (
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
9 encoding,
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
10 pycompat,
29165
a212ca70205c py3: make contrib/dumprevlog use absolute_import
Pulkit Goyal <7895pulkit@gmail.com>
parents: 14233
diff changeset
11 revlog,
37120
a8a902d7176e procutil: bulk-replace function calls to point to new module
Yuya Nishihara <yuya@tcha.org>
parents: 35964
diff changeset
12 )
43659
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
13 from mercurial.utils import procutil
6466
9c426da6b03b contrib: fix binary file issues with dumprevlog on Windows
Adrian Buehlmann <adrian@cadifra.com>
parents: 6433
diff changeset
14
47072
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
15 from mercurial.revlogutils import (
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
16 constants as revlog_constants,
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
17 )
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
18
6466
9c426da6b03b contrib: fix binary file issues with dumprevlog on Windows
Adrian Buehlmann <adrian@cadifra.com>
parents: 6433
diff changeset
19 for fp in (sys.stdin, sys.stdout, sys.stderr):
37120
a8a902d7176e procutil: bulk-replace function calls to point to new module
Yuya Nishihara <yuya@tcha.org>
parents: 35964
diff changeset
20 procutil.setbinary(fp)
6433
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
21
43659
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
22
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
23 def binopen(path, mode=b'rb'):
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
24 if b'b' not in mode:
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
25 mode = mode + b'b'
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
26 return open(path, pycompat.sysstr(mode))
43659
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
27
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
28
43025
3518da504303 vfs: give all vfs an options attribute by default
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 39947
diff changeset
29 binopen.options = {}
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
30
43659
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
31
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
32 def printb(data, end=b'\n'):
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
33 sys.stdout.flush()
45055
4c1b4805db57 pycompat: change users of pycompat.{stdin,stdout,stderr} to use procutil.std*
Manuel Jacob <me@manueljacob.de>
parents: 43659
diff changeset
34 procutil.stdout.write(data + end)
35964
a915465a731e dumprevlog: handle being passed a mode parameter
Boris Feld <boris.feld@octobus.net>
parents: 29166
diff changeset
35
43659
99e231afc29c black: blacken scripts
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43025
diff changeset
36
6433
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
37 for f in sys.argv[1:]:
47150
8d3c2f9d4af7 revlog: use a "radix" to address revlog
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47072
diff changeset
38 localf = encoding.strtolocal(f)
8d3c2f9d4af7 revlog: use a "radix" to address revlog
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47072
diff changeset
39 if not localf.endswith(b'.i'):
8d3c2f9d4af7 revlog: use a "radix" to address revlog
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47072
diff changeset
40 print("file:", f, file=sys.stderr)
47177
5d5abfdc32d8 contrib: fix typo
Raphaël Gomès <rgomes@octobus.net>
parents: 47150
diff changeset
41 print(" invalid filename", file=sys.stderr)
47150
8d3c2f9d4af7 revlog: use a "radix" to address revlog
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47072
diff changeset
42
47072
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
43 r = revlog.revlog(
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
44 binopen,
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
45 target=(revlog_constants.KIND_OTHER, b'dump-revlog'),
47150
8d3c2f9d4af7 revlog: use a "radix" to address revlog
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47072
diff changeset
46 radix=localf[:-2],
47072
4c041c71ec01 revlog: introduce an explicit tracking of what the revlog is about
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 46113
diff changeset
47 )
29166
6359b80f15fb py3: make contrib/dumprevlog use print_function
Pulkit Goyal <7895pulkit@gmail.com>
parents: 29165
diff changeset
48 print("file:", f)
6750
fb42030d79d6 add __len__ and __iter__ methods to repo and revlog
Matt Mackall <mpm@selenic.com>
parents: 6466
diff changeset
49 for i in r:
6433
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
50 n = r.node(i)
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
51 p = r.parents(n)
ec5d77eb3431 add simple dump and undump scripts to contrib/
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
52 d = r.revision(n)
46113
59fa3890d40a node: import symbols explicitly
Joerg Sonnenberger <joerg@bec.de>
parents: 45830
diff changeset
53 printb(b"node: %s" % hex(n))
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
54 printb(b"linkrev: %d" % r.linkrev(i))
46113
59fa3890d40a node: import symbols explicitly
Joerg Sonnenberger <joerg@bec.de>
parents: 45830
diff changeset
55 printb(b"parents: %s %s" % (hex(p[0]), hex(p[1])))
39947
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
56 printb(b"length: %d" % len(d))
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
57 printb(b"-start-")
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
58 printb(d)
a063b84ce064 py3: byteify contrib/dumprevlog
Matt Harbison <matt_harbison@yahoo.com>
parents: 37120
diff changeset
59 printb(b"-end-")