Mercurial > hg
annotate contrib/bdiff-torture.py @ 50400:95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Apparently the code uses "+=" with a bytes object, which is linear-time, so the
whole encoding is quadratic-time. This patch makes us use a bytearray object,
instead, which has a(n amortized-)constant-time append operation.
The encoding is still not particularly fast, but at least a 10MB file
takes tens of seconds, not many hours to encode.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Mon, 06 Mar 2023 11:27:57 +0000 |
parents | 6000f5b25c9b |
children |
rev | line source |
---|---|
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
1 # Randomized torture test generation for bdiff |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
2 |
29209
eccfd6500636
py3: make contrib/bdiff-torture.py conform to our import style
Yuya Nishihara <yuya@tcha.org>
parents:
29012
diff
changeset
|
3 import random |
eccfd6500636
py3: make contrib/bdiff-torture.py conform to our import style
Yuya Nishihara <yuya@tcha.org>
parents:
29012
diff
changeset
|
4 import sys |
eccfd6500636
py3: make contrib/bdiff-torture.py conform to our import style
Yuya Nishihara <yuya@tcha.org>
parents:
29012
diff
changeset
|
5 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
6 from mercurial import ( |
32202
ded48ad55146
bdiff: proxy through mdiff module
Yuya Nishihara <yuya@tcha.org>
parents:
29209
diff
changeset
|
7 mdiff, |
43009
e05c141511dd
contrib: use pycompat.xrange in bdiff-torture.py
Gregory Szorc <gregory.szorc@gmail.com>
parents:
42857
diff
changeset
|
8 pycompat, |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
9 ) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
10 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
11 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
12 def reducetest(a, b): |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
13 tries = 0 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
14 reductions = 0 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
15 print("reducing...") |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
16 while tries < 1000: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
17 a2 = ( |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
18 "\n".join(l for l in a.splitlines() if random.randint(0, 100) > 0) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
19 + "\n" |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
20 ) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
21 b2 = ( |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
22 "\n".join(l for l in b.splitlines() if random.randint(0, 100) > 0) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
23 + "\n" |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
24 ) |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
25 if a2 == a and b2 == b: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
26 continue |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
27 if a2 == b2: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
28 continue |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
29 tries += 1 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
30 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
31 try: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
32 test1(a, b) |
41365
876494fd967d
cleanup: delete lots of unused local variables
Martin von Zweigbergk <martinvonz@google.com>
parents:
32203
diff
changeset
|
33 except Exception: |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
34 reductions += 1 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
35 tries = 0 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
36 a = a2 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
37 b = b2 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
38 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
39 print("reduced:", reductions, len(a) + len(b), repr(a), repr(b)) |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
40 try: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
41 test1(a, b) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
42 except Exception as inst: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
43 print("failed:", inst) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
44 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
45 sys.exit(0) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
46 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
47 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
48 def test1(a, b): |
32202
ded48ad55146
bdiff: proxy through mdiff module
Yuya Nishihara <yuya@tcha.org>
parents:
29209
diff
changeset
|
49 d = mdiff.textdiff(a, b) |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
50 if not d: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
51 raise ValueError("empty") |
32203
0c73634d0570
mpatch: proxy through mdiff module
Yuya Nishihara <yuya@tcha.org>
parents:
32202
diff
changeset
|
52 c = mdiff.patches(a, [d]) |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
53 if c != b: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
54 raise ValueError("bad") |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
55 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
56 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
57 def testwrap(a, b): |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
58 try: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
59 test1(a, b) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
60 return |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
61 except Exception as inst: |
42857
3316e59b0105
bdiff-torture: fix pyflakes warning reporting undefined name 'inst'
Pulkit Goyal <pulkit@yandex-team.ru>
parents:
41365
diff
changeset
|
62 print("exception:", inst) |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
63 reducetest(a, b) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
64 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
65 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
66 def test(a, b): |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
67 testwrap(a, b) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
68 testwrap(b, a) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
69 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
70 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
71 def rndtest(size, noise): |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
72 a = [] |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
73 src = " aaaaaaaabbbbccd" |
43009
e05c141511dd
contrib: use pycompat.xrange in bdiff-torture.py
Gregory Szorc <gregory.szorc@gmail.com>
parents:
42857
diff
changeset
|
74 for x in pycompat.xrange(size): |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
75 a.append(src[random.randint(0, len(src) - 1)]) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
76 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
77 while True: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
78 b = [c for c in a if random.randint(0, 99) > noise] |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
79 b2 = [] |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
80 for c in b: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
81 b2.append(c) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
82 while random.randint(0, 99) < noise: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
83 b2.append(src[random.randint(0, len(src) - 1)]) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
84 if b2 != a: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
85 break |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
86 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
87 a = "\n".join(a) + "\n" |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
88 b = "\n".join(b2) + "\n" |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
89 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
90 test(a, b) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
91 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
43009
diff
changeset
|
92 |
29012
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
93 maxvol = 10000 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
94 startsize = 2 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
95 while True: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
96 size = startsize |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
97 count = 0 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
98 while size < maxvol: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
99 print(size) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
100 volume = 0 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
101 while volume < maxvol: |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
102 rndtest(size, 2) |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
103 volume += size |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
104 count += 2 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
105 size *= 2 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
106 maxvol *= 4 |
4bd67ae7d75a
bdiff: fix latent normalization bug
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
107 startsize *= 4 |