encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings

The code built its result with "+=" on a bytes object. Since bytes is
immutable, each such append copies the entire accumulated buffer, so n
appends copy O(n^2) bytes and the whole encoding is quadratic-time. This
patch makes us use a bytearray object instead, which has an
amortized-constant-time append operation.

The encoding is still not particularly fast, but at least a 10 MB file
now takes tens of seconds, not many hours, to encode.
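
To illustrate the difference outside of Mercurial, here is a minimal,
self-contained sketch (the chunk contents, sizes, and helper names are
made up for the example; they are not from this codebase):

    import time

    def concat_bytes(chunks):
        # bytes is immutable: each += allocates a new object and copies
        # the entire accumulated buffer, so n appends copy O(n^2) bytes.
        r = b""
        for c in chunks:
            r += c
        return r

    def concat_bytearray(chunks):
        # bytearray over-allocates and grows in place, so each += is
        # amortized constant-time and n appends copy O(n) bytes total.
        r = bytearray()
        for c in chunks:
            r += c
        return bytes(r)  # one final copy back to immutable bytes

    chunks = [b"\xed\xb0\x80"] * 50_000  # ~150 kB built from 3-byte chunks
    for fn in (concat_bytes, concat_bytearray):
        t = time.perf_counter()
        fn(chunks)
        print(fn.__name__, time.perf_counter() - t)

On CPython the bytes version should be orders of magnitude slower than
the bytearray version, and the gap widens as the input grows.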
--- a/mercurial/encoding.py Wed Mar 08 11:01:11 2023 +0100
+++ b/mercurial/encoding.py Mon Mar 06 11:27:57 2023 +0000
@@ -657,7 +657,7 @@
             pass
 
     s = pycompat.bytestr(s)
-    r = b""
+    r = bytearray()
     pos = 0
     l = len(s)
     while pos < l:
@@ -673,7 +673,7 @@
             c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
             pos += 1
         r += c
-    return r
+    return bytes(r)
 
 
 def fromutf8b(s):
@@ -712,7 +712,7 @@
     # helper again to walk the string without "decoding" it.
 
     s = pycompat.bytestr(s)
-    r = b""
+    r = bytearray()
     pos = 0
     l = len(s)
     while pos < l:
@@ -722,4 +722,4 @@
         if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
             c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
         r += c
-    return r
+    return bytes(r)
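
Note that bytearray accepts the same "+=" with bytes operands, so the
loop bodies above are untouched; only the initialization and the final
return change. A tiny sanity check of the pattern (illustrative, not
part of the patch):

    r = bytearray()
    r += b"\xc3\xa9"      # an ordinary UTF-8 sequence
    r += b"\xed\xb0\x80"  # a U+DCxx surrogate-escape sequence, as in toutf8b
    assert bytes(r) == b"\xc3\xa9\xed\xb0\x80"  # callers still receive bytes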