# HG changeset patch # User Arseniy Alekseyev # Date 1678102077 0 # Node ID 95acba2c29f6e90a7c19da0585a7b52efb416082 # Parent bcf54837241d2efa0d4e0b727d7fe9b6433e3448 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings Apparently the code uses "+=" with a bytes object, which is linear-time, so the whole encoding is quadratic-time. This patch makes us use a bytearray object, instead, which has a(n amortized-)constant-time append operation. The encoding is still not particularly fast, but at least a 10MB file takes tens of seconds, not many hours to encode. diff -r bcf54837241d -r 95acba2c29f6 mercurial/encoding.py --- a/mercurial/encoding.py Wed Mar 08 11:01:11 2023 +0100 +++ b/mercurial/encoding.py Mon Mar 06 11:27:57 2023 +0000 @@ -657,7 +657,7 @@ pass s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -673,7 +673,7 @@ c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c - return r + return bytes(r) def fromutf8b(s): @@ -712,7 +712,7 @@ # helper again to walk the string without "decoding" it. s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -722,4 +722,4 @@ if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) r += c - return r + return bytes(r)