Mercurial > hg
changeset 50400:95acba2c29f6
encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Apparently the code uses "+=" with a bytes object, which is linear-time, so the
whole encoding is quadratic-time. This patch makes us use a bytearray object,
instead, which has a(n amortized-)constant-time append operation.
The encoding is still not particularly fast, but at least a 10MB file
takes tens of seconds, not many hours to encode.
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Mon, 06 Mar 2023 11:27:57 +0000 |
parents | bcf54837241d |
children | dd42156b6441 |
files | mercurial/encoding.py |
diffstat | 1 files changed, 4 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/encoding.py Wed Mar 08 11:01:11 2023 +0100 +++ b/mercurial/encoding.py Mon Mar 06 11:27:57 2023 +0000 @@ -657,7 +657,7 @@ pass s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -673,7 +673,7 @@ c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c - return r + return bytes(r) def fromutf8b(s): @@ -712,7 +712,7 @@ # helper again to walk the string without "decoding" it. s = pycompat.bytestr(s) - r = b"" + r = bytearray() pos = 0 l = len(s) while pos < l: @@ -722,4 +722,4 @@ if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) r += c - return r + return bytes(r)