Mercurial > hg
changeset 34218:aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
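Encoding or decoding surrogate code points such as U+DCxx is disallowed by default on Python 3, because the default 'strict' error handler rejects them.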
https://docs.python.org/3/library/codecs.html#error-handlers
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sat, 16 Sep 2017 22:55:48 +0900 |
parents | 5307cc57f271 |
children | 21fc747e1bc5 |
files | mercurial/encoding.py mercurial/pure/charencode.py tests/test-doctest.py |
diffstat | 3 files changed, 20 insertions(+), 7 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/encoding.py Sat Sep 16 22:42:19 2017 +0900 +++ b/mercurial/encoding.py Sat Sep 16 22:55:48 2017 +0900 @@ -448,6 +448,13 @@ pass return charencodepure.jsonescapeu8fallback(u8chars, paranoid) +# We need to decode/encode U+DCxx codes transparently since invalid UTF-8 +# bytes are mapped to that range. +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] def getutf8char(s, pos): @@ -464,7 +471,7 @@ c = s[pos:pos + l] # validate with attempted decode - c.decode("utf-8") + c.decode("utf-8", _utf8strict) return c def toutf8b(s): @@ -503,7 +510,7 @@ if isinstance(s, localstr): return s._utf8 try: - s.decode('utf-8') + s.decode('utf-8', _utf8strict) return s except UnicodeDecodeError: pass @@ -517,12 +524,12 @@ c = getutf8char(s, pos) if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": # have to re-escape existing U+DCxx characters - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 else: pos += len(c) except UnicodeDecodeError: - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c return r @@ -570,7 +577,7 @@ pos += len(c) # unescape U+DCxx characters if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": - c = chr(ord(c.decode("utf-8")) & 0xff) + c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) r += c return r
--- a/mercurial/pure/charencode.py Sat Sep 16 22:42:19 2017 +0900 +++ b/mercurial/pure/charencode.py Sat Sep 16 22:55:48 2017 +0900 @@ -64,6 +64,11 @@ except IndexError: raise ValueError +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + def jsonescapeu8fallback(u8chars, paranoid): """Convert a UTF-8 byte string to JSON-escaped form (slow path) @@ -74,6 +79,7 @@ else: jm = _jsonmap # non-BMP char is represented as UTF-16 surrogate pair - u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16')) + u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict) + u16codes = array.array(r'H', u16b) u16codes.pop(0) # drop BOM return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
--- a/tests/test-doctest.py Sat Sep 16 22:42:19 2017 +0900 +++ b/tests/test-doctest.py Sat Sep 16 22:55:48 2017 +0900 @@ -50,7 +50,7 @@ testmod('mercurial.context') testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE) testmod('mercurial.dispatch') -testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues +testmod('mercurial.encoding') testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout testmod('mercurial.hg') testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?