# HG changeset patch # User Yuya Nishihara # Date 1505570148 -32400 # Node ID aa877860d4d7af411b7d6652aa27a6e2a2e1a19f # Parent 5307cc57f2719e7cd082a93dddcdbdde04f23ccf py3: use 'surrogatepass' error handler to process U+DCxx transparently It's disallowed by default on Python 3. https://docs.python.org/3/library/codecs.html#error-handlers diff -r 5307cc57f271 -r aa877860d4d7 mercurial/encoding.py --- a/mercurial/encoding.py Sat Sep 16 22:42:19 2017 +0900 +++ b/mercurial/encoding.py Sat Sep 16 22:55:48 2017 +0900 @@ -448,6 +448,13 @@ pass return charencodepure.jsonescapeu8fallback(u8chars, paranoid) +# We need to decode/encode U+DCxx codes transparently since invalid UTF-8 +# bytes are mapped to that range. +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] def getutf8char(s, pos): @@ -464,7 +471,7 @@ c = s[pos:pos + l] # validate with attempted decode - c.decode("utf-8") + c.decode("utf-8", _utf8strict) return c def toutf8b(s): @@ -503,7 +510,7 @@ if isinstance(s, localstr): return s._utf8 try: - s.decode('utf-8') + s.decode('utf-8', _utf8strict) return s except UnicodeDecodeError: pass @@ -517,12 +524,12 @@ c = getutf8char(s, pos) if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": # have to re-escape existing U+DCxx characters - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 else: pos += len(c) except UnicodeDecodeError: - c = unichr(0xdc00 + ord(s[pos])).encode('utf-8') + c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) pos += 1 r += c return r @@ -570,7 +577,7 @@ pos += len(c) # unescape U+DCxx characters if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": - c = chr(ord(c.decode("utf-8")) & 0xff) + c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) r += c return r diff -r 5307cc57f271 -r aa877860d4d7 mercurial/pure/charencode.py --- a/mercurial/pure/charencode.py Sat Sep 16 22:42:19 2017 +0900 +++ b/mercurial/pure/charencode.py Sat Sep 16 22:55:48 2017 +0900 @@ -64,6 +64,11 @@ except IndexError: raise ValueError +if pycompat.ispy3: + _utf8strict = r'surrogatepass' +else: + _utf8strict = r'strict' + def jsonescapeu8fallback(u8chars, paranoid): """Convert a UTF-8 byte string to JSON-escaped form (slow path) @@ -74,6 +79,7 @@ else: jm = _jsonmap # non-BMP char is represented as UTF-16 surrogate pair - u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16')) + u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict) + u16codes = array.array(r'H', u16b) u16codes.pop(0) # drop BOM return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) diff -r 5307cc57f271 -r aa877860d4d7 tests/test-doctest.py --- a/tests/test-doctest.py Sat Sep 16 22:42:19 2017 +0900 +++ b/tests/test-doctest.py Sat Sep 16 22:55:48 2017 +0900 @@ -50,7 +50,7 @@ testmod('mercurial.context') testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE) testmod('mercurial.dispatch') -testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues +testmod('mercurial.encoding') testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout testmod('mercurial.hg') testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?