py3: use 'surrogatepass' error handler to process U+DCxx transparently
Python 3's default 'strict' handler refuses to encode or decode lone surrogates.
https://docs.python.org/3/library/codecs.html#error-handlers
--- a/mercurial/encoding.py Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/encoding.py Sat Sep 16 22:55:48 2017 +0900
@@ -448,6 +448,13 @@
pass
return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+# bytes are mapped to that range.
+if pycompat.ispy3:
+ _utf8strict = r'surrogatepass'
+else:
+ _utf8strict = r'strict'
+
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
def getutf8char(s, pos):
@@ -464,7 +471,7 @@
c = s[pos:pos + l]
# validate with attempted decode
- c.decode("utf-8")
+ c.decode("utf-8", _utf8strict)
return c
def toutf8b(s):
@@ -503,7 +510,7 @@
if isinstance(s, localstr):
return s._utf8
try:
- s.decode('utf-8')
+ s.decode('utf-8', _utf8strict)
return s
except UnicodeDecodeError:
pass
@@ -517,12 +524,12 @@
c = getutf8char(s, pos)
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
# have to re-escape existing U+DCxx characters
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
else:
pos += len(c)
except UnicodeDecodeError:
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
r += c
return r
@@ -570,7 +577,7 @@
pos += len(c)
# unescape U+DCxx characters
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
- c = chr(ord(c.decode("utf-8")) & 0xff)
+ c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
r += c
return r
--- a/mercurial/pure/charencode.py Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/pure/charencode.py Sat Sep 16 22:55:48 2017 +0900
@@ -64,6 +64,11 @@
except IndexError:
raise ValueError
+if pycompat.ispy3:
+ _utf8strict = r'surrogatepass'
+else:
+ _utf8strict = r'strict'
+
def jsonescapeu8fallback(u8chars, paranoid):
"""Convert a UTF-8 byte string to JSON-escaped form (slow path)
@@ -74,6 +79,7 @@
else:
jm = _jsonmap
# non-BMP char is represented as UTF-16 surrogate pair
- u16codes = array.array(r'H', u8chars.decode('utf-8').encode('utf-16'))
+ u16b = u8chars.decode('utf-8', _utf8strict).encode('utf-16', _utf8strict)
+ u16codes = array.array(r'H', u16b)
u16codes.pop(0) # drop BOM
return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
--- a/tests/test-doctest.py Sat Sep 16 22:42:19 2017 +0900
+++ b/tests/test-doctest.py Sat Sep 16 22:55:48 2017 +0900
@@ -50,7 +50,7 @@
testmod('mercurial.context')
testmod('mercurial.dagparser', optionflags=doctest.NORMALIZE_WHITESPACE)
testmod('mercurial.dispatch')
-testmod('mercurial.encoding', py3=False) # py3: multiple encoding issues
+testmod('mercurial.encoding')
testmod('mercurial.formatter', py3=False) # py3: write bytes to stdout
testmod('mercurial.hg')
testmod('mercurial.hgweb.hgwebdir_mod', py3=False) # py3: repr(bytes) ?