--- a/mercurial/encoding.py Sat Sep 16 22:42:19 2017 +0900
+++ b/mercurial/encoding.py Sat Sep 16 22:55:48 2017 +0900
@@ -448,6 +448,13 @@
pass
return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
+# bytes are mapped to that range; Python 3 needs 'surrogatepass' for that.
+if pycompat.ispy3:
+ _utf8strict = r'surrogatepass'
+else:
+ _utf8strict = r'strict'
+
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
def getutf8char(s, pos):
@@ -464,7 +471,7 @@
c = s[pos:pos + l]
# validate with attempted decode
- c.decode("utf-8")
+ c.decode("utf-8", _utf8strict)
return c
def toutf8b(s):
@@ -503,7 +510,7 @@
if isinstance(s, localstr):
return s._utf8
try:
- s.decode('utf-8')
+ s.decode('utf-8', _utf8strict)
return s
except UnicodeDecodeError:
pass
@@ -517,12 +524,12 @@
c = getutf8char(s, pos)
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
# have to re-escape existing U+DCxx characters
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
else:
pos += len(c)
except UnicodeDecodeError:
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
r += c
return r
@@ -570,7 +577,7 @@
pos += len(c)
# unescape U+DCxx characters
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
- c = chr(ord(c.decode("utf-8")) & 0xff)
+ c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
r += c
return r