encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
This is the final missing piece in fully round-tripping random byte
strings through UTF-8b. While this issue means that UTF-8 <-> UTF-8b
isn't fully bijective, we don't expect to ever see U+DCxx codepoints
in "real" UTF-8 data, so it should remain bijective in practice.
--- a/mercurial/encoding.py Thu Nov 05 17:21:43 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 17:30:10 2015 -0600
@@ -463,14 +463,14 @@
internal surrogate encoding as a UTF-8 string.)
'''
- if isinstance(s, localstr):
- return s._utf8
-
- try:
- s.decode('utf-8')
- return s
- except UnicodeDecodeError:
- pass
+ if "\xed" not in s:
+ if isinstance(s, localstr):
+ return s._utf8
+ try:
+ s.decode('utf-8')
+ return s
+ except UnicodeDecodeError:
+ pass
r = ""
pos = 0
@@ -478,7 +478,12 @@
while pos < l:
try:
c = getutf8char(s, pos)
- pos += len(c)
+ if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
+ # have to re-escape existing U+DCxx characters
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ pos += 1
+ else:
+ pos += len(c)
except UnicodeDecodeError:
c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
pos += 1