# HG changeset patch
# User Matt Mackall
# Date 1446766210 21600
# Node ID a24b98f4e03c5d499f3b07f948d52538164366c8
# Parent d7e83f1064590c1ecb6cde0e42a55311febd73e4
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)

This is the final missing piece in fully round-tripping random byte
strings through UTF-8b. While this issue means that UTF-8 <-> UTF-8b
isn't fully bijective, we don't expect to ever see U+DCxx codepoints
in "real" UTF-8 data, so it should remain bijective in practice.

diff -r d7e83f106459 -r a24b98f4e03c mercurial/encoding.py
--- a/mercurial/encoding.py	Thu Nov 05 17:21:43 2015 -0600
+++ b/mercurial/encoding.py	Thu Nov 05 17:30:10 2015 -0600
@@ -463,14 +463,14 @@
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if isinstance(s, localstr):
-        return s._utf8
-
-    try:
-        s.decode('utf-8')
-        return s
-    except UnicodeDecodeError:
-        pass
+    if "\xed" not in s:
+        if isinstance(s, localstr):
+            return s._utf8
+        try:
+            s.decode('utf-8')
+            return s
+        except UnicodeDecodeError:
+            pass
 
     r = ""
     pos = 0
@@ -478,7 +478,12 @@
     while pos < l:
         try:
             c = getutf8char(s, pos)
-            pos += len(c)
+            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
+                # have to re-escape existing U+DCxx characters
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                pos += 1
+            else:
+                pos += len(c)
         except UnicodeDecodeError:
             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
             pos += 1