Mercurial > hg
changeset 26879:a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
This is the final missing piece in fully round-tripping random byte
strings through UTF-8b. While this issue means that UTF-8 <-> UTF-8b
isn't fully bijective, we don't expect to ever see U+DCxx codepoints
in "real" UTF-8 data, so it should remain bijective in practice.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 17:30:10 -0600 |
parents | d7e83f106459 |
children | 8b2fbe3f59b1 |
files | mercurial/encoding.py |
diffstat | 1 files changed, 14 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/mercurial/encoding.py Thu Nov 05 17:21:43 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 17:30:10 2015 -0600
@@ -463,14 +463,14 @@
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if isinstance(s, localstr):
-        return s._utf8
-
-    try:
-        s.decode('utf-8')
-        return s
-    except UnicodeDecodeError:
-        pass
+    if "\xed" not in s:
+        if isinstance(s, localstr):
+            return s._utf8
+        try:
+            s.decode('utf-8')
+            return s
+        except UnicodeDecodeError:
+            pass
 
     r = ""
     pos = 0
@@ -478,7 +478,12 @@
     while pos < l:
         try:
             c = getutf8char(s, pos)
-            pos += len(c)
+            if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
+                # have to re-escape existing U+DCxx characters
+                c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+                pos += 1
+            else:
+                pos += len(c)
         except UnicodeDecodeError:
             c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
             pos += 1
```