Mercurial > hg
changeset 26878:d7e83f106459
encoding: use getutf8char in toutf8b
This correctly avoids the ambiguity of U+FFFD already present in the
input and similar confusion by working a character at a time.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Thu, 05 Nov 2015 17:21:43 -0600 |
parents | cb467a9d7593 |
children | a24b98f4e03c |
files | mercurial/encoding.py |
diffstat | 1 files changed, 14 insertions(+), 11 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/mercurial/encoding.py Thu Nov 05 17:11:50 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 17:21:43 2015 -0600
@@ -470,17 +470,20 @@
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
+        pass
+
+    r = ""
+    pos = 0
+    l = len(s)
+    while pos < l:
+        try:
+            c = getutf8char(s, pos)
+            pos += len(c)
+        except UnicodeDecodeError:
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            pos += 1
+        r += c
+    return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.
```