encoding: use getutf8char in toutf8b
Working one character at a time correctly avoids the ambiguity that arises
when U+FFFD is already present in the input, as well as similar confusion.
--- a/mercurial/encoding.py Thu Nov 05 17:11:50 2015 -0600
+++ b/mercurial/encoding.py Thu Nov 05 17:21:43 2015 -0600
@@ -470,17 +470,20 @@
s.decode('utf-8')
return s
except UnicodeDecodeError:
- # surrogate-encode any characters that don't round-trip
- s2 = s.decode('utf-8', 'ignore').encode('utf-8')
- r = ""
- pos = 0
- for c in s:
- if s2[pos:pos + 1] == c:
- r += c
- pos += 1
- else:
- r += unichr(0xdc00 + ord(c)).encode('utf-8')
- return r
+ pass
+
+ r = ""
+ pos = 0
+ l = len(s)
+ while pos < l:
+ try:
+ c = getutf8char(s, pos)
+ pos += len(c)
+ except UnicodeDecodeError:
+ c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+ pos += 1
+ r += c
+ return r
def fromutf8b(s):
'''Given a UTF-8b string, return a local, possibly-binary string.