encoding: use getutf8char in toutf8b
author Matt Mackall <mpm@selenic.com>
Thu, 05 Nov 2015 17:21:43 -0600
changeset 26878 d7e83f106459
parent 26877 cb467a9d7593
child 26879 a24b98f4e03c
encoding: use getutf8char in toutf8b. This correctly avoids the ambiguity of U+FFFD already present in the input, and similar confusion, by working a character at a time.
mercurial/encoding.py
--- a/mercurial/encoding.py	Thu Nov 05 17:11:50 2015 -0600
+++ b/mercurial/encoding.py	Thu Nov 05 17:21:43 2015 -0600
@@ -470,17 +470,20 @@
         s.decode('utf-8')
         return s
     except UnicodeDecodeError:
-        # surrogate-encode any characters that don't round-trip
-        s2 = s.decode('utf-8', 'ignore').encode('utf-8')
-        r = ""
-        pos = 0
-        for c in s:
-            if s2[pos:pos + 1] == c:
-                r += c
-                pos += 1
-            else:
-                r += unichr(0xdc00 + ord(c)).encode('utf-8')
-        return r
+        pass
+
+    r = ""
+    pos = 0
+    l = len(s)
+    while pos < l:
+        try:
+            c = getutf8char(s, pos)
+            pos += len(c)
+        except UnicodeDecodeError:
+            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8')
+            pos += 1
+        r += c
+    return r
 
 def fromutf8b(s):
     '''Given a UTF-8b string, return a local, possibly-binary string.