encoding: fix toutf8b() to resurrect lossy characters even if it contains "\xed"
If 's' is a localstr, 's._utf8' must be returned to get the original UTF-8
sequence back. Because of this, it was totally wrong to test if '"\xed" not
in s', which should be either '"\xed" not in s._utf8' or just omitted.
This patch moves the localstr handling to the top, since the validity of
's._utf8' should already have been checked by encoding.tolocal().
--- a/mercurial/encoding.py Sun Mar 25 16:47:33 2018 +0900
+++ b/mercurial/encoding.py Sun Apr 22 11:38:53 2018 +0900
@@ -504,11 +504,13 @@
internal surrogate encoding as a UTF-8 string.)
'''
- if not isinstance(s, localstr) and isasciistr(s):
+ if isinstance(s, localstr):
+ # assume that the original UTF-8 sequence would never contain
+ # invalid characters in U+DCxx range
+ return s._utf8
+ elif isasciistr(s):
return s
if "\xed" not in s:
- if isinstance(s, localstr):
- return s._utf8
try:
s.decode('utf-8', _utf8strict)
return s
--- a/tests/test-encoding-func.py Sun Mar 25 16:47:33 2018 +0900
+++ b/tests/test-encoding-func.py Sun Apr 22 11:38:53 2018 +0900
@@ -35,11 +35,32 @@
self.assertTrue(s is encoding.fromlocal(s))
class Utf8bEncodingTest(unittest.TestCase):
+ def setUp(self):
+ self.origencoding = encoding.encoding
+
+ def tearDown(self):
+ encoding.encoding = self.origencoding
+
def testasciifastpath(self):
s = b'\0' * 100
self.assertTrue(s is encoding.toutf8b(s))
self.assertTrue(s is encoding.fromutf8b(s))
+ def testlossylatin(self):
+ encoding.encoding = b'ascii'
+ s = u'\xc0'.encode('utf-8')
+ l = encoding.tolocal(s)
+ self.assertEqual(l, b'?') # lossy
+ self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
+
+ def testlossy0xed(self):
+ encoding.encoding = b'euc-kr' # U+Dxxx Hangul
+ s = u'\ud1bc\xc0'.encode('utf-8')
+ l = encoding.tolocal(s)
+ self.assertIn(b'\xed', l)
+ self.assertTrue(l.endswith(b'?')) # lossy
+ self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
+
if __name__ == '__main__':
import silenttestrunner
silenttestrunner.main(__name__)