diff tests/test-encoding-func.py @ 37947:3ea3c96ada54

encoding: introduce tagging type for non-lossy non-ASCII string

This fixes the weird behavior of toutf8b(), which would convert a local string back to UTF-8 *only if* it was lossy in the system encoding.

Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded losslessly (issue2763)", all local strings were wrapped by the localstr class. I think this would justify the round-trip behavior of toutf8b().

ASCII strings are special-cased, so the cost of wrapping with safelocalstr is negligible.

(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null

(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)

(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sun, 23 Apr 2017 13:15:30 +0900
parents 57b0c7221dba
children 2372284d9457
line wrap: on
line diff
--- a/tests/test-encoding-func.py	Sun Apr 22 11:38:53 2018 +0900
+++ b/tests/test-encoding-func.py	Sun Apr 23 13:15:30 2017 +0900
@@ -53,6 +53,13 @@
         self.assertEqual(l, b'?')  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'  # local encoding under test
+        s = u'\xc0'.encode('utf-8')  # U+00C0 as UTF-8 bytes
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0')  # lossless: single latin-1 byte
+        self.assertEqual(s, encoding.toutf8b(l))  # lossless also round-trips
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@
         self.assertTrue(l.endswith(b'?'))  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')  # U+D1BC; UTF-8 lead byte is 0xED
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed')  # lossless (no b'?' substitution)
+        self.assertEqual(s, encoding.toutf8b(l))  # lossless also round-trips
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)