encoding: introduce tagging type for non-lossy non-ASCII string
author Yuya Nishihara <yuya@tcha.org>
Sun, 23 Apr 2017 13:15:30 +0900
changeset 37991 3ea3c96ada54
parent 37990 57b0c7221dba
child 37992 a25513263075
encoding: introduce tagging type for non-lossy non-ASCII string

This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.

Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think this would justify the round-trip behavior of toutf8b().

ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.

(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null

(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)

(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
mercurial/encoding.py
mercurial/templatekw.py
tests/test-command-template.t
tests/test-encoding-func.py
--- a/mercurial/encoding.py	Sun Apr 22 11:38:53 2018 +0900
+++ b/mercurial/encoding.py	Sun Apr 23 13:15:30 2017 +0900
@@ -93,6 +93,16 @@
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -140,7 +150,7 @@
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -149,7 +159,7 @@
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
@@ -402,7 +412,7 @@
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
@@ -495,6 +505,7 @@
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
@@ -508,6 +519,10 @@
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
--- a/mercurial/templatekw.py	Sun Apr 22 11:38:53 2018 +0900
+++ b/mercurial/templatekw.py	Sun Apr 23 13:15:30 2017 +0900
@@ -278,6 +278,8 @@
     if isinstance(s, encoding.localstr):
         # try hard to preserve utf-8 bytes
         return encoding.tolocal(encoding.fromlocal(s).strip())
+    elif isinstance(s, encoding.safelocalstr):
+        return encoding.safelocalstr(s.strip())
     else:
         return s.strip()
 
--- a/tests/test-command-template.t	Sun Apr 22 11:38:53 2018 +0900
+++ b/tests/test-command-template.t	Sun Apr 23 13:15:30 2017 +0900
@@ -4691,6 +4691,13 @@
   $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0
   "non-ascii branch: \u00e9"
 
+json filter should take input as utf-8 if it was converted from utf-8:
+
+  $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0
+  "\u00e9"
+  $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0
+  "non-ascii branch: \u00e9"
+
 json filter takes input as utf-8b:
 
   $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1
--- a/tests/test-encoding-func.py	Sun Apr 22 11:38:53 2018 +0900
+++ b/tests/test-encoding-func.py	Sun Apr 23 13:15:30 2017 +0900
@@ -53,6 +53,13 @@
         self.assertEqual(l, b'?')  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0')  # lossless
+        self.assertEqual(s, encoding.toutf8b(l))  # convert back to utf-8
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@
         self.assertTrue(l.endswith(b'?'))  # lossy
         self.assertEqual(s, encoding.toutf8b(l))  # utf8 sequence preserved
 
+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr'  # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed')  # lossless
+        self.assertEqual(s, encoding.toutf8b(l))  # convert back to utf-8
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)