mercurial/encoding.py
branchstable
changeset 38739 7acec9408e1c
parent 38615 443029011990
parent 37947 3ea3c96ada54
child 38783 e7aa113b14f7
--- a/mercurial/encoding.py	Sun Jul 01 23:36:53 2018 +0900
+++ b/mercurial/encoding.py	Thu Jul 19 13:55:54 2018 -0400
@@ -98,6 +98,16 @@
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -145,7 +155,7 @@
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -154,7 +164,7 @@
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
@@ -407,7 +417,7 @@
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
@@ -500,6 +510,7 @@
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
@@ -509,11 +520,17 @@
     internal surrogate encoding as a UTF-8 string.)
     '''
 
-    if not isinstance(s, localstr) and isasciistr(s):
+    if isinstance(s, localstr):
+        # assume that the original UTF-8 sequence would never contain
+        # invalid characters in U+DCxx range
+        return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
+    elif isasciistr(s):
         return s
     if "\xed" not in s:
-        if isinstance(s, localstr):
-            return s._utf8
         try:
             s.decode('utf-8', _utf8strict)
             return s