--- a/mercurial/encoding.py Sun Jul 01 23:36:53 2018 +0900
+++ b/mercurial/encoding.py Thu Jul 19 13:55:54 2018 -0400
@@ -98,6 +98,16 @@
def __hash__(self):
return hash(self._utf8) # avoid collisions in local string space
+class safelocalstr(bytes):
+ """Tagged string denoting it was previously an internal UTF-8 string,
+ and can be converted back to UTF-8 losslessly
+
+ >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+ >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+ >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+ >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+ """
+
def tolocal(s):
"""
Convert a string from internal UTF-8 to local encoding
@@ -145,7 +155,7 @@
r = u.encode(_sysstr(encoding), u"replace")
if u == r.decode(_sysstr(encoding)):
# r is a safe, non-lossy encoding of s
- return r
+ return safelocalstr(r)
return localstr(s, r)
except UnicodeDecodeError:
# we should only get here if we're looking at an ancient changeset
@@ -154,7 +164,7 @@
r = u.encode(_sysstr(encoding), u"replace")
if u == r.decode(_sysstr(encoding)):
# r is a safe, non-lossy encoding of s
- return r
+ return safelocalstr(r)
return localstr(u.encode('UTF-8'), r)
except UnicodeDecodeError:
u = s.decode("utf-8", "replace") # last ditch
@@ -407,7 +417,7 @@
JSON is problematic for us because it doesn't support non-Unicode
bytes. To deal with this, we take the following approach:
- - localstr objects are converted back to UTF-8
+ - localstr/safelocalstr objects are converted back to UTF-8
- valid UTF-8/ASCII strings are passed as-is
- other strings are converted to UTF-8b surrogate encoding
- apply JSON-specified string escaping
@@ -500,6 +510,7 @@
- local strings that have a cached known UTF-8 encoding (aka
localstr) get sent as UTF-8 so Unicode-oriented clients get the
Unicode data they want
+ - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
- because we must preserve UTF-8 bytestring in places such as
filenames, metadata can't be roundtripped without help
@@ -509,11 +520,17 @@
internal surrogate encoding as a UTF-8 string.)
'''
- if not isinstance(s, localstr) and isasciistr(s):
+ if isinstance(s, localstr):
+ # assume that the original UTF-8 sequence would never contain
+ # invalid characters in U+DCxx range
+ return s._utf8
+ elif isinstance(s, safelocalstr):
+ # already verified that s is non-lossy in legacy encoding, which
+ # shouldn't contain characters in U+DCxx range
+ return fromlocal(s)
+ elif isasciistr(s):
return s
if "\xed" not in s:
- if isinstance(s, localstr):
- return s._utf8
try:
s.decode('utf-8', _utf8strict)
return s