encoding: introduce tagging type for non-lossy non-ASCII string
This fixes the weird behavior of toutf8b(), which would convert a local
string back to UTF-8 *only if* it was lossy in the system encoding.
Before b7b26e54e37a "encoding: avoid localstr when a string can be encoded
losslessly (issue2763)", all local strings were wrapped by the localstr
class. I think this would justify the round-trip behavior of toutf8b().
ASCII strings are special-cased, so the cost of wrapping with safelocalstr
is negligible.
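
As a sanity check, a minimal sketch of the round trip this enables (it
mirrors the new testlosslesslatin case below; assumes a Mercurial checkout
on PYTHONPATH):

  from mercurial import encoding

  encoding.encoding = b'latin-1'    # as if HGENCODING=latin-1
  s = u'\xc0'.encode('utf-8')       # non-ASCII, but representable in latin-1
  l = encoding.tolocal(s)           # tagged as safelocalstr, equal to b'\xc0'
  assert encoding.toutf8b(l) == s   # before this patch, the original UTF-8
                                    # could not be recovered here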
(with mercurial repo)
$ export HGRCPATH=/dev/null HGPLAIN= HGENCODING=latin-1
$ hg log --time --config experimental.evolution=all > /dev/null
(original)
time: real 11.340 secs (user 11.290+0.000 sys 0.060+0.000)
time: real 11.390 secs (user 11.300+0.000 sys 0.080+0.000)
time: real 11.430 secs (user 11.360+0.000 sys 0.070+0.000)
(this patch)
time: real 11.200 secs (user 11.100+0.000 sys 0.100+0.000)
time: real 11.370 secs (user 11.300+0.000 sys 0.070+0.000)
time: real 11.190 secs (user 11.130+0.000 sys 0.060+0.000)
--- a/mercurial/encoding.py Sun Apr 22 11:38:53 2018 +0900
+++ b/mercurial/encoding.py Sun Apr 23 13:15:30 2017 +0900
@@ -93,6 +93,16 @@
     def __hash__(self):
         return hash(self._utf8) # avoid collisions in local string space
 
+class safelocalstr(bytes):
+    """Tagged string denoting it was previously an internal UTF-8 string,
+    and can be converted back to UTF-8 losslessly
+
+    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
+    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
+    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
+    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
+    """
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -140,7 +150,7 @@
             r = u.encode(_sysstr(encoding), u"replace")
             if u == r.decode(_sysstr(encoding)):
                 # r is a safe, non-lossy encoding of s
-                return r
+                return safelocalstr(r)
             return localstr(s, r)
         except UnicodeDecodeError:
             # we should only get here if we're looking at an ancient changeset
@@ -149,7 +159,7 @@
                 r = u.encode(_sysstr(encoding), u"replace")
                 if u == r.decode(_sysstr(encoding)):
                     # r is a safe, non-lossy encoding of s
-                    return r
+                    return safelocalstr(r)
                 return localstr(u.encode('UTF-8'), r)
             except UnicodeDecodeError:
                 u = s.decode("utf-8", "replace") # last ditch
@@ -402,7 +412,7 @@
     JSON is problematic for us because it doesn't support non-Unicode
     bytes. To deal with this, we take the following approach:
 
-    - localstr objects are converted back to UTF-8
+    - localstr/safelocalstr objects are converted back to UTF-8
     - valid UTF-8/ASCII strings are passed as-is
     - other strings are converted to UTF-8b surrogate encoding
     - apply JSON-specified string escaping
@@ -495,6 +505,7 @@
     - local strings that have a cached known UTF-8 encoding (aka
       localstr) get sent as UTF-8 so Unicode-oriented clients get the
       Unicode data they want
+    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
     - because we must preserve UTF-8 bytestring in places such as
       filenames, metadata can't be roundtripped without help
 
@@ -508,6 +519,10 @@
         # assume that the original UTF-8 sequence would never contain
         # invalid characters in U+DCxx range
         return s._utf8
+    elif isinstance(s, safelocalstr):
+        # already verified that s is non-lossy in legacy encoding, which
+        # shouldn't contain characters in U+DCxx range
+        return fromlocal(s)
     elif isasciistr(s):
         return s
     if "\xed" not in s:
--- a/mercurial/templatekw.py Sun Apr 22 11:38:53 2018 +0900
+++ b/mercurial/templatekw.py Sun Apr 23 13:15:30 2017 +0900
@@ -278,6 +278,8 @@
     if isinstance(s, encoding.localstr):
         # try hard to preserve utf-8 bytes
         return encoding.tolocal(encoding.fromlocal(s).strip())
+    elif isinstance(s, encoding.safelocalstr):
+        return encoding.safelocalstr(s.strip())
     else:
         return s.strip()
 
--- a/tests/test-command-template.t Sun Apr 22 11:38:53 2018 +0900
+++ b/tests/test-command-template.t Sun Apr 23 13:15:30 2017 +0900
@@ -4691,6 +4691,13 @@
   $ HGENCODING=ascii hg log -T "{desc|json}\n" -r0
   "non-ascii branch: \u00e9"
 
+json filter should take input as utf-8 if it was converted from utf-8:
+
+  $ HGENCODING=latin-1 hg log -T "{branch|json}\n" -r0
+  "\u00e9"
+  $ HGENCODING=latin-1 hg log -T "{desc|json}\n" -r0
+  "non-ascii branch: \u00e9"
+
 json filter takes input as utf-8b:
 
   $ HGENCODING=ascii hg log -T "{'`cat utf-8`'|json}\n" -l1
--- a/tests/test-encoding-func.py Sun Apr 22 11:38:53 2018 +0900
+++ b/tests/test-encoding-func.py Sun Apr 23 13:15:30 2017 +0900
@@ -53,6 +53,13 @@
         self.assertEqual(l, b'?') # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
 
+    def testlosslesslatin(self):
+        encoding.encoding = b'latin-1'
+        s = u'\xc0'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc0') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
     def testlossy0xed(self):
         encoding.encoding = b'euc-kr' # U+Dxxx Hangul
         s = u'\ud1bc\xc0'.encode('utf-8')
@@ -61,6 +68,13 @@
         self.assertTrue(l.endswith(b'?')) # lossy
         self.assertEqual(s, encoding.toutf8b(l)) # utf8 sequence preserved
 
+    def testlossless0xed(self):
+        encoding.encoding = b'euc-kr' # U+Dxxx Hangul
+        s = u'\ud1bc'.encode('utf-8')
+        l = encoding.tolocal(s)
+        self.assertEqual(l, b'\xc5\xed') # lossless
+        self.assertEqual(s, encoding.toutf8b(l)) # convert back to utf-8
+
 if __name__ == '__main__':
     import silenttestrunner
     silenttestrunner.main(__name__)
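
For readers outside the Mercurial tree, the tagging-type pattern itself can
be sketched standalone (hypothetical names, not mercurial.encoding): a bytes
subclass carries no extra state, so it hashes and compares like plain bytes,
while an isinstance() check lets the UTF-8 conversion path recover that the
value is known to round-trip losslessly.

  class SafeLocalStr(bytes):
      """Bytes known to convert back to UTF-8 losslessly."""

  def to_local(s_utf8, local='latin-1'):
      # sketch of tolocal(): tag the result only when re-encoding is non-lossy
      u = s_utf8.decode('utf-8')
      r = u.encode(local, 'replace')
      return SafeLocalStr(r) if u == r.decode(local) else r

  def to_utf8(s, local='latin-1'):
      # sketch of toutf8b()'s new branch: tagged strings convert back losslessly
      if isinstance(s, SafeLocalStr):
          return s.decode(local).encode('utf-8')
      return s  # real code falls through to UTF-8b surrogate handling

  l = to_local(u'\xe9'.encode('utf-8'))
  assert l == b'\xe9' and isinstance(l, SafeLocalStr)
  assert to_utf8(l) == u'\xe9'.encode('utf-8')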