encoding: add localstr class to track UTF-8 version of transcoded strings
This allows UTF-8 strings to losslessly round-trip through Mercurial
--- a/mercurial/encoding.py Fri Nov 26 20:08:03 2010 -0600
+++ b/mercurial/encoding.py Wed Nov 24 15:38:52 2010 -0600
@@ -48,6 +48,16 @@
encodingmode = os.environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
+class localstr(str):
+ '''A str whose value is the local-encoding text (l) and which also keeps
+ the UTF-8 bytes (u) it was made from, so unmodified strings round-trip'''
+ def __new__(cls, u, l):
+ s = str.__new__(cls, l) # the visible string value is the local form
+ s._utf8 = u # original UTF-8, returned as-is by fromlocal()
+ return s
+ def __hash__(self):
+ return hash(self._utf8) # avoid collisions in local string space
+
def tolocal(s):
"""
Convert a string from internal UTF-8 to local encoding
@@ -57,17 +67,45 @@
other character sets. We attempt to decode everything strictly
using UTF-8, then Latin-1, and failing that, we use UTF-8 and
replace unknown characters.
+
+ The localstr class is used to cache the known UTF-8 encoding of
+ strings next to their local representation to allow lossless
+ round-trip conversion back to UTF-8.
+
+ >>> u = 'foo: \\xc3\\xa4' # utf-8
+ >>> l = tolocal(u)
+ >>> l
+ 'foo: ?'
+ >>> fromlocal(l)
+ 'foo: \\xc3\\xa4'
+ >>> u2 = 'foo: \\xc3\\xa1'
+ >>> d = { l: 1, tolocal(u2): 2 }
+ >>> d # no collision
+ {'foo: ?': 1, 'foo: ?': 2}
+ >>> 'foo: ?' in d
+ False
+ >>> l1 = 'foo: \\xe4' # historical latin1 fallback
+ >>> l = tolocal(l1)
+ >>> l
+ 'foo: ?'
+ >>> fromlocal(l) # magically in utf-8
+ 'foo: \\xc3\\xa4'
"""
+
for e in ('UTF-8', fallbackencoding):
try:
u = s.decode(e) # attempt strict decoding
- return u.encode(encoding, "replace")
+ if e == 'UTF-8': # s is already valid UTF-8: keep the original bytes
+ return localstr(s, u.encode(encoding, "replace"))
+ else: # decoded via the fallback charset: transcode to UTF-8
+ return localstr(u.encode('UTF-8'),
+ u.encode(encoding, "replace"))
except LookupError, k:
raise error.Abort("%s, please check your locale settings" % k)
except UnicodeDecodeError:
pass
u = s.decode("utf-8", "replace") # last ditch
- return u.encode(encoding, "replace")
+ return u.encode(encoding, "replace") # can't round-trip
def fromlocal(s):
"""
@@ -79,6 +117,11 @@
'replace', which replaces unknown characters with a special
Unicode character, and 'ignore', which drops the character.
"""
+
+ # can we do a lossless round-trip?
+ if isinstance(s, localstr):
+ return s._utf8
+
try:
return s.decode(encoding, encodingmode).encode("utf-8")
except UnicodeDecodeError, inst:
--- a/tests/test-doctest.py Fri Nov 26 20:08:03 2010 -0600
+++ b/tests/test-doctest.py Wed Nov 24 15:38:52 2010 -0600
@@ -13,8 +13,8 @@
import mercurial.match
doctest.testmod(mercurial.match)
-import mercurial.url
-doctest.testmod(mercurial.url)
+import mercurial.encoding
+doctest.testmod(mercurial.encoding)
import hgext.convert.cvsps
doctest.testmod(hgext.convert.cvsps)