changeset 13046:7cc4263e07a9

encoding: add localstr class to track UTF-8 version of transcoded strings. This allows UTF-8 strings to losslessly round-trip through Mercurial.
author Matt Mackall <mpm@selenic.com>
date Wed, 24 Nov 2010 15:38:52 -0600
parents 1b1cbc246377
children 6c375e07d673
files mercurial/encoding.py tests/test-doctest.py
diffstat 2 files changed, 47 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/encoding.py	Fri Nov 26 20:08:03 2010 -0600
+++ b/mercurial/encoding.py	Wed Nov 24 15:38:52 2010 -0600
@@ -48,6 +48,16 @@
 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
 fallbackencoding = 'ISO-8859-1'
 
+class localstr(str):
+    '''A str holding the local-encoding form of a string that also keeps
+    the UTF-8 form it was built from (in _utf8) for lossless round-trips'''
+    def __new__(cls, u, l): # u: UTF-8 form, l: local-encoding form
+        s = str.__new__(cls, l) # the str value itself is the local form
+        s._utf8 = u # read back by fromlocal() instead of re-decoding
+        return s
+    def __hash__(self):
+        return hash(self._utf8) # avoid collisions in local string space
+
 def tolocal(s):
     """
     Convert a string from internal UTF-8 to local encoding
@@ -57,17 +67,45 @@
     other character sets. We attempt to decode everything strictly
     using UTF-8, then Latin-1, and failing that, we use UTF-8 and
     replace unknown characters.
+
+    The localstr class is used to cache the known UTF-8 encoding of
+    strings next to their local representation to allow lossless
+    round-trip conversion back to UTF-8.
+
+    >>> u = 'foo: \\xc3\\xa4' # utf-8
+    >>> l = tolocal(u)
+    >>> l
+    'foo: ?'
+    >>> fromlocal(l)
+    'foo: \\xc3\\xa4'
+    >>> u2 = 'foo: \\xc3\\xa1'
+    >>> d = { l: 1, tolocal(u2): 2 }
+    >>> d # no collision
+    {'foo: ?': 1, 'foo: ?': 2}
+    >>> 'foo: ?' in d
+    False
+    >>> l1 = 'foo: \\xe4' # historical latin1 fallback
+    >>> l = tolocal(l1)
+    >>> l
+    'foo: ?'
+    >>> fromlocal(l) # magically in utf-8
+    'foo: \\xc3\\xa4'
     """
+
     for e in ('UTF-8', fallbackencoding):
         try:
             u = s.decode(e) # attempt strict decoding
-            return u.encode(encoding, "replace")
+            if e == 'UTF-8': # s is already valid UTF-8: cache it verbatim
+                return localstr(s, u.encode(encoding, "replace"))
+            else:
+                return localstr(u.encode('UTF-8'),
+                                u.encode(encoding, "replace"))
         except LookupError, k:
             raise error.Abort("%s, please check your locale settings" % k)
         except UnicodeDecodeError:
             pass
     u = s.decode("utf-8", "replace") # last ditch
-    return u.encode(encoding, "replace")
+    return u.encode(encoding, "replace") # can't round-trip
 
 def fromlocal(s):
     """
@@ -79,6 +117,11 @@
     'replace', which replaces unknown characters with a special
     Unicode character, and 'ignore', which drops the character.
     """
+
+    # can we do a lossless round-trip?
+    if isinstance(s, localstr):
+        return s._utf8
+
     try:
         return s.decode(encoding, encodingmode).encode("utf-8")
     except UnicodeDecodeError, inst:
--- a/tests/test-doctest.py	Fri Nov 26 20:08:03 2010 -0600
+++ b/tests/test-doctest.py	Wed Nov 24 15:38:52 2010 -0600
@@ -13,8 +13,8 @@
 import mercurial.match
 doctest.testmod(mercurial.match)
 
-import mercurial.url
-doctest.testmod(mercurial.url)
+import mercurial.encoding
+doctest.testmod(mercurial.encoding)
 
 import hgext.convert.cvsps
 doctest.testmod(hgext.convert.cvsps)