Mercurial > hg-stable
changeset 13046:7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
This allows UTF-8 strings to losslessly round-trip through Mercurial
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Wed, 24 Nov 2010 15:38:52 -0600 |
parents | 1b1cbc246377 |
children | 6c375e07d673 |
files | mercurial/encoding.py tests/test-doctest.py |
diffstat | 2 files changed, 47 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/encoding.py Fri Nov 26 20:08:03 2010 -0600 +++ b/mercurial/encoding.py Wed Nov 24 15:38:52 2010 -0600 @@ -48,6 +48,16 @@ encodingmode = os.environ.get("HGENCODINGMODE", "strict") fallbackencoding = 'ISO-8859-1' +class localstr(str): + '''This class allows strings that are unmodified to be + round-tripped to the local encoding and back''' + def __new__(cls, u, l): + s = str.__new__(cls, l) + s._utf8 = u + return s + def __hash__(self): + return hash(self._utf8) # avoid collisions in local string space + def tolocal(s): """ Convert a string from internal UTF-8 to local encoding @@ -57,17 +67,45 @@ other character sets. We attempt to decode everything strictly using UTF-8, then Latin-1, and failing that, we use UTF-8 and replace unknown characters. + + The localstr class is used to cache the known UTF-8 encoding of + strings next to their local representation to allow lossless + round-trip conversion back to UTF-8. + + >>> u = 'foo: \\xc3\\xa4' # utf-8 + >>> l = tolocal(u) + >>> l + 'foo: ?' + >>> fromlocal(l) + 'foo: \\xc3\\xa4' + >>> u2 = 'foo: \\xc3\\xa1' + >>> d = { l: 1, tolocal(u2): 2 } + >>> d # no collision + {'foo: ?': 1, 'foo: ?': 2} + >>> 'foo: ?' in d + False + >>> l1 = 'foo: \\xe4' # historical latin1 fallback + >>> l = tolocal(l1) + >>> l + 'foo: ?' + >>> fromlocal(l) # magically in utf-8 + 'foo: \\xc3\\xa4' """ + for e in ('UTF-8', fallbackencoding): try: u = s.decode(e) # attempt strict decoding - return u.encode(encoding, "replace") + if u == 'UTF-8': + return localstr(s, u.encode(encoding, "replace")) + else: + return localstr(u.encode('UTF-8'), + u.encode(encoding, "replace")) except LookupError, k: raise error.Abort("%s, please check your locale settings" % k) except UnicodeDecodeError: pass u = s.decode("utf-8", "replace") # last ditch - return u.encode(encoding, "replace") + return u.encode(encoding, "replace") # can't round-trip def fromlocal(s): """ @@ -79,6 +117,11 @@ 'replace', which replaces unknown characters with a special Unicode character, and 'ignore', which drops the character. """ + + # can we do a lossless round-trip? + if isinstance(s, localstr): + return s._utf8 + try: return s.decode(encoding, encodingmode).encode("utf-8") except UnicodeDecodeError, inst:
--- a/tests/test-doctest.py Fri Nov 26 20:08:03 2010 -0600 +++ b/tests/test-doctest.py Wed Nov 24 15:38:52 2010 -0600 @@ -13,8 +13,8 @@ import mercurial.match doctest.testmod(mercurial.match) -import mercurial.url -doctest.testmod(mercurial.url) +import mercurial.encoding +doctest.testmod(mercurial.encoding) import hgext.convert.cvsps doctest.testmod(hgext.convert.cvsps)