comparison mercurial/encoding.py @ 13046:7cc4263e07a9

encoding: add localstr class to track UTF-8 version of transcoded strings. This allows UTF-8 strings to losslessly round-trip through Mercurial.
author Matt Mackall <mpm@selenic.com>
date Wed, 24 Nov 2010 15:38:52 -0600
parents eddc20306ab6
children 120eccaaa522
comparison
equal deleted inserted replaced
13045:1b1cbc246377 13046:7cc4263e07a9
46 except locale.Error: 46 except locale.Error:
47 encoding = 'ascii' 47 encoding = 'ascii'
48 encodingmode = os.environ.get("HGENCODINGMODE", "strict") 48 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
49 fallbackencoding = 'ISO-8859-1' 49 fallbackencoding = 'ISO-8859-1'
50 50
class localstr(str):
    '''A local-encoding string that remembers its UTF-8 original.

    The value of the instance itself is the local-encoding text ``l``;
    the UTF-8 text ``u`` it was transcoded from is stored in ``_utf8``
    so that an unmodified string can be converted back without loss.
    '''
    def __new__(cls, u, l):
        # str is immutable, so the local text has to be supplied here
        # rather than in __init__
        obj = str.__new__(cls, l)
        obj._utf8 = u
        return obj

    def __hash__(self):
        # hash on the UTF-8 form so that different UTF-8 strings which
        # collapse to the same local text do not collide as dict keys
        return hash(self._utf8)
60
51 def tolocal(s): 61 def tolocal(s):
52 """ 62 """
53 Convert a string from internal UTF-8 to local encoding 63 Convert a string from internal UTF-8 to local encoding
54 64
55 All internal strings should be UTF-8 but some repos before the 65 All internal strings should be UTF-8 but some repos before the
56 implementation of locale support may contain latin1 or possibly 66 implementation of locale support may contain latin1 or possibly
57 other character sets. We attempt to decode everything strictly 67 other character sets. We attempt to decode everything strictly
58 using UTF-8, then Latin-1, and failing that, we use UTF-8 and 68 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
59 replace unknown characters. 69 replace unknown characters.
70
71 The localstr class is used to cache the known UTF-8 encoding of
72 strings next to their local representation to allow lossless
73 round-trip conversion back to UTF-8.
74
75 >>> u = 'foo: \\xc3\\xa4' # utf-8
76 >>> l = tolocal(u)
77 >>> l
78 'foo: ?'
79 >>> fromlocal(l)
80 'foo: \\xc3\\xa4'
81 >>> u2 = 'foo: \\xc3\\xa1'
82 >>> d = { l: 1, tolocal(u2): 2 }
83 >>> d # no collision
84 {'foo: ?': 1, 'foo: ?': 2}
85 >>> 'foo: ?' in d
86 False
87 >>> l1 = 'foo: \\xe4' # historical latin1 fallback
88 >>> l = tolocal(l1)
89 >>> l
90 'foo: ?'
91 >>> fromlocal(l) # magically in utf-8
92 'foo: \\xc3\\xa4'
60 """ 93 """
94
61 for e in ('UTF-8', fallbackencoding): 95 for e in ('UTF-8', fallbackencoding):
62 try: 96 try:
63 u = s.decode(e) # attempt strict decoding 97 u = s.decode(e) # attempt strict decoding
64 return u.encode(encoding, "replace") 98 if u == 'UTF-8':
99 return localstr(s, u.encode(encoding, "replace"))
100 else:
101 return localstr(u.encode('UTF-8'),
102 u.encode(encoding, "replace"))
65 except LookupError, k: 103 except LookupError, k:
66 raise error.Abort("%s, please check your locale settings" % k) 104 raise error.Abort("%s, please check your locale settings" % k)
67 except UnicodeDecodeError: 105 except UnicodeDecodeError:
68 pass 106 pass
69 u = s.decode("utf-8", "replace") # last ditch 107 u = s.decode("utf-8", "replace") # last ditch
70 return u.encode(encoding, "replace") 108 return u.encode(encoding, "replace") # can't round-trip
71 109
72 def fromlocal(s): 110 def fromlocal(s):
73 """ 111 """
74 Convert a string from the local character encoding to UTF-8 112 Convert a string from the local character encoding to UTF-8
75 113
77 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown 115 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
78 characters will cause an error message. Other modes include 116 characters will cause an error message. Other modes include
79 'replace', which replaces unknown characters with a special 117 'replace', which replaces unknown characters with a special
80 Unicode character, and 'ignore', which drops the character. 118 Unicode character, and 'ignore', which drops the character.
81 """ 119 """
120
121 # can we do a lossless round-trip?
122 if isinstance(s, localstr):
123 return s._utf8
124
82 try: 125 try:
83 return s.decode(encoding, encodingmode).encode("utf-8") 126 return s.decode(encoding, encodingmode).encode("utf-8")
84 except UnicodeDecodeError, inst: 127 except UnicodeDecodeError, inst:
85 sub = s[max(0, inst.start - 10):inst.start + 10] 128 sub = s[max(0, inst.start - 10):inst.start + 10]
86 raise error.Abort("decoding near '%s': %s!" % (sub, inst)) 129 raise error.Abort("decoding near '%s': %s!" % (sub, inst))