comparison mercurial/posix.py @ 19131:af3b651505e2 stable

hfs+: rewrite percent-escaper (issue3918)

The original code was a bit too clever and got confused by some cp949 Korean text. This rewrite bytes the bullet and manually decodes UTF-8 sequences. Adds some doctests.
author Matt Mackall <mpm@selenic.com>
date Sat, 04 May 2013 14:51:21 -0500
parents cafa447a7d3b
children cfdae231ba78
comparison
equal deleted inserted replaced
19129:bd19587a3347 19131:af3b651505e2
192 192
193 if sys.platform == 'darwin': 193 if sys.platform == 'darwin':
194 import fcntl # only needed on darwin, missing on jython 194 import fcntl # only needed on darwin, missing on jython
195 195
196 def normcase(path): 196 def normcase(path):
197 '''
198 Normalize a filename for OS X-compatible comparison:
199 - escape-encode invalid characters
200 - decompose to NFD
201 - lowercase
202
203 >>> normcase('UPPER')
204 'upper'
205 >>> normcase('Caf\xc3\xa9')
206 'cafe\\xcc\\x81'
207 >>> normcase('\xc3\x89')
208 'e\\xcc\\x81'
209 >>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918
210 '%b8%ca%c3\\xca\\xbe%c8.jpg'
211 '''
212
197 try: 213 try:
198 path.decode('ascii') # throw exception for non-ASCII character 214 path.decode('ascii') # throw exception for non-ASCII character
199 return path.lower() 215 return path.lower()
200 except UnicodeDecodeError: 216 except UnicodeDecodeError:
201 pass 217 pass
202 try: 218 try:
203 u = path.decode('utf-8') 219 u = path.decode('utf-8')
204 except UnicodeDecodeError: 220 except UnicodeDecodeError:
205 # percent-encode any characters that don't round-trip 221 # OS X percent-encodes any bytes that aren't valid utf-8
206 p2 = path.decode('utf-8', 'ignore').encode('utf-8') 222 s = ''
207 s = "" 223 g = ''
208 pos = 0 224 l = 0
209 for c in path: 225 for c in path:
210 if p2[pos:pos + 1] == c: 226 o = ord(c)
227 if l and o < 128 or o >= 192:
228 # we want a continuation byte, but didn't get one
229 s += ''.join(["%%%02X" % ord(x) for x in g])
230 g = ''
231 l = 0
232 if l == 0 and o < 128:
233 # ascii
211 s += c 234 s += c
212 pos += 1 235 elif l == 0 and 194 <= o < 245:
236 # valid leading bytes
237 if o < 224:
238 l = 1
239 elif o < 240:
240 l = 2
241 else:
242 l = 3
243 g = c
244 elif l > 0 and 128 <= o < 192:
245 # valid continuations
246 g += c
247 l -= 1
248 if not l:
249 s += g
250 g = ''
213 else: 251 else:
214 s += "%%%02X" % ord(c) 252 # invalid
253 s += "%%%02X" % o
254
255 # any remaining partial characters
256 s += ''.join(["%%%02X" % ord(x) for x in g])
215 u = s.decode('utf-8') 257 u = s.decode('utf-8')
216 258
217 # Decompose then lowercase (HFS+ technote specifies lower) 259 # Decompose then lowercase (HFS+ technote specifies lower)
218 return unicodedata.normalize('NFD', u).lower().encode('utf-8') 260 return unicodedata.normalize('NFD', u).lower().encode('utf-8')
219 261