Mercurial > hg
comparison mercurial/posix.py @ 19131:af3b651505e2 stable
hfs+: rewrite percent-escaper (issue3918)
The original code was a bit too clever and got confused by some cp949
Korean text. This rewrite bytes the bullet and manually decodes UTF-8
sequences. Adds some doctests.
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Sat, 04 May 2013 14:51:21 -0500 |
parents | cafa447a7d3b |
children | cfdae231ba78 |
comparison
equal
deleted
inserted
replaced
19129:bd19587a3347 | 19131:af3b651505e2 |
---|---|
192 | 192 |
193 if sys.platform == 'darwin': | 193 if sys.platform == 'darwin': |
194 import fcntl # only needed on darwin, missing on jython | 194 import fcntl # only needed on darwin, missing on jython |
195 | 195 |
196 def normcase(path): | 196 def normcase(path): |
197 ''' | |
198 Normalize a filename for OS X-compatible comparison: | |
199 - escape-encode invalid characters | |
200 - decompose to NFD | |
201 - lowercase | |
202 | |
203 >>> normcase('UPPER') | |
204 'upper' | |
205 >>> normcase('Caf\xc3\xa9') | |
206 'cafe\\xcc\\x81' | |
207 >>> normcase('\xc3\x89') | |
208 'e\\xcc\\x81' | |
209 >>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918 | |
210 '%b8%ca%c3\\xca\\xbe%c8.jpg' | |
211 ''' | |
212 | |
197 try: | 213 try: |
198 path.decode('ascii') # throw exception for non-ASCII character | 214 path.decode('ascii') # throw exception for non-ASCII character |
199 return path.lower() | 215 return path.lower() |
200 except UnicodeDecodeError: | 216 except UnicodeDecodeError: |
201 pass | 217 pass |
202 try: | 218 try: |
203 u = path.decode('utf-8') | 219 u = path.decode('utf-8') |
204 except UnicodeDecodeError: | 220 except UnicodeDecodeError: |
205 # percent-encode any characters that don't round-trip | 221 # OS X percent-encodes any bytes that aren't valid utf-8 |
206 p2 = path.decode('utf-8', 'ignore').encode('utf-8') | 222 s = '' |
207 s = "" | 223 g = '' |
208 pos = 0 | 224 l = 0 |
209 for c in path: | 225 for c in path: |
210 if p2[pos:pos + 1] == c: | 226 o = ord(c) |
227 if l and o < 128 or o >= 192: | |
228 # we want a continuation byte, but didn't get one | |
229 s += ''.join(["%%%02X" % ord(x) for x in g]) | |
230 g = '' | |
231 l = 0 | |
232 if l == 0 and o < 128: | |
233 # ascii | |
211 s += c | 234 s += c |
212 pos += 1 | 235 elif l == 0 and 194 <= o < 245: |
236 # valid leading bytes | |
237 if o < 224: | |
238 l = 1 | |
239 elif o < 240: | |
240 l = 2 | |
241 else: | |
242 l = 3 | |
243 g = c | |
244 elif l > 0 and 128 <= o < 192: | |
245 # valid continuations | |
246 g += c | |
247 l -= 1 | |
248 if not l: | |
249 s += g | |
250 g = '' | |
213 else: | 251 else: |
214 s += "%%%02X" % ord(c) | 252 # invalid |
253 s += "%%%02X" % o | |
254 | |
255 # any remaining partial characters | |
256 s += ''.join(["%%%02X" % ord(x) for x in g]) | |
215 u = s.decode('utf-8') | 257 u = s.decode('utf-8') |
216 | 258 |
217 # Decompose then lowercase (HFS+ technote specifies lower) | 259 # Decompose then lowercase (HFS+ technote specifies lower) |
218 return unicodedata.normalize('NFD', u).lower().encode('utf-8') | 260 return unicodedata.normalize('NFD', u).lower().encode('utf-8') |
219 | 261 |