# HG changeset patch # User Matt Mackall # Date 1367697171 18000 # Node ID 1c2dd751c12d5d57e7974bd93942db6328917091 # Parent 75031762aff2f857775611704f0025953bdfb866# Parent af3b651505e204c396ec1069ae5b6c7fb4449ea5 merge with stable diff -r 75031762aff2 -r 1c2dd751c12d mercurial/posix.py --- a/mercurial/posix.py Fri May 03 15:18:21 2013 -0700 +++ b/mercurial/posix.py Sat May 04 14:52:51 2013 -0500 @@ -194,6 +194,22 @@ import fcntl # only needed on darwin, missing on jython def normcase(path): + ''' + Normalize a filename for OS X-compatible comparison: + - escape-encode invalid characters + - decompose to NFD + - lowercase + + >>> normcase('UPPER') + 'upper' + >>> normcase('Caf\xc3\xa9') + 'cafe\\xcc\\x81' + >>> normcase('\xc3\x89') + 'e\\xcc\\x81' + >>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918 + '%b8%ca%c3\\xca\\xbe%c8.jpg' + ''' + try: path.decode('ascii') # throw exception for non-ASCII character return path.lower() @@ -202,16 +218,42 @@ try: u = path.decode('utf-8') except UnicodeDecodeError: - # percent-encode any characters that don't round-trip - p2 = path.decode('utf-8', 'ignore').encode('utf-8') - s = "" - pos = 0 + # OS X percent-encodes any bytes that aren't valid utf-8 + s = '' + g = '' + l = 0 for c in path: - if p2[pos:pos + 1] == c: + o = ord(c) + if l and o < 128 or o >= 192: + # we want a continuation byte, but didn't get one + s += ''.join(["%%%02X" % ord(x) for x in g]) + g = '' + l = 0 + if l == 0 and o < 128: + # ascii s += c - pos += 1 + elif l == 0 and 194 <= o < 245: + # valid leading bytes + if o < 224: + l = 1 + elif o < 240: + l = 2 + else: + l = 3 + g = c + elif l > 0 and 128 <= o < 192: + # valid continuations + g += c + l -= 1 + if not l: + s += g + g = '' else: - s += "%%%02X" % ord(c) + # invalid + s += "%%%02X" % o + + # any remaining partial characters + s += ''.join(["%%%02X" % ord(x) for x in g]) u = s.decode('utf-8') # Decompose then lowercase (HFS+ technote specifies lower)