hfs+: rewrite percent-escaper (issue3918) stable
author: Matt Mackall <mpm@selenic.com>
Sat, 04 May 2013 14:51:21 -0500
branch: stable
changeset 19131 af3b651505e2
parent 19129 bd19587a3347
child 19135 1c2dd751c12d
child 19145 0a12e5f3a979
hfs+: rewrite percent-escaper (issue3918)

The original code was a bit too clever and got confused by some cp949 Korean text. This rewrite bytes the bullet and manually decodes UTF-8 sequences. It also adds some doctests.
mercurial/posix.py
--- a/mercurial/posix.py	Fri May 03 12:40:17 2013 -0700
+++ b/mercurial/posix.py	Sat May 04 14:51:21 2013 -0500
@@ -194,6 +194,22 @@
     import fcntl # only needed on darwin, missing on jython
 
     def normcase(path):
+        '''
+        Normalize a filename for OS X-compatible comparison:
+        - escape-encode invalid characters
+        - decompose to NFD
+        - lowercase
+
+        >>> normcase('UPPER')
+        'upper'
+        >>> normcase('Caf\xc3\xa9')
+        'cafe\\xcc\\x81'
+        >>> normcase('\xc3\x89')
+        'e\\xcc\\x81'
+        >>> normcase('\xb8\xca\xc3\xca\xbe\xc8.JPG') # issue3918
+        '%b8%ca%c3\\xca\\xbe%c8.jpg'
+        '''
+
         try:
             path.decode('ascii') # throw exception for non-ASCII character
             return path.lower()
@@ -202,16 +218,42 @@
         try:
             u = path.decode('utf-8')
         except UnicodeDecodeError:
-            # percent-encode any characters that don't round-trip
-            p2 = path.decode('utf-8', 'ignore').encode('utf-8')
-            s = ""
-            pos = 0
+            # OS X percent-encodes any bytes that aren't valid utf-8
+            s = ''
+            g = ''
+            l = 0
             for c in path:
-                if p2[pos:pos + 1] == c:
+                o = ord(c)
+                if l and o < 128 or o >= 192:
+                    # we want a continuation byte, but didn't get one
+                    s += ''.join(["%%%02X" % ord(x) for x in g])
+                    g = ''
+                    l = 0
+                if l == 0 and o < 128:
+                    # ascii
                     s += c
-                    pos += 1
+                elif l == 0 and 194 <= o < 245:
+                    # valid leading bytes
+                    if o < 224:
+                        l = 1
+                    elif o < 240:
+                        l = 2
+                    else:
+                        l = 3
+                    g = c
+                elif l > 0 and 128 <= o < 192:
+                    # valid continuations
+                    g += c
+                    l -= 1
+                    if not l:
+                        s += g
+                        g = ''
                 else:
-                    s += "%%%02X" % ord(c)
+                    # invalid
+                    s += "%%%02X" % o
+
+            # any remaining partial characters
+            s += ''.join(["%%%02X" % ord(x) for x in g])
             u = s.decode('utf-8')
 
         # Decompose then lowercase (HFS+ technote specifies lower)