posix: use getutf8char to handle OS X filename percent-escaping
This replaces an open-coded utf-8 parser that was ignoring subtle issues
like overlong encodings.
--- a/mercurial/posix.py Thu Nov 05 16:48:46 2015 -0600
+++ b/mercurial/posix.py Thu Nov 05 17:09:00 2015 -0600
@@ -255,40 +255,17 @@
except UnicodeDecodeError:
# OS X percent-encodes any bytes that aren't valid utf-8
s = ''
- g = ''
- l = 0
- for c in path:
- o = ord(c)
- if l and o < 128 or o >= 192:
- # we want a continuation byte, but didn't get one
- s += ''.join(["%%%02X" % ord(x) for x in g])
- g = ''
- l = 0
- if l == 0 and o < 128:
- # ascii
- s += c
- elif l == 0 and 194 <= o < 245:
- # valid leading bytes
- if o < 224:
- l = 1
- elif o < 240:
- l = 2
- else:
- l = 3
- g = c
- elif l > 0 and 128 <= o < 192:
- # valid continuations
- g += c
- l -= 1
- if not l:
- s += g
- g = ''
- else:
- # invalid
- s += "%%%02X" % o
+ pos = 0
+ l = len(path)  # bound the scan by the input bytes; s is still '' here
+ while pos < l:
+     try:
+         c = encoding.getutf8char(path, pos)  # character starting at pos
+         pos += len(c)  # skip every byte that character consumed
+     except ValueError:  # raised when no acceptable utf-8 char starts at pos
+         c = '%%%02X' % ord(path[pos])  # literal '%' + two hex digits of the byte
+         pos += 1
+     s += c
- # any remaining partial characters
- s += ''.join(["%%%02X" % ord(x) for x in g])
u = s.decode('utf-8')
# Decompose then lowercase (HFS+ technote specifies lower)