# HG changeset patch
# User Matt Mackall
# Date 1446764940 21600
# Node ID b8381832ce2b4e791ba7bb6d0822e82203475d8e
# Parent  cf47bdb2183caaaa5daa2acb0d7ab7e88c12b9f9
posix: use getutf8char to handle OS X filename percent-escaping

This replaces an open-coded utf-8 parser that was ignoring subtle
issues like overlong encodings.

diff -r cf47bdb2183c -r b8381832ce2b mercurial/posix.py
--- a/mercurial/posix.py	Thu Nov 05 16:48:46 2015 -0600
+++ b/mercurial/posix.py	Thu Nov 05 17:09:00 2015 -0600
@@ -255,40 +255,19 @@
         except UnicodeDecodeError:
             # OS X percent-encodes any bytes that aren't valid utf-8
             s = ''
-            g = ''
-            l = 0
-            for c in path:
-                o = ord(c)
-                if l and o < 128 or o >= 192:
-                    # we want a continuation byte, but didn't get one
-                    s += ''.join(["%%%02X" % ord(x) for x in g])
-                    g = ''
-                    l = 0
-                if l == 0 and o < 128:
-                    # ascii
-                    s += c
-                elif l == 0 and 194 <= o < 245:
-                    # valid leading bytes
-                    if o < 224:
-                        l = 1
-                    elif o < 240:
-                        l = 2
-                    else:
-                        l = 3
-                    g = c
-                elif l > 0 and 128 <= o < 192:
-                    # valid continuations
-                    g += c
-                    l -= 1
-                    if not l:
-                        s += g
-                        g = ''
-                else:
-                    # invalid
-                    s += "%%%02X" % o
+            pos = 0
+            l = len(path)
+            while pos < l:
+                try:
+                    # a sequence accepted by getutf8char is copied as-is
+                    c = encoding.getutf8char(path, pos)
+                    pos += len(c)
+                except ValueError:
+                    # escape only the offending byte, then resume after it
+                    c = '%%%02X' % ord(path[pos])
+                    pos += 1
+                s += c
 
-            # any remaining partial characters
-            s += ''.join(["%%%02X" % ord(x) for x in g])
             u = s.decode('utf-8')
 
         # Decompose then lowercase (HFS+ technote specifies lower)