# HG changeset patch # User Matt Mackall # Date 1329777765 21600 # Node ID 84c58da3a1f880e501141f7f9df16e8b4ff8d285 # Parent 41fc1e078d6822502277ac80325ec3db99707a6f encoding: introduce utf8-b helpers diff -r 41fc1e078d68 -r 84c58da3a1f8 mercurial/encoding.py --- a/mercurial/encoding.py Fri Feb 17 13:53:41 2012 -0600 +++ b/mercurial/encoding.py Mon Feb 20 16:42:45 2012 -0600 @@ -190,3 +190,80 @@ return s.upper() # we don't know how to fold this except in ASCII except LookupError, k: raise error.Abort(k, hint="please check your locale settings") + +def toutf8b(s): + '''convert a local, possibly-binary string into UTF-8b + + This is intended as a generic method to preserve data when working + with schemes like JSON and XML that have no provision for + arbitrary byte strings. As Mercurial often doesn't know + what encoding data is in, we use so-called UTF-8b. + + If a string is already valid UTF-8 (or ASCII), it passes unmodified. + Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, + uDC00-uDCFF. + + Principles of operation: + + - ASCII and UTF-8 data sucessfully round-trips and is understood + by Unicode-oriented clients + - filenames and file contents in arbitrary other encodings can have + be round-tripped or recovered by clueful clients + - local strings that have a cached known UTF-8 encoding (aka + localstr) get sent as UTF-8 so Unicode-oriented clients get the + Unicode data they want + - because we must preserve UTF-8 bytestring in places such as + filenames, metadata can't be roundtripped without help + + (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and + arbitrary bytes into an internal Unicode format that can be + re-encoded back into the original. Here we are exposing the + internal surrogate encoding as a UTF-8 string.) + ''' + + if isinstance(s, localstr): + return s._utf8 + + try: + if s.decode('utf-8'): + return s + except UnicodeDecodeError: + # surrogate-encode any characters that don't round-trip + s2 = s.decode('utf-8', 'ignore').encode('utf-8') + r = "" + pos = 0 + for c in s: + if s2[pos:pos + 1] == c: + r += c + pos += 1 + else: + r += unichr(0xdc00 + ord(c)).encode('utf-8') + return r + +def fromutf8b(s): + '''Given a UTF-8b string, return a local, possibly-binary string. + + return the original binary string. This + is a round-trip process for strings like filenames, but metadata + that's was passed through tolocal will remain in UTF-8. + + >>> m = "\\xc3\\xa9\\x99abcd" + >>> n = toutf8b(m) + >>> n + '\\xc3\\xa9\\xed\\xb2\\x99abcd' + >>> fromutf8b(n) == m + True + ''' + + # fast path - look for uDxxx prefixes in s + if "\xed" not in s: + return s + + u = s.decode("utf-8") + r = "" + for c in u: + if ord(c) & 0xff00 == 0xdc00: + r += chr(ord(c) & 0xff) + else: + r += c.encode("utf-8") + return r