Mercurial > hg-stable
changeset 28068:9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
This is necessary for hgweb to embed JSON data in HTML. It must be possible
to embed JSON data in a non-UTF-8 HTML page, as long as the page encoding is
compatible with ASCII.
According to RFC 7159, a non-BMP character is represented as a UTF-16
surrogate pair. This function first splits an input string into an array of
UTF-16 code units.
https://tools.ietf.org/html/rfc7159.html#section-7
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 27 Dec 2015 19:28:34 +0900 |
parents | 69a02b1e947c |
children | b2d24c2898f9 |
files | mercurial/encoding.py |
diffstat | 1 files changed, 32 insertions(+), 4 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/mercurial/encoding.py Sat Jan 30 19:48:35 2016 +0900
+++ b/mercurial/encoding.py Sun Dec 27 19:28:34 2015 +0900
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import
 
+import array
 import locale
 import os
 import unicodedata
@@ -380,8 +381,8 @@
 
 _jsonmap = []
 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-_jsonmap.extend(chr(x) for x in xrange(32, 256))
-_jsonmap[0x7f] = '\\u007f'
+_jsonmap.extend(chr(x) for x in xrange(32, 127))
+_jsonmap.append('\\u007f')
 _jsonmap[0x09] = '\\t'
 _jsonmap[0x0a] = '\\n'
 _jsonmap[0x22] = '\\"'
@@ -389,8 +390,10 @@
 _jsonmap[0x08] = '\\b'
 _jsonmap[0x0c] = '\\f'
 _jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_jsonmap.extend(chr(x) for x in xrange(128, 256))
 
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
@@ -415,9 +418,34 @@
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
+
+    If paranoid, non-ascii characters are also escaped. This is suitable for
+    web output.
+
+    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+    'escape boundary: ~ \\\\u007f \\\\u0080'
+    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+    'a weird byte: \\\\udcdd'
+    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+    'utf-8: caf\\\\u00e9'
+    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+    'non-BMP: \\\\ud834\\\\udd1e'
     '''
 
-    return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+
+    u8chars = toutf8b(s)
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
+    except IndexError:
+        pass
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0) # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
```