encoding: add option to escape non-ascii characters in JSON
This is necessary for hgweb to embed JSON data in HTML. It must be possible
to embed JSON data in a non-UTF-8 HTML page as long as the page encoding is
compatible with ASCII.
According to RFC 7159, a non-BMP character is represented as a UTF-16
surrogate pair. This function first splits an input string into an array of
UTF-16 code units.
https://tools.ietf.org/html/rfc7159.html#section-7
--- a/mercurial/encoding.py Sat Jan 30 19:48:35 2016 +0900
+++ b/mercurial/encoding.py Sun Dec 27 19:28:34 2015 +0900
@@ -7,6 +7,7 @@
from __future__ import absolute_import
+import array
import locale
import os
import unicodedata
@@ -380,8 +381,8 @@
_jsonmap = []
_jsonmap.extend("\\u%04x" % x for x in xrange(32))
-_jsonmap.extend(chr(x) for x in xrange(32, 256))
-_jsonmap[0x7f] = '\\u007f'
+_jsonmap.extend(chr(x) for x in xrange(32, 127))
+_jsonmap.append('\\u007f')
_jsonmap[0x09] = '\\t'
_jsonmap[0x0a] = '\\n'
_jsonmap[0x22] = '\\"'
@@ -389,8 +390,10 @@
_jsonmap[0x08] = '\\b'
_jsonmap[0x0c] = '\\f'
_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_jsonmap.extend(chr(x) for x in xrange(128, 256))
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
'''returns a string suitable for JSON
JSON is problematic for us because it doesn't support non-Unicode
@@ -415,9 +418,34 @@
'utf-8: caf\\xc3\\xa9'
>>> jsonescape('')
''
+
+ If paranoid, non-ascii characters are also escaped. This is suitable for
+ web output.
+
+ >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+ 'escape boundary: ~ \\\\u007f \\\\u0080'
+ >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+ 'a weird byte: \\\\udcdd'
+ >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+ 'utf-8: caf\\\\u00e9'
+ >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+ 'non-BMP: \\\\ud834\\\\udd1e'
'''
- return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+ if paranoid:
+ jm = _paranoidjsonmap
+ else:
+ jm = _jsonmap
+
+ u8chars = toutf8b(s)
+ try:
+ return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
+ except IndexError:
+ pass
+ # non-BMP char is represented as UTF-16 surrogate pair
+ u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+ u16codes.pop(0) # drop BOM
+ return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]