Mercurial: changeset 28068:9ece901f7a19

encoding: add option to escape non-ascii characters in JSON. This is necessary for hgweb to embed JSON data in HTML: it must be possible to embed JSON data in a non-UTF-8 HTML page, as long as the page encoding is compatible with ASCII. According to RFC 7159, a non-BMP character is represented as a UTF-16 surrogate pair, so when escaping is needed this function first splits the input string into an array of UTF-16 code units. See https://tools.ietf.org/html/rfc7159.html#section-7

--- a/mercurial/encoding.py	Sat Jan 30 19:48:35 2016 +0900
+++ b/mercurial/encoding.py	Sun Dec 27 19:28:34 2015 +0900
@@ -7,6 +7,7 @@
 
 from __future__ import absolute_import
 
+import array
 import locale
 import os
 import unicodedata
@@ -380,8 +381,8 @@
 
 _jsonmap = []
 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
-_jsonmap.extend(chr(x) for x in xrange(32, 256))
-_jsonmap[0x7f] = '\\u007f'
+_jsonmap.extend(chr(x) for x in xrange(32, 127))
+_jsonmap.append('\\u007f')
 _jsonmap[0x09] = '\\t'
 _jsonmap[0x0a] = '\\n'
 _jsonmap[0x22] = '\\"'
@@ -389,8 +390,10 @@
 _jsonmap[0x08] = '\\b'
 _jsonmap[0x0c] = '\\f'
 _jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_jsonmap.extend(chr(x) for x in xrange(128, 256))
 
-def jsonescape(s):
+def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
     JSON is problematic for us because it doesn't support non-Unicode
@@ -415,9 +418,34 @@
     'utf-8: caf\\xc3\\xa9'
     >>> jsonescape('')
     ''
+
+    If paranoid, non-ascii characters are also escaped. This is suitable for
+    web output.
+
+    >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
+    'escape boundary: ~ \\\\u007f \\\\u0080'
+    >>> jsonescape('a weird byte: \\xdd', paranoid=True)
+    'a weird byte: \\\\udcdd'
+    >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
+    'utf-8: caf\\\\u00e9'
+    >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
+    'non-BMP: \\\\ud834\\\\udd1e'
     '''
 
-    return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s)))
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+
+    u8chars = toutf8b(s)
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))  # fast path
+    except IndexError:
+        pass
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0)  # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]

author	Yuya Nishihara <yuya@tcha.org>
	Sun, 27 Dec 2015 19:28:34 +0900
changeset 28068	9ece901f7a19
parent 28067	69a02b1e947c
child 28069	b2d24c2898f9