Mercurial > hg
changeset 33924:b9101467d88b
encoding: extract stub for fast JSON escape
This moves JSON character maps to pure/charencode.py because they will be
used only when the fast-path fails.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 23 Apr 2017 16:10:51 +0900 |
parents | e6d421566906 |
children | 2c37f9dabc32 |
files | mercurial/encoding.py mercurial/pure/charencode.py |
diffstat | 2 files changed, 58 insertions(+), 28 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/encoding.py Thu Aug 24 21:43:54 2017 -0700
+++ b/mercurial/encoding.py Sun Apr 23 16:10:51 2017 +0900
@@ -7,7 +7,6 @@
 
 from __future__ import absolute_import
 
-import array
 import io
 import locale
 import os
@@ -19,10 +18,15 @@
     pycompat,
 )
 
+from .pure import (
+    charencode as charencodepure,
+)
+
 charencode = policy.importmod(r'charencode')
 
 asciilower = charencode.asciilower
 asciiupper = charencode.asciiupper
+_jsonescapeu8fast = charencodepure.jsonescapeu8fast # TODO: no "pure"
 
 _sysstr = pycompat.sysstr
 
@@ -383,22 +387,6 @@
     upper = 1
     other = 0
 
-_jsonmap = []
-_jsonmap.extend("\\u%04x" % x for x in range(32))
-_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
-_jsonmap.append('\\u007f')
-_jsonmap[0x09] = '\\t'
-_jsonmap[0x0a] = '\\n'
-_jsonmap[0x22] = '\\"'
-_jsonmap[0x5c] = '\\\\'
-_jsonmap[0x08] = '\\b'
-_jsonmap[0x0c] = '\\f'
-_jsonmap[0x0d] = '\\r'
-_paranoidjsonmap = _jsonmap[:]
-_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
-_paranoidjsonmap[0x3e] = '\\u003e' # '>'
-_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
-
 def jsonescape(s, paranoid=False):
     '''returns a string suitable for JSON
 
@@ -440,20 +428,12 @@
     '\\\\u003cfoo@example.org\\\\u003e'
     '''
 
-    if paranoid:
-        jm = _paranoidjsonmap
-    else:
-        jm = _jsonmap
-
     u8chars = toutf8b(s)
     try:
-        return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
-    except IndexError:
+        return _jsonescapeu8fast(u8chars, paranoid)
+    except ValueError:
         pass
-    # non-BMP char is represented as UTF-16 surrogate pair
-    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
-    u16codes.pop(0) # drop BOM
-    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
+    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
 
 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
 
--- a/mercurial/pure/charencode.py Thu Aug 24 21:43:54 2017 -0700
+++ b/mercurial/pure/charencode.py Sun Apr 23 16:10:51 2017 +0900
@@ -7,6 +7,12 @@
 
 from __future__ import absolute_import
 
+import array
+
+from .. import (
+    pycompat,
+)
+
 def asciilower(s):
     '''convert a string to lowercase if ASCII
 
@@ -20,3 +26,47 @@
     Raises UnicodeDecodeError if non-ASCII characters are found.'''
     s.decode('ascii')
     return s.upper()
+
+_jsonmap = []
+_jsonmap.extend("\\u%04x" % x for x in range(32))
+_jsonmap.extend(pycompat.bytechr(x) for x in range(32, 127))
+_jsonmap.append('\\u007f')
+_jsonmap[0x09] = '\\t'
+_jsonmap[0x0a] = '\\n'
+_jsonmap[0x22] = '\\"'
+_jsonmap[0x5c] = '\\\\'
+_jsonmap[0x08] = '\\b'
+_jsonmap[0x0c] = '\\f'
+_jsonmap[0x0d] = '\\r'
+_paranoidjsonmap = _jsonmap[:]
+_paranoidjsonmap[0x3c] = '\\u003c' # '<' (e.g. escape "</script>")
+_paranoidjsonmap[0x3e] = '\\u003e' # '>'
+_jsonmap.extend(pycompat.bytechr(x) for x in range(128, 256))
+
+def jsonescapeu8fast(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (fast path)
+
+    Raises ValueError if non-ASCII characters have to be escaped.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    try:
+        return ''.join(jm[x] for x in bytearray(u8chars))
+    except IndexError:
+        raise ValueError
+
+def jsonescapeu8fallback(u8chars, paranoid):
+    """Convert a UTF-8 byte string to JSON-escaped form (slow path)
+
+    Escapes all non-ASCII characters no matter if paranoid is False.
+    """
+    if paranoid:
+        jm = _paranoidjsonmap
+    else:
+        jm = _jsonmap
+    # non-BMP char is represented as UTF-16 surrogate pair
+    u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
+    u16codes.pop(0) # drop BOM
+    return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)