comparison mercurial/encoding.py @ 28068:9ece901f7a19

encoding: add option to escape non-ascii characters in JSON. This is necessary for hgweb to embed JSON data in HTML. JSON data must be embeddable in a non-UTF-8 HTML page as long as the page encoding is compatible with ASCII. According to RFC 7159, a non-BMP character is represented as a UTF-16 surrogate pair, so this function first splits the input string into an array of UTF-16 code units. https://tools.ietf.org/html/rfc7159.html#section-7
author Yuya Nishihara <yuya@tcha.org>
date Sun, 27 Dec 2015 19:28:34 +0900
parents 69a02b1e947c
children b2d24c2898f9
comparison
equal deleted inserted replaced
28067:69a02b1e947c 28068:9ece901f7a19
5 # This software may be used and distributed according to the terms of the 5 # This software may be used and distributed according to the terms of the
6 # GNU General Public License version 2 or any later version. 6 # GNU General Public License version 2 or any later version.
7 7
8 from __future__ import absolute_import 8 from __future__ import absolute_import
9 9
10 import array
10 import locale 11 import locale
11 import os 12 import os
12 import unicodedata 13 import unicodedata
13 14
14 from . import ( 15 from . import (
378 upper = 1 379 upper = 1
379 other = 0 380 other = 0
380 381
381 _jsonmap = [] 382 _jsonmap = []
382 _jsonmap.extend("\\u%04x" % x for x in xrange(32)) 383 _jsonmap.extend("\\u%04x" % x for x in xrange(32))
383 _jsonmap.extend(chr(x) for x in xrange(32, 256)) 384 _jsonmap.extend(chr(x) for x in xrange(32, 127))
384 _jsonmap[0x7f] = '\\u007f' 385 _jsonmap.append('\\u007f')
385 _jsonmap[0x09] = '\\t' 386 _jsonmap[0x09] = '\\t'
386 _jsonmap[0x0a] = '\\n' 387 _jsonmap[0x0a] = '\\n'
387 _jsonmap[0x22] = '\\"' 388 _jsonmap[0x22] = '\\"'
388 _jsonmap[0x5c] = '\\\\' 389 _jsonmap[0x5c] = '\\\\'
389 _jsonmap[0x08] = '\\b' 390 _jsonmap[0x08] = '\\b'
390 _jsonmap[0x0c] = '\\f' 391 _jsonmap[0x0c] = '\\f'
391 _jsonmap[0x0d] = '\\r' 392 _jsonmap[0x0d] = '\\r'
392 393 _paranoidjsonmap = _jsonmap[:]
393 def jsonescape(s): 394 _jsonmap.extend(chr(x) for x in xrange(128, 256))
395
396 def jsonescape(s, paranoid=False):
394 '''returns a string suitable for JSON 397 '''returns a string suitable for JSON
395 398
396 JSON is problematic for us because it doesn't support non-Unicode 399 JSON is problematic for us because it doesn't support non-Unicode
397 bytes. To deal with this, we take the following approach: 400 bytes. To deal with this, we take the following approach:
398 401
413 'a weird byte: \\xed\\xb3\\x9d' 416 'a weird byte: \\xed\\xb3\\x9d'
414 >>> jsonescape('utf-8: caf\\xc3\\xa9') 417 >>> jsonescape('utf-8: caf\\xc3\\xa9')
415 'utf-8: caf\\xc3\\xa9' 418 'utf-8: caf\\xc3\\xa9'
416 >>> jsonescape('') 419 >>> jsonescape('')
417 '' 420 ''
421
422 If paranoid, non-ascii characters are also escaped. This is suitable for
423 web output.
424
425 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
426 'escape boundary: ~ \\\\u007f \\\\u0080'
427 >>> jsonescape('a weird byte: \\xdd', paranoid=True)
428 'a weird byte: \\\\udcdd'
429 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True)
430 'utf-8: caf\\\\u00e9'
431 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
432 'non-BMP: \\\\ud834\\\\udd1e'
418 ''' 433 '''
419 434
420 return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s))) 435 if paranoid:
436 jm = _paranoidjsonmap
437 else:
438 jm = _jsonmap
439
440 u8chars = toutf8b(s)
441 try:
442 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path
443 except IndexError:
444 pass
445 # non-BMP char is represented as UTF-16 surrogate pair
446 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16'))
447 u16codes.pop(0) # drop BOM
448 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes)
421 449
422 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] 450 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
423 451
424 def getutf8char(s, pos): 452 def getutf8char(s, pos):
425 '''get the next full utf-8 character in the given string, starting at pos 453 '''get the next full utf-8 character in the given string, starting at pos