Mercurial > hg-stable
comparison mercurial/encoding.py @ 28068:9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
This is necessary for hgweb to embed JSON data in HTML. It must be possible to
embed JSON data in a non-UTF-8 HTML page, as long as the page encoding is
compatible with ASCII.
According to RFC 7159, a non-BMP character is represented as a UTF-16 surrogate
pair. This function first splits the input string into an array of UTF-16
code units.
https://tools.ietf.org/html/rfc7159.html#section-7
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 27 Dec 2015 19:28:34 +0900 |
parents | 69a02b1e947c |
children | b2d24c2898f9 |
comparison
equal
deleted
inserted
replaced
28067:69a02b1e947c | 28068:9ece901f7a19 |
---|---|
5 # This software may be used and distributed according to the terms of the | 5 # This software may be used and distributed according to the terms of the |
6 # GNU General Public License version 2 or any later version. | 6 # GNU General Public License version 2 or any later version. |
7 | 7 |
8 from __future__ import absolute_import | 8 from __future__ import absolute_import |
9 | 9 |
10 import array | |
10 import locale | 11 import locale |
11 import os | 12 import os |
12 import unicodedata | 13 import unicodedata |
13 | 14 |
14 from . import ( | 15 from . import ( |
378 upper = 1 | 379 upper = 1 |
379 other = 0 | 380 other = 0 |
380 | 381 |
381 _jsonmap = [] | 382 _jsonmap = [] |
382 _jsonmap.extend("\\u%04x" % x for x in xrange(32)) | 383 _jsonmap.extend("\\u%04x" % x for x in xrange(32)) |
383 _jsonmap.extend(chr(x) for x in xrange(32, 256)) | 384 _jsonmap.extend(chr(x) for x in xrange(32, 127)) |
384 _jsonmap[0x7f] = '\\u007f' | 385 _jsonmap.append('\\u007f') |
385 _jsonmap[0x09] = '\\t' | 386 _jsonmap[0x09] = '\\t' |
386 _jsonmap[0x0a] = '\\n' | 387 _jsonmap[0x0a] = '\\n' |
387 _jsonmap[0x22] = '\\"' | 388 _jsonmap[0x22] = '\\"' |
388 _jsonmap[0x5c] = '\\\\' | 389 _jsonmap[0x5c] = '\\\\' |
389 _jsonmap[0x08] = '\\b' | 390 _jsonmap[0x08] = '\\b' |
390 _jsonmap[0x0c] = '\\f' | 391 _jsonmap[0x0c] = '\\f' |
391 _jsonmap[0x0d] = '\\r' | 392 _jsonmap[0x0d] = '\\r' |
392 | 393 _paranoidjsonmap = _jsonmap[:] |
393 def jsonescape(s): | 394 _jsonmap.extend(chr(x) for x in xrange(128, 256)) |
395 | |
396 def jsonescape(s, paranoid=False): | |
394 '''returns a string suitable for JSON | 397 '''returns a string suitable for JSON |
395 | 398 |
396 JSON is problematic for us because it doesn't support non-Unicode | 399 JSON is problematic for us because it doesn't support non-Unicode |
397 bytes. To deal with this, we take the following approach: | 400 bytes. To deal with this, we take the following approach: |
398 | 401 |
413 'a weird byte: \\xed\\xb3\\x9d' | 416 'a weird byte: \\xed\\xb3\\x9d' |
414 >>> jsonescape('utf-8: caf\\xc3\\xa9') | 417 >>> jsonescape('utf-8: caf\\xc3\\xa9') |
415 'utf-8: caf\\xc3\\xa9' | 418 'utf-8: caf\\xc3\\xa9' |
416 >>> jsonescape('') | 419 >>> jsonescape('') |
417 '' | 420 '' |
421 | |
422 If paranoid, non-ascii characters are also escaped. This is suitable for | |
423 web output. | |
424 | |
425 >>> jsonescape('escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) | |
426 'escape boundary: ~ \\\\u007f \\\\u0080' | |
427 >>> jsonescape('a weird byte: \\xdd', paranoid=True) | |
428 'a weird byte: \\\\udcdd' | |
429 >>> jsonescape('utf-8: caf\\xc3\\xa9', paranoid=True) | |
430 'utf-8: caf\\\\u00e9' | |
431 >>> jsonescape('non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) | |
432 'non-BMP: \\\\ud834\\\\udd1e' | |
418 ''' | 433 ''' |
419 | 434 |
420 return ''.join(_jsonmap[x] for x in bytearray(toutf8b(s))) | 435 if paranoid: |
436 jm = _paranoidjsonmap | |
437 else: | |
438 jm = _jsonmap | |
439 | |
440 u8chars = toutf8b(s) | |
441 try: | |
442 return ''.join(jm[x] for x in bytearray(u8chars)) # fast path | |
443 except IndexError: | |
444 pass | |
445 # non-BMP char is represented as UTF-16 surrogate pair | |
446 u16codes = array.array('H', u8chars.decode('utf-8').encode('utf-16')) | |
447 u16codes.pop(0) # drop BOM | |
448 return ''.join(jm[x] if x < 128 else '\\u%04x' % x for x in u16codes) | |
421 | 449 |
422 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] | 450 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
423 | 451 |
424 def getutf8char(s, pos): | 452 def getutf8char(s, pos): |
425 '''get the next full utf-8 character in the given string, starting at pos | 453 '''get the next full utf-8 character in the given string, starting at pos |