hg-stable: mercurial/encoding.py@707c3804e607 (annotated)

8226 8b2cd04a6e97 put license and copyright info into comment blocks Martin Geisler <mg@lazybytes.net> parents: 8225 diff changeset	1	# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks Martin Geisler <mg@lazybytes.net> parents: 8225 diff changeset	2	#
8b2cd04a6e97 put license and copyright info into comment blocks Martin Geisler <mg@lazybytes.net> parents: 8225 diff changeset	3	# Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
8b2cd04a6e97 put license and copyright info into comment blocks Martin Geisler <mg@lazybytes.net> parents: 8225 diff changeset	4	#
8b2cd04a6e97 put license and copyright info into comment blocks Martin Geisler <mg@lazybytes.net> parents: 8225 diff changeset	5	# This software may be used and distributed according to the terms of the
10263 25e572394f5c Update license to GPLv2+ Matt Mackall <mpm@selenic.com> parents: 9574 diff changeset	6	# GNU General Public License version 2 or any later version.
7948 de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	7
34152 a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed Yuya Nishihara <yuya@tcha.org> parents: 34151 diff changeset	8	from __future__ import absolute_import, print_function
27355 b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	9
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	10	import locale
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	11	import os
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	12	import unicodedata
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	13
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	14	from . import (
b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	15	error,
32411 df448de7cf3b parsers: switch to policy importer Yuya Nishihara <yuya@tcha.org> parents: 32339 diff changeset	16	policy,
30031 0f6d6fdd3c2a pycompat: provide 'ispy3' constant Yuya Nishihara <yuya@tcha.org> parents: 28508 diff changeset	17	pycompat,
27355 b479fc425a81 encoding: use absolute_import Gregory Szorc <gregory.szorc@gmail.com> parents: 26963 diff changeset	18	)
7948 de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	19
33942 b9101467d88b encoding: extract stub for fast JSON escape Yuya Nishihara <yuya@tcha.org> parents: 33873 diff changeset	20	from .pure import (
b9101467d88b encoding: extract stub for fast JSON escape Yuya Nishihara <yuya@tcha.org> parents: 33873 diff changeset	21	charencode as charencodepure,
b9101467d88b encoding: extract stub for fast JSON escape Yuya Nishihara <yuya@tcha.org> parents: 33873 diff changeset	22	)
b9101467d88b encoding: extract stub for fast JSON escape Yuya Nishihara <yuya@tcha.org> parents: 33873 diff changeset	23
# Resolve the charencode implementation through the policy importer rather
# than importing it directly.
charencode = policy.importmod(r'charencode')

# Re-export the charencode helpers under this module's own names.
isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast

# Short alias for pycompat.sysstr; it is applied to the encoding names and
# error modes handed to encode()/decode() further down.
_sysstr = pycompat.sysstr

if pycompat.ispy3:
    # Python 3 has no unichr() builtin; chr() takes its place.
    unichr = chr

# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
# Each entry of _ignore is the UTF-8 encoded form of one of the code points
# listed (in hexadecimal) below.
_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
           "200c 200d 200e 200f 202a 202b 202c 202d 202e "
           "206a 206b 206c 206d 206e 206f feff".split()]
# verify the next function will work: hfsignoreclean() only scans strings that
# contain the byte \xe2 or \xef, so every entry has to start with one of them
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
23596 885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters Augie Fackler <raf@durin42.com> parents: 22973 diff changeset	44
def hfsignoreclean(s):
    """Remove codepoints ignored by HFS+ from s.

    ``s`` is a UTF-8 encoded byte string.  Every occurrence of a sequence
    listed in the module-level ``_ignore`` table is deleted; a string that
    contains none of them is returned unchanged.

    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
    '.hg'
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
    '.hg'
    """
    # Every entry of _ignore starts with byte 0xe2 or 0xef (asserted where the
    # table is built), so a string holding neither byte cannot contain any of
    # them and the replace loop is skipped.  The literals are explicit bytes so
    # the test and the replacement work on a bytes argument under Python 3 as
    # well as Python 2.
    if b"\xe2" in s or b"\xef" in s:
        for c in _ignore:
            s = s.replace(c, b'')
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters Augie Fackler <raf@durin42.com> parents: 22973 diff changeset	57
# encoding.environ is a read-only view of the process environment with byte
# string keys and values; it must not be used to modify the process
# environment.
_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
if not _nativeenviron:
    # Python 3 without a bytes environment: build a bytes copy by hand.  The
    # preferred encoding isn't known yet, so utf-8 is used to avoid unicode
    # errors; the mapping is meant to be recreated once encoding is settled.
    environ = dict((k.encode(u'utf-8'), v.encode(u'utf-8'))
                   for k, v in os.environ.items()) # re-exports
elif pycompat.ispy3:
    environ = os.environb # re-exports
else:
    environ = os.environ # re-exports

# Encoding names that are swapped for another name before being used.
_encodingrewrites = {
    '646': 'ascii',
    'ANSI_X3.4-1968': 'ascii',
}
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
# https://bugs.python.org/issue13216
if not pycompat.ispy3 and pycompat.iswindows:
    _encodingrewrites['cp65001'] = 'utf-8'

# Settings that do not depend on the detected encoding.
fallbackencoding = 'ISO-8859-1'
encodingmode = environ.get("HGENCODINGMODE", "strict")

# HGENCODING wins when set; otherwise ask the locale module and apply the
# rewrite table to its answer.  A broken locale falls back to ascii.
try:
    encoding = environ.get("HGENCODING")
    if not encoding:
        encoding = locale.getpreferredencoding().encode('ascii') or 'ascii'
        encoding = _encodingrewrites.get(encoding, encoding)
except locale.Error:
    encoding = 'ascii'
de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	90
class localstr(bytes):
    """Local-encoding byte string that remembers the UTF-8 it came from.

    The bytes value of an instance is ``l`` (the local representation); the
    matching UTF-8 string ``u`` is kept on the ``_utf8`` attribute so that an
    unmodified string can be round-tripped to the local encoding and back.
    """

    def __new__(cls, u, l):
        inst = super(localstr, cls).__new__(cls, l)
        inst._utf8 = u
        return inst

    def __hash__(self):
        # Hash the UTF-8 form, not the local bytes, to avoid collisions in
        # local string space.
        return hash(self._utf8)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings Matt Mackall <mpm@selenic.com> parents: 12866 diff changeset	100
class safelocalstr(bytes):
    """Tagged string denoting it was previously an internal UTF-8 string,
    and can be converted back to UTF-8 losslessly

    The class defines no attributes or methods of its own; it only marks the
    value by its type.  Instances therefore compare and hash exactly like the
    plain bytes they hold, which the examples below check.

    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
    """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string Yuya Nishihara <yuya@tcha.org> parents: 37990 diff changeset	110
def tolocal(s):
    """
    Convert a string from internal UTF-8 to local encoding

    Internal strings are expected to be UTF-8, but repositories written
    before locale support existed may hold latin1 or other character sets.
    Decoding is therefore tried strictly as UTF-8 first, then with the
    fallback encoding (Latin-1), and as a last resort as UTF-8 with unknown
    characters replaced.

    A pure-ASCII string is returned as is.  When the local form decodes back
    to the same text it is returned as a safelocalstr; when the conversion is
    lossy a localstr is returned, which keeps the known UTF-8 encoding next
    to the local representation so that fromlocal() can restore it exactly.
    The last-resort result is plain bytes and cannot be round-tripped.

    An unknown encoding name is reported as error.Abort.

    >>> u = b'foo: \\xc3\\xa4' # utf-8
    >>> l = tolocal(u)
    >>> l
    'foo: ?'
    >>> fromlocal(l)
    'foo: \\xc3\\xa4'
    >>> u2 = b'foo: \\xc3\\xa1'
    >>> d = { l: 1, tolocal(u2): 2 }
    >>> len(d) # no collision
    2
    >>> b'foo: ?' in d
    False
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
    >>> l = tolocal(l1)
    >>> l
    'foo: ?'
    >>> fromlocal(l) # magically in utf-8
    'foo: \\xc3\\xa4'
    """

    if isasciistr(s):
        return s

    try:
        try:
            # a strict decode doubles as the check that s really is UTF-8
            text = s.decode('UTF-8')
            if encoding == 'UTF-8':
                # nothing to transcode
                return s
            local = text.encode(_sysstr(encoding), u"replace")
            if text != local.decode(_sysstr(encoding)):
                # lossy: carry the original UTF-8 along with the local form
                return localstr(s, local)
            # local is a safe, non-lossy encoding of s
            return safelocalstr(local)
        except UnicodeDecodeError:
            # only expected when looking at an ancient changeset
            try:
                text = s.decode(_sysstr(fallbackencoding))
                local = text.encode(_sysstr(encoding), u"replace")
                if text != local.decode(_sysstr(encoding)):
                    # lossy: remember the text re-encoded as UTF-8
                    return localstr(text.encode('UTF-8'), local)
                # local is a safe, non-lossy encoding of s
                return safelocalstr(local)
            except UnicodeDecodeError:
                # last ditch; the result can't round-trip
                text = s.decode("utf-8", "replace")
                return text.encode(_sysstr(encoding), u"replace")
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
7948 de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	175
def fromlocal(s):
    """
    Convert a string from the local character encoding to UTF-8

    Decoding uses the error mode taken from HGENCODINGMODE, which defaults
    to 'strict'; in that mode an undecodable character aborts with an error
    message.  The other modes are 'replace', which substitutes a special
    Unicode character for anything unknown, and 'ignore', which drops it.

    A localstr gives back the UTF-8 string it carries, and a pure-ASCII
    string is returned unchanged.  Decoding failures and unknown encoding
    names are reported as error.Abort.
    """

    # a localstr already knows its exact UTF-8 form: lossless round-trip
    if isinstance(s, localstr):
        return s._utf8
    if isasciistr(s):
        return s

    try:
        decoded = s.decode(_sysstr(encoding), _sysstr(encodingmode))
        return decoded.encode("utf-8")
    except UnicodeDecodeError as inst:
        # quote up to ten bytes on either side of the failing position
        begin = max(0, inst.start - 10)
        nearby = s[begin:inst.start + 10]
        raise error.Abort("decoding near '%s': %s!"
                          % (nearby, pycompat.bytestr(inst)))
    except LookupError as k:
        raise error.Abort(k, hint="please check your locale settings")
7948 de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	202
def unitolocal(u):
    """Convert a unicode string to a byte string of local encoding

    The text is serialized as UTF-8 first and the resulting bytes are
    handed to tolocal().
    """
    utf8bytes = u.encode('utf-8')
    return tolocal(utf8bytes)
067add650129 encoding: factor out unicode variants of from/tolocal() Yuya Nishihara <yuya@tcha.org> parents: 30627 diff changeset	206
def unifromlocal(s):
    """Convert a byte string of local encoding to a unicode string

    The bytes returned by fromlocal() are decoded as UTF-8.
    """
    utf8bytes = fromlocal(s)
    return utf8bytes.decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal() Yuya Nishihara <yuya@tcha.org> parents: 30627 diff changeset	210
def unimethod(bytesfunc):
    """Create a proxy method that forwards __unicode__() and __str__() of
    Python 3 to __bytes__()

    The returned function calls ``bytesfunc`` on the object and converts
    the bytes it produces with unifromlocal().
    """
    def unifunc(obj):
        rawbytes = bytesfunc(obj)
        return unifromlocal(rawbytes)
    return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__() Yuya Nishihara <yuya@tcha.org> parents: 32570 diff changeset	217
# Converter functions between native str and byte string. Use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
#
# On Python 3 the unicode variants defined above do the work; otherwise the
# pycompat.identity stub is used for all three names.
strtolocal = unitolocal if pycompat.ispy3 else pycompat.identity
strfromlocal = unifromlocal if pycompat.ispy3 else pycompat.identity
strmethod = unimethod if pycompat.ispy3 else pycompat.identity
31457 6419cd243017 encoding: add converter between native str and byte string Yuya Nishihara <yuya@tcha.org> parents: 31456 diff changeset	229
if not _nativeenviron:
    # Now that encoding and the helper functions are available, rebuild the
    # environ dict that is exported to other modules: every key and value of
    # os.environ is encoded as UTF-8 and passed through tolocal().
    environ = {
        tolocal(k.encode(u'utf-8')): tolocal(v.encode(u'utf-8'))
        for k, v in os.environ.items()  # re-exports
    }
30034 e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes Yuya Nishihara <yuya@tcha.org> parents: 30033 diff changeset	235
if not pycompat.ispy3:
    getcwd = os.getcwd  # re-exports
elif pycompat.iswindows:
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
    # API when os.getcwdb() is called, so the str result of os.getcwd() is
    # converted with strtolocal() instead.
    getcwd = lambda: strtolocal(os.getcwd())  # re-exports
else:
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
    # returns bytes.
    getcwd = os.getcwdb  # re-exports
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API) Matt Harbison <matt_harbison@yahoo.com> parents: 38823 diff changeset	247
# How to treat ambiguous-width characters. Set HGENCODINGAMBIGUOUS to 'wide'
# to treat them as wide. _wide holds the east_asian_width() categories that
# ucolwidth() counts as two columns.
if environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide":
    _wide = _sysstr("WFA")
else:
    _wide = _sysstr("WF")
12866 eddc20306ab6 encoding: default ambiguous character to narrow Matt Mackall <mpm@selenic.com> parents: 12770 diff changeset	251
def colwidth(s):
    "Find the column width of a string for display in the local encoding"
    # decode with the 'replace' handler so undecodable bytes do not raise
    decoded = s.decode(_sysstr(encoding), u'replace')
    return ucolwidth(decoded)
15066 24efa83d81cb i18n: calculate terminal columns by width information of each characters FUJIWARA Katsunori <foozy@lares.dti.ne.jp> parents: 14951 diff changeset	255
def ucolwidth(d):
    """Find the column width of a Unicode string for display

    Characters whose east_asian_width() category is listed in _wide count
    as two columns, all others as one. If unicodedata does not provide
    east_asian_width, the plain length of ``d`` is returned.
    """
    eaw = getattr(unicodedata, 'east_asian_width', None)
    if eaw is None:
        return len(d)
    return sum(2 if eaw(c) in _wide else 1 for c in d)
de377b1a9a84 move encoding bits from util to encoding Matt Mackall <mpm@selenic.com> parents: diff changeset	262
def getcols(s, start, c):
    '''Use colwidth to find a c-column substring of s starting at byte
    index start

    Candidate end offsets run from ``start + c`` up to, but not including,
    ``len(s)``. The first slice whose colwidth() equals ``c`` is returned;
    if no candidate matches, None is returned.
    '''
    end = start + c
    limit = len(s)
    while end < limit:
        piece = s[start:end]
        if colwidth(piece) == c:
            return piece
        end += 1
    return None
16c129b0f465 encoding: add getcols to extract substrings based on column width Matt Mackall <mpm@selenic.com> parents: 15142 diff changeset	270
def trim(s, width, ellipsis='', leftside=False):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    If 'leftside' is True, left side of string 's' is trimmed.
    'ellipsis' is always placed at trimmed side.

    When 's' cannot be decoded in the local encoding, it is trimmed
    byte-wise, counting each byte as one column.

    >>> from .node import bin
    >>> def bprint(s):
    ...     print(pycompat.sysstr(s))
    >>> ellipsis = b'+++'
    >>> from . import encoding
    >>> encoding.encoding = b'utf-8'
    >>> t = b'1234567890'
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    1234567890
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    12345+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++67890
    >>> bprint(trim(t, 8))
    12345678
    >>> bprint(trim(t, 8, leftside=True))
    34567890
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \xe3\x81\x82\xe3\x81\x84+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 5))
    \xe3\x81\x82\xe3\x81\x84
    >>> bprint(trim(t, 5, leftside=True))
    \xe3\x81\x88\xe3\x81\x8a
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
    +++
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
    \x11\x22\x33\x44\x55+++
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
    +++\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 8))
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> bprint(trim(t, 8, leftside=True))
    \x33\x44\x55\x66\x77\x88\x99\xaa
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
    +++
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
    +
    """
    try:
        decoded = s.decode(_sysstr(encoding))
    except UnicodeDecodeError:
        decoded = None # fall back to byte-wise trimming below

    if decoded is None:
        curwidth = len(s)
    else:
        curwidth = ucolwidth(decoded)
    if curwidth <= width: # trimming is not needed
        return s

    room = width - len(ellipsis)
    if room <= 0: # no enough room even for ellipsis
        return ellipsis[:width]

    if decoded is None:
        # undecodable input: cut raw bytes on the requested side
        if leftside:
            return ellipsis + s[-room:]
        return s[:room] + ellipsis

    # drop one more character per iteration from the trimmed side until
    # the remainder fits into the columns left over for it
    for dropped in pycompat.xrange(1, len(decoded)):
        if leftside:
            kept = decoded[dropped:]
        else:
            kept = decoded[:-dropped]
        if ucolwidth(kept) <= room:
            encoded = kept.encode(_sysstr(encoding))
            if leftside:
                return ellipsis + encoded
            return encoded + ellipsis
    return ellipsis # no enough room for multi-column characters
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns FUJIWARA Katsunori <foozy@lares.dti.ne.jp> parents: 18378 diff changeset	366
def lower(s):
    '''best-effort encoding-aware case-folding of local string s

    asciilower() is tried first. When it raises UnicodeDecodeError, the
    string is decoded (from the cached UTF-8 form of a localstr, otherwise
    from the local encoding), lowercased as Unicode and encoded back to the
    local encoding. If that conversion fails with a UnicodeError, plain
    byte-wise lower() is used instead. A LookupError (e.g. unknown encoding
    name) is turned into error.Abort.
    '''
    try:
        return asciilower(s)
    except UnicodeDecodeError:
        # fall through to the encoding-aware path below
        pass
    try:
        if isinstance(s, localstr):
            ustr = s._utf8.decode("utf-8")
        else:
            ustr = s.decode(_sysstr(encoding), _sysstr(encodingmode))

        folded = ustr.lower()
        if folded != ustr:
            return folded.encode(_sysstr(encoding))
        # nothing to fold: return the very same object so that a localstr
        # is preserved
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.lower()
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding FUJIWARA Katsunori <foozy@lares.dti.ne.jp> parents: 15143 diff changeset	387
def upper(s):
    '''best-effort encoding-aware case-folding of local string s

    asciiupper() is tried first; if it raises UnicodeDecodeError the work
    is delegated to upperfallback().
    '''
    try:
        folded = asciiupper(s)
    except UnicodeDecodeError:
        folded = upperfallback(s)
    return folded
b4258d5a1600 encoding.upper: factor out fallback code Siddharth Agarwal <sid0@fb.com> parents: 24593 diff changeset	394
def upperfallback(s):
    '''uppercase local string s by going through Unicode

    The string is decoded (from the cached UTF-8 form of a localstr,
    otherwise from the local encoding), uppercased and encoded back to the
    local encoding. If that fails with a UnicodeError, plain byte-wise
    upper() is used instead. A LookupError (e.g. unknown encoding name) is
    turned into error.Abort.
    '''
    try:
        ustr = (s._utf8.decode("utf-8") if isinstance(s, localstr)
                else s.decode(_sysstr(encoding), _sysstr(encodingmode)))

        folded = ustr.upper()
        if folded != ustr:
            return folded.encode(_sysstr(encoding))
        # nothing to fold: return the very same object so that a localstr
        # is preserved
        return s
    except UnicodeError:
        # we don't know how to fold this except in ASCII
        return s.upper()
    except LookupError as inst:
        raise error.Abort(inst, hint="please check your locale settings")
16133 84c58da3a1f8 encoding: introduce utf8-b helpers Matt Mackall <mpm@selenic.com> parents: 15769 diff changeset	410
class normcasespecs(object):
    '''enumeration of what a platform's normcase does to ASCII strings

    The value is chosen per platform and has to agree with what normcase
    really does on that platform.

    lower: normcase lowercases ASCII strings
    upper: normcase uppercases ASCII strings
    other: the fallback function should always be called

    Keep these values in sync with normcase_spec in util.h.'''
    # normcase lowercases ASCII strings
    lower = -1
    # normcase uppercases ASCII strings
    upper = 1
    # neither of the above: always go through the fallback function
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings Siddharth Agarwal <sid0@fb.com> parents: 24578 diff changeset	425
def jsonescape(s, paranoid=False):
    '''returns a string suitable for JSON

    JSON has no way to carry non-Unicode bytes, which is a problem for us.
    The approach taken here is:

    - localstr/safelocalstr objects are converted back to UTF-8
    - valid UTF-8/ASCII strings are passed as-is
    - other strings are converted to UTF-8b surrogate encoding
    - apply JSON-specified string escaping

    (escapes are doubled in these tests)

    >>> jsonescape(b'this is a test')
    'this is a test'
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
    >>> jsonescape(b'a weird byte: \\xdd')
    'a weird byte: \\xed\\xb3\\x9d'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
    'utf-8: caf\\xc3\\xa9'
    >>> jsonescape(b'')
    ''

    If paranoid, non-ascii and common troublesome characters are also escaped.
    This is suitable for web output.

    >>> s = b'escape characters: \\0 \\x0b \\x7f'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
    'escape boundary: ~ \\\\u007f \\\\u0080'
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
    'a weird byte: \\\\udcdd'
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
    'utf-8: caf\\\\u00e9'
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
    'non-BMP: \\\\ud834\\\\udd1e'
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
    '\\\\u003cfoo@example.org\\\\u003e'
    '''

    # normalize the input to UTF-8b first; both escapers work on that form
    utf8b = toutf8b(s)
    try:
        return _jsonescapeu8fast(utf8b, paranoid)
    except ValueError:
        # the fast escaper rejected the input; use the pure fallback below
        pass
    return charencodepure.jsonescapeu8fallback(utf8b, paranoid)
22426 f6b533e64ed6 encoding: add json escaping filter Matt Mackall <mpm@selenic.com> parents: 22425 diff changeset	477
# Invalid UTF-8 bytes are mapped to the U+DCxx range, so codes in that range
# have to pass through decode/encode transparently.
_utf8strict = r'surrogatepass' if pycompat.ispy3 else r'strict'

# Number of bytes to attempt decoding, indexed by the high nibble of the
# first byte; 0 marks a plain ASCII byte (see getutf8char()).
_utf8len = [0] * 8 + [1] * 4 + [2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper Matt Mackall <mpm@selenic.com> parents: 25660 diff changeset	486
def getutf8char(s, pos):
    '''get the next full utf-8 character in the given string, starting at pos

    The returned value is always a slice of s, so it has the same type as
    a slice of s rather than being an integer.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    '''

    lead = s[pos:pos + 1]
    # the high nibble of the first byte tells how many bytes to try decoding
    width = _utf8len[ord(lead) >> 4]
    if width == 0:
        # ascii
        return lead

    char = s[pos:pos + width]
    # the decoded value is discarded: decoding is only done to validate
    char.decode("utf-8", _utf8strict)
    return char
cf47bdb2183c encoding: add getutf8char helper Matt Mackall <mpm@selenic.com> parents: 25660 diff changeset	503
def toutf8b(s):
    '''convert a local, possibly-binary string into UTF-8b

    This is intended as a generic method to preserve data when working
    with schemes like JSON and XML that have no provision for
    arbitrary byte strings. As Mercurial often doesn't know
    what encoding data is in, we use so-called UTF-8b.

    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can
      be round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original. Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    '''

    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s
    if "\xed" not in s:
        # without a "\xed" byte, valid UTF-8 can be returned untouched
        try:
            s.decode('utf-8', _utf8strict)
        except UnicodeDecodeError:
            pass
        else:
            return s

    s = pycompat.bytestr(s)
    pieces = []
    pos = 0
    end = len(s)
    while pos < end:
        try:
            c = getutf8char(s, pos)
            # existing U+DCxx characters have to be re-escaped
            escape = "\xed\xb0\x80" <= c <= "\xed\xb3\xbf"
        except UnicodeDecodeError:
            # no valid character starts here: escape this byte
            escape = True
        if escape:
            # map the single byte at pos to U+DC00 + byte value and
            # advance by one byte only
            c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        else:
            pos += len(c)
        pieces.append(c)
    return "".join(pieces)
16133 84c58da3a1f8 encoding: introduce utf8-b helpers Matt Mackall <mpm@selenic.com> parents: 15769 diff changeset	570
def fromutf8b(s):
    '''Given a UTF-8b string, return a local, possibly-binary string.

    This undoes toutf8b: every escaped U+DCxx character found in s is
    replaced by the single byte it stands for (the low 8 bits of the
    code point), and every other character is copied through unchanged.
    This is a round-trip process for strings like filenames, but metadata
    that was passed through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    '''

    # pure ASCII input cannot contain an escape; hand it back untouched
    if isasciistr(s):
        return s
    # fast path - every escaped U+DCxx character starts with an 0xed
    # byte, so without one there is nothing to unescape
    if "\xed" not in s:
        return s

    # We could do this with the unicode type but some Python builds
    # use UTF-16 internally (issue5031) which causes non-BMP code
    # points to be escaped. Instead, we use our handy getutf8char
    # helper again to walk the string without "decoding" it.

    s = pycompat.bytestr(s)
    # collect the pieces and join them once at the end rather than
    # growing the result with repeated concatenation
    chunks = []
    pos = 0
    end = len(s)
    while pos < end:
        c = getutf8char(s, pos)
        # advance by the length of the character as it appears in the
        # input, before c is possibly replaced by its unescaped byte
        pos += len(c)
        # unescape U+DCxx characters
        if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
        chunks.append(c)
    return "".join(chunks)

author	Martin von Zweigbergk <martinvonz@google.com>
	Fri, 28 Sep 2018 12:56:57 -0700
changeset 39967	707c3804e607
parent 39844	9e8fcd2e78c1
child 41841	25694a78e4a4
permissions	-rw-r--r--