annotate mercurial/encoding.py @ 46858:85e3a630cad9

revlog: move the details of revlog "v2" index inside revlog.utils.constants the revlog module is quite large and this kind of format information would handy for other module. So let us start to gather this information about the format in a more appropriate place. We update various reference to this information to use the new "source of truth" in the process. Differential Revision: https://phab.mercurial-scm.org/D10305
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Mon, 05 Apr 2021 12:21:12 +0200
parents d4ba4d51f85f
children 53a864a60281
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
1 # encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
2 #
46819
d4ba4d51f85f contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents: 46319
diff changeset
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
4 #
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
5 # This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
6 # GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
7
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
8 from __future__ import absolute_import, print_function
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
9
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
10 import locale
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
11 import os
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
12 import unicodedata
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
13
43089
c59eb1560c44 py3: manually import getattr where it is needed
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
14 from .pycompat import getattr
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
15 from . import (
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
16 error,
32372
df448de7cf3b parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents: 32299
diff changeset
17 policy,
30030
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
18 pycompat,
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
19 )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
20
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
21 from .pure import charencode as charencodepure
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
22
43773
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43685
diff changeset
23 if pycompat.TYPE_CHECKING:
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
24 from typing import (
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
25 Any,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
26 Callable,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
27 List,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
28 Text,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
29 Type,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
30 TypeVar,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
31 Union,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
32 )
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
33
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
34 # keep pyflakes happy
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
35 for t in (Any, Callable, List, Text, Type, Union):
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
36 assert t
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
37
43680
3364a15f61f0 typing: fix forward reference in _Tlocalstr type bound
Yuya Nishihara <yuya@tcha.org>
parents: 43679
diff changeset
38 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
39
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
40 charencode = policy.importmod('charencode')
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
41
33926
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33925
diff changeset
42 isasciistr = charencode.isasciistr
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
43 asciilower = charencode.asciilower
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
44 asciiupper = charencode.asciiupper
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
45 _jsonescapeu8fast = charencode.jsonescapeu8fast
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
46
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
47 _sysstr = pycompat.sysstr
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
48
30030
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
49 if pycompat.ispy3:
28507
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
50 unichr = chr
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
51
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
52 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
53 # "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
54 # sanity.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
55 _ignore = [
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
56 unichr(int(x, 16)).encode("utf-8")
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
57 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
58 b"206a 206b 206c 206d 206e 206f feff".split()
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
59 ]
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
60 # verify the next function will work
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
61 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
62
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
63
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
64 def hfsignoreclean(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
65 # type: (bytes) -> bytes
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
66 """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
67
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
68 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
69 '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
70 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
71 '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
72 """
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
73 if b"\xe2" in s or b"\xef" in s:
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
74 for c in _ignore:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
75 s = s.replace(c, b'')
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
76 return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
77
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
78
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
79 # encoding.environ is provided read-only, which may not be used to modify
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
80 # the process environment
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
81 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
82 if not pycompat.ispy3:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
83 environ = os.environ # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
84 elif _nativeenviron:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
85 environ = os.environb # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
86 else:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
87 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
88 # and recreate it once encoding is settled
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
89 environ = {
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
90 k.encode('utf-8'): v.encode('utf-8')
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
91 for k, v in os.environ.items() # re-exports
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
92 }
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
93
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
94 _encodingrewrites = {
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
95 b'646': b'ascii',
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
96 b'ANSI_X3.4-1968': b'ascii',
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
97 }
38615
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
98 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
99 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
100 # https://bugs.python.org/issue13216
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
101 if pycompat.iswindows and not pycompat.ispy3:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
102 _encodingrewrites[b'cp65001'] = b'utf-8'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
103
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
104 try:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
105 encoding = environ.get(b"HGENCODING")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
106 if not encoding:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
107 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
108 encoding = _encodingrewrites.get(encoding, encoding)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
109 except locale.Error:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
110 encoding = b'ascii'
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
112 fallbackencoding = b'ISO-8859-1'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
113
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
114
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
115 class localstr(bytes):
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
116 """This class allows strings that are unmodified to be
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
117 round-tripped to the local encoding and back"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
118
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
119 def __new__(cls, u, l):
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
120 s = bytes.__new__(cls, l)
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
121 s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
122 return s
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
123
43773
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43685
diff changeset
124 if pycompat.TYPE_CHECKING:
43685
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
125 # pseudo implementation to help pytype see localstr() constructor
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
126 def __init__(self, u, l):
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
127 # type: (bytes, bytes) -> None
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
128 super(localstr, self).__init__(l)
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
129 self._utf8 = u
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43684
diff changeset
130
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
131 def __hash__(self):
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
132 return hash(self._utf8) # avoid collisions in local string space
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
133
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
134
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
135 class safelocalstr(bytes):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
136 """Tagged string denoting it was previously an internal UTF-8 string,
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
137 and can be converted back to UTF-8 losslessly
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
138
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
139 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
140 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
141 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
142 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
143 """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
144
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
145
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
146 def tolocal(s):
43681
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43680
diff changeset
147 # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
148 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
149 Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
150
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
151 All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
152 implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
153 other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
154 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
155 replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
156
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
157 The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
158 strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
159 round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
160
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
161 >>> u = b'foo: \\xc3\\xa4' # utf-8
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
162 >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
163 >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
164 'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
165 >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
166 'foo: \\xc3\\xa4'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
167 >>> u2 = b'foo: \\xc3\\xa1'
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
168 >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
169 >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
170 2
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
171 >>> b'foo: ?' in d
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
172 False
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
173 >>> l1 = b'foo: \\xe4' # historical latin1 fallback
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
174 >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
175 >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
176 'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
177 >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
178 'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
179 """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
180
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
181 if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
182 return s
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
183
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
184 try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
185 try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
186 # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
187 u = s.decode('UTF-8')
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
188 if encoding == b'UTF-8':
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
189 # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
190 return s
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
191 r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
192 if u == r.decode(_sysstr(encoding)):
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
193 # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
194 return safelocalstr(r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
195 return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
196 except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
197 # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
198 try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
199 u = s.decode(_sysstr(fallbackencoding))
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
200 r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
201 if u == r.decode(_sysstr(encoding)):
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
202 # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
203 return safelocalstr(r)
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
204 return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
205 except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
206 u = s.decode("utf-8", "replace") # last ditch
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
207 # can't round-trip
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
208 return u.encode(_sysstr(encoding), "replace")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
209 except LookupError as k:
45681
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
210 raise error.Abort(
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
211 pycompat.bytestr(k), hint=b"please check your locale settings"
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44452
diff changeset
212 )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
213
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
214
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
215 def fromlocal(s):
43637
7edc07fb890c encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents: 43506
diff changeset
216 # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
217 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
218 Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
219
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
220 We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
221 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
222 characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
223 'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
224 Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
225 """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
226
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
227 # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
228 if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
229 return s._utf8
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
230 if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
231 return s
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
232
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
233 try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
234 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
235 return u.encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
236 except UnicodeDecodeError as inst:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
237 sub = s[max(0, inst.start - 10) : inst.start + 10]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
238 raise error.Abort(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
239 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
240 )
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
241 except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
242 raise error.Abort(k, hint=b"please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
243
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
244
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
245 def unitolocal(u):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
246 # type: (Text) -> bytes
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
247 """Convert a unicode string to a byte string of local encoding"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
248 return tolocal(u.encode('utf-8'))
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
249
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
250
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
251 def unifromlocal(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
252 # type: (bytes) -> Text
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
253 """Convert a byte string of local encoding to a unicode string"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
254 return fromlocal(s).decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
255
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
256
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
257 def unimethod(bytesfunc):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
258 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
259 """Create a proxy method that forwards __unicode__() and __str__() of
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
260 Python 3 to __bytes__()"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
261
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
262 def unifunc(obj):
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
263 return unifromlocal(bytesfunc(obj))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
264
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
265 return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
266
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
267
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
268 # converter functions between native str and byte string. use these if the
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
269 # character encoding is not aware (e.g. exception message) or is known to
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
270 # be locale dependent (e.g. date formatting.)
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
271 if pycompat.ispy3:
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
272 strtolocal = unitolocal
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
273 strfromlocal = unifromlocal
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
274 strmethod = unimethod
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
275 else:
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
276
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
277 def strtolocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
278 # type: (str) -> bytes
43683
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43682
diff changeset
279 return s # pytype: disable=bad-return-type
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
280
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
281 def strfromlocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
282 # type: (bytes) -> str
43683
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43682
diff changeset
283 return s # pytype: disable=bad-return-type
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
284
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
285 strmethod = pycompat.identity
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
286
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
287 if not _nativeenviron:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
288 # now encoding and helper functions are available, recreate the environ
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
289 # dict to be exported to other modules
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
290 environ = {
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
291 tolocal(k.encode('utf-8')): tolocal(v.encode('utf-8'))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
292 for k, v in os.environ.items() # re-exports
44452
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43787
diff changeset
293 }
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
294
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
295 if pycompat.ispy3:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
296 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
297 # returns bytes.
39819
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
298 if pycompat.iswindows:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
299 # Python 3 on Windows issues a DeprecationWarning about using the bytes
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
300 # API when os.getcwdb() is called.
46319
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
301 #
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
302 # Additionally, py3.8+ uppercases the drive letter when calling
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
303 # os.path.realpath(), which is used on ``repo.root``. Since those
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
304 # strings are compared in various places as simple strings, also call
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
305 # realpath here. See https://bugs.python.org/issue40368
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45942
diff changeset
306 getcwd = lambda: strtolocal(os.path.realpath(os.getcwd())) # re-exports
39819
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
307 else:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
308 getcwd = os.getcwdb # re-exports
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
309 else:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
310 getcwd = os.getcwd # re-exports
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
311
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
312 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
313 _wide = _sysstr(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
314 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
315 and b"WFA"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
316 or b"WF"
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
317 )
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
318
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
319
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
320 def colwidth(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
321 # type: (bytes) -> int
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
322 """Find the column width of a string for display in the local encoding"""
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
323 return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
324
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
325
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
326 def ucolwidth(d):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
327 # type: (Text) -> int
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
328 """Find the column width of a Unicode string for display"""
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
329 eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
330 if eaw is not None:
32537
044f3d7eb9ae encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents: 32529
diff changeset
331 return sum([eaw(c) in _wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
332 return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
333
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
334
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
335 def getcols(s, start, c):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
336 # type: (bytes, int, int) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
337 """Use colwidth to find a c-column substring of s starting at byte
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
338 index start"""
38783
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
339 for x in pycompat.xrange(start + c, len(s)):
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
340 t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
341 if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
342 return t
43679
7cf332318f62 encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents: 43637
diff changeset
343 raise ValueError('substring not found')
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
344
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
345
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
346 def trim(s, width, ellipsis=b'', leftside=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
347 # type: (bytes, int, bytes, bool) -> bytes
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
348 """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
349
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
350 If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
351 'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
352
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
353 >>> from .node import bin
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
354 >>> def bprint(s):
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
355 ... print(pycompat.sysstr(s))
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
356 >>> ellipsis = b'+++'
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
357 >>> from . import encoding
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
358 >>> encoding.encoding = b'utf-8'
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
359 >>> t = b'1234567890'
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
360 >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
361 1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
362 >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
363 1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
364 >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
365 12345+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
366 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
367 +++67890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
368 >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
369 12345678
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
370 >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
371 34567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
372 >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
373 +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
374 >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
375 +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
376 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
34135
e9e225f16932 doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents: 34131
diff changeset
377 >>> t = u.encode(pycompat.sysstr(encoding.encoding))
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
378 >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
379 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
380 >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
381 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
382 >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
383 \xe3\x81\x82\xe3\x81\x84+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
384 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
385 +++\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
386 >>> bprint(trim(t, 5))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
387 \xe3\x81\x82\xe3\x81\x84
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
388 >>> bprint(trim(t, 5, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
389 \xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
390 >>> bprint(trim(t, 4, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
391 +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
392 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
393 +++
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
394 >>> t = bin(b'112233445566778899aa') # invalid byte sequence
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
395 >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
396 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
397 >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
398 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
399 >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
400 \x11\x22\x33\x44\x55+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
401 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
402 +++\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
403 >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
404 \x11\x22\x33\x44\x55\x66\x77\x88
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
405 >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
406 \x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
407 >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
408 +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
409 >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
410 +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
411 """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
412 try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
413 u = s.decode(_sysstr(encoding))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
414 except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
415 if len(s) <= width: # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
416 return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
417 width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
418 if width <= 0: # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
419 return ellipsis[: width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
420 if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
421 return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
422 return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
423
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
424 if ucolwidth(u) <= width: # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
425 return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
426
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
427 width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
428 if width <= 0: # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
429 return ellipsis[: width + len(ellipsis)]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
430
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
431 if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
432 uslice = lambda i: u[i:]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
433 concat = lambda s: ellipsis + s
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
434 else:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
435 uslice = lambda i: u[:-i]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
436 concat = lambda s: s + ellipsis
38783
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
437 for i in pycompat.xrange(1, len(u)):
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
438 usub = uslice(i)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
439 if ucolwidth(usub) <= width:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
440 return concat(usub.encode(_sysstr(encoding)))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
441 return ellipsis # no enough room for multi-column characters
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
442
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
443
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
444 def lower(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
445 # type: (bytes) -> bytes
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
446 """best-effort encoding-aware case-folding of local string s"""
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
447 try:
22779
d9585dda63c3 encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents: 22778
diff changeset
448 return asciilower(s)
17235
3745ae495ce5 encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents: 16493
diff changeset
449 except UnicodeDecodeError:
16387
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
450 pass
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
451 try:
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
452 if isinstance(s, localstr):
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
453 u = s._utf8.decode("utf-8")
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
454 else:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
455 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
456
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
457 lu = u.lower()
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
458 if u == lu:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
459 return s # preserve localstring
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
460 return lu.encode(_sysstr(encoding))
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
461 except UnicodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
462 return s.lower() # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
463 except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
464 raise error.Abort(k, hint=b"please check your locale settings")
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
465
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
466
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
467 def upper(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
468 # type: (bytes) -> bytes
43787
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43773
diff changeset
469 """best-effort encoding-aware case-folding of local string s"""
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
470 try:
24578
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
471 return asciiupper(s)
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
472 except UnicodeDecodeError:
24597
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
473 return upperfallback(s)
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
474
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
475
24597
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
476 def upperfallback(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
477 # type: (Any) -> Any
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
478 try:
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
479 if isinstance(s, localstr):
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
480 u = s._utf8.decode("utf-8")
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
481 else:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
482 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
483
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
484 uu = u.upper()
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
485 if u == uu:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
486 return s # preserve localstring
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
487 return uu.encode(_sysstr(encoding))
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
488 except UnicodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
489 return s.upper() # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
490 except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
491 raise error.Abort(k, hint=b"please check your locale settings")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
492
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
493
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
494 class normcasespecs(object):
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
495 """what a platform's normcase does to ASCII strings
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
496
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
497 This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
498 on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
499
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
500 lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
501 upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
502 other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
503
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
504 This should be kept in sync with normcase_spec in util.h."""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
505
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
506 lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
507 upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
508 other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
509
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
510
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
511 def jsonescape(s, paranoid=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
512 # type: (Any, Any) -> Any
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
513 """returns a string suitable for JSON
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
514
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
515 JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
516 bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
517
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
518 - localstr/safelocalstr objects are converted back to UTF-8
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
519 - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
520 - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
521 - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
522
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
523 (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
524
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
525 >>> jsonescape(b'this is a test')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
526 'this is a test'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
527 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
27881
ffa599f3f503 encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 27699
diff changeset
528 'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
529 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
530 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
531 >>> jsonescape(b'a weird byte: \\xdd')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
532 'a weird byte: \\xed\\xb3\\x9d'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
533 >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
534 'utf-8: caf\\xc3\\xa9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
535 >>> jsonescape(b'')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
536 ''
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
537
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
538 If paranoid, non-ascii and common troublesome characters are also escaped.
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
539 This is suitable for web output.
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
540
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
541 >>> s = b'escape characters: \\0 \\x0b \\x7f'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
542 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
543 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
544 >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
545 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
546 'escape boundary: ~ \\\\u007f \\\\u0080'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
547 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
548 'a weird byte: \\\\udcdd'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
549 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
550 'utf-8: caf\\\\u00e9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
551 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
552 'non-BMP: \\\\ud834\\\\udd1e'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
553 >>> jsonescape(b'<foo@example.org>', paranoid=True)
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
554 '\\\\u003cfoo@example.org\\\\u003e'
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
555 """
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
556
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
557 u8chars = toutf8b(s)
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
558 try:
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
559 return _jsonescapeu8fast(u8chars, paranoid)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
560 except ValueError:
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
561 pass
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
562 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
563
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
564
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
565 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
566 # bytes are mapped to that range.
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
567 if pycompat.ispy3:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
568 _utf8strict = r'surrogatepass'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
569 else:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
570 _utf8strict = r'strict'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
571
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
572 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
573
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
574
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
575 def getutf8char(s, pos):
43682
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
576 # type: (bytes, int) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
577 """get the next full utf-8 character in the given string, starting at pos
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
578
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
579 Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
580 utf-8 character.
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
581 """
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
582
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
583 # find how many bytes to attempt decoding from first nibble
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
584 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
585 if not l: # ascii
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
586 return s[pos : pos + 1]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
587
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
588 c = s[pos : pos + l]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
589 # validate with attempted decode
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
590 c.decode("utf-8", _utf8strict)
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
591 return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
592
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
593
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
594 def toutf8b(s):
43682
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
595 # type: (bytes) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
596 """convert a local, possibly-binary string into UTF-8b
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
597
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
598 This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
599 with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
600 arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
601 what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
602
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
603 If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
604 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
605 uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
606
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
607 Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
608
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
609 - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
610 by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
611 - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
612 be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
613 - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
614 localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
615 Unicode data they want
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
616 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
617 - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
618 filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
619
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
620 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
621 arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
622 re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
623 internal surrogate encoding as a UTF-8 string.)
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
624 """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
625
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
626 if isinstance(s, localstr):
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
627 # assume that the original UTF-8 sequence would never contain
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
628 # invalid characters in U+DCxx range
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
629 return s._utf8
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
630 elif isinstance(s, safelocalstr):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
631 # already verified that s is non-lossy in legacy encoding, which
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
632 # shouldn't contain characters in U+DCxx range
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
633 return fromlocal(s)
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
634 elif isasciistr(s):
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
635 return s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
636 if b"\xed" not in s:
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
637 try:
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
638 s.decode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
639 return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
640 except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
641 pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
642
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
643 s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
644 r = b""
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
645 pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
646 l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
647 while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
648 try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
649 c = getutf8char(s, pos)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
650 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
651 # have to re-escape existing U+DCxx characters
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
652 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
653 pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
654 else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
655 pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
656 except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
657 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
658 pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
659 r += c
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
660 return r
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
661
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
662
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
663 def fromutf8b(s):
43681
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43680
diff changeset
664 # type: (bytes) -> bytes
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
665 """Given a UTF-8b string, return a local, possibly-binary string.
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
666
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
667 return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
668 is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
669 that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
670
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
671 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
672 >>> m = b"\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
673 >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
674 '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
675 >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
676 True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
677 >>> roundtrip(b"\\xc2\\xc2\\x80")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
678 True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
679 >>> roundtrip(b"\\xef\\xbf\\xbd")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
680 True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
681 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
682 True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
683 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
684 True
45942
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
685 """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
686
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
687 if isasciistr(s):
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
688 return s
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
689 # fast path - look for uDxxx prefixes in s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
690 if b"\xed" not in s:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
691 return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
692
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
693 # We could do this with the unicode type but some Python builds
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
694 # use UTF-16 internally (issue5031) which causes non-BMP code
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
695 # points to be escaped. Instead, we use our handy getutf8char
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
696 # helper again to walk the string without "decoding" it.
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
697
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
698 s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
699 r = b""
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
700 pos = 0
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
701 l = len(s)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
702 while pos < l:
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
703 c = getutf8char(s, pos)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
704 pos += len(c)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
705 # unescape U+DCxx characters
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
706 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
707 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
708 r += c
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
709 return r