Mercurial > hg
annotate mercurial/encoding.py @ 52164:e01e84e5e426
rust-revlog: add a Rust-only `InnerRevlog`
This mirrors the Python `InnerRevlog` and will be used in a future patch
to replace said Python implementation. This allows us to start doing more
things in pure Rust, in particular reading and writing operations.
A lot of changes have to be introduced all at once, it wouldn't be very
useful to separate this patch IMO since all of them are either interlocked
or only useful with the rest.
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Thu, 10 Oct 2024 10:34:51 +0200 |
parents | 54d9f496f07a |
children |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
46819
d4ba4d51f85f
contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents:
46319
diff
changeset
|
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others |
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
51863
f4733654f144
typing: add `from __future__ import annotations` to most files
Matt Harbison <matt_harbison@yahoo.com>
parents:
51722
diff
changeset
|
8 from __future__ import annotations |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
9 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
10 import locale |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
11 import os |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
12 import re |
51285
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
13 import typing |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
14 import unicodedata |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
15 |
51285
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
16 from typing import ( |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
17 Any, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
18 Callable, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
19 Text, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
20 TypeVar, |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
21 ) |
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
22 |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
23 from . import ( |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
24 error, |
32372
df448de7cf3b
parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents:
32299
diff
changeset
|
25 policy, |
30030
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
26 pycompat, |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
27 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
28 |
51940
54d9f496f07a
interfaces: introduce and use a protocol class for the `charencoding` module
Matt Harbison <matt_harbison@yahoo.com>
parents:
51863
diff
changeset
|
29 from .interfaces import modules as intmod |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
30 from .pure import charencode as charencodepure |
33924
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33852
diff
changeset
|
31 |
51285
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
32 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr') |
43496
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43469
diff
changeset
|
33 |
51940
54d9f496f07a
interfaces: introduce and use a protocol class for the `charencoding` module
Matt Harbison <matt_harbison@yahoo.com>
parents:
51863
diff
changeset
|
34 charencode: intmod.CharEncoding = policy.importmod('charencode') |
33761
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33022
diff
changeset
|
35 |
33926
f4433f2713d0
encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents:
33925
diff
changeset
|
36 isasciistr = charencode.isasciistr |
33761
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33022
diff
changeset
|
37 asciilower = charencode.asciilower |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33022
diff
changeset
|
38 asciiupper = charencode.asciiupper |
33925
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33924
diff
changeset
|
39 _jsonescapeu8fast = charencode.jsonescapeu8fast |
33761
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33022
diff
changeset
|
40 |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
41 _sysstr = pycompat.sysstr |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
42 |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
43 unichr = chr |
28507
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
44 |
51722
43adbe03079b
typing: add type hints to the `charencode` module
Matt Harbison <matt_harbison@yahoo.com>
parents:
51703
diff
changeset
|
45 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
46 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
47 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
48 # sanity. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
49 _ignore = [ |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
50 unichr(int(x, 16)).encode("utf-8") |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
51 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
52 b"206a 206b 206c 206d 206e 206f feff".split() |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
53 ] |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
54 # verify the next function will work |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
55 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
56 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
57 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
58 def hfsignoreclean(s: bytes) -> bytes: |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
59 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
60 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
61 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
62 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
63 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
64 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
65 """ |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
66 if b"\xe2" in s or b"\xef" in s: |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
67 for c in _ignore: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
68 s = s.replace(c, b'') |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
69 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
70 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
71 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
72 # encoding.environ is provided read-only, which may not be used to modify |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
73 # the process environment |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
74 _nativeenviron = os.supports_bytes_environ |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
75 if _nativeenviron: |
32184
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32156
diff
changeset
|
76 environ = os.environb # re-exports |
50995
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
77 if pycompat.sysplatform == b'OpenVMS': |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
78 # workaround for a bug in VSI 3.10 port |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
79 # os.environb is only populated with a few Predefined symbols |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
80 def newget(self, key, default=None): |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
81 # pytype on linux does not understand OpenVMS special modules |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
82 import _decc # pytype: disable=import-error |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
83 |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
84 v = _decc.getenv(key, None) |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
85 if isinstance(key, bytes): |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
86 return default if v is None else v.encode('latin-1') |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
87 else: |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
88 return default if v is None else v |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
89 |
80c243eab724
openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents:
50929
diff
changeset
|
90 environ.__class__.get = newget |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
91 else: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
92 # preferred encoding isn't known yet; use utf-8 to avoid unicode error |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
93 # and recreate it once encoding is settled |
44452
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43787
diff
changeset
|
94 environ = { |
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43787
diff
changeset
|
95 k.encode('utf-8'): v.encode('utf-8') |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
96 for k, v in os.environ.items() # re-exports |
44452
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43787
diff
changeset
|
97 } |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
98 |
39839
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39819
diff
changeset
|
99 _encodingrewrites = { |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
100 b'646': b'ascii', |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
101 b'ANSI_X3.4-1968': b'ascii', |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
102 } |
38615
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36797
diff
changeset
|
103 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36797
diff
changeset
|
104 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36797
diff
changeset
|
105 # https://bugs.python.org/issue13216 |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
106 if pycompat.iswindows: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
107 _encodingrewrites[b'cp65001'] = b'utf-8' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
108 |
51673
f70f61a8c5bc
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes
Matt Harbison <matt_harbison@yahoo.com>
parents:
51290
diff
changeset
|
109 encoding: bytes = b'' # help pytype avoid seeing None value |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
110 try: |
51673
f70f61a8c5bc
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes
Matt Harbison <matt_harbison@yahoo.com>
parents:
51290
diff
changeset
|
111 encoding = environ.get(b"HGENCODING", b'') |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
112 if not encoding: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
113 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' |
39839
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39819
diff
changeset
|
114 encoding = _encodingrewrites.get(encoding, encoding) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
115 except locale.Error: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
116 encoding = b'ascii' |
51673
f70f61a8c5bc
typing: restore `encoding.encoding` and `encoding.encodingmode` to bytes
Matt Harbison <matt_harbison@yahoo.com>
parents:
51290
diff
changeset
|
117 encodingmode: bytes = environ.get(b"HGENCODINGMODE", b"strict") |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
118 fallbackencoding = b'ISO-8859-1' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
119 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
120 |
33811
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33761
diff
changeset
|
121 class localstr(bytes): |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
122 """This class allows strings that are unmodified to be |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
123 round-tripped to the local encoding and back""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
124 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
125 def __new__(cls, u, l): |
33811
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33761
diff
changeset
|
126 s = bytes.__new__(cls, l) |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
127 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
128 return s |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
129 |
51285
9d3721552b6c
pytype: import typing directly
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
50995
diff
changeset
|
130 if typing.TYPE_CHECKING: |
43685
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43684
diff
changeset
|
131 # pseudo implementation to help pytype see localstr() constructor |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
132 def __init__(self, u: bytes, l: bytes) -> None: |
43685
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43684
diff
changeset
|
133 super(localstr, self).__init__(l) |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43684
diff
changeset
|
134 self._utf8 = u |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43684
diff
changeset
|
135 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
136 def __hash__(self): |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
137 return hash(self._utf8) # avoid collisions in local string space |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
138 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
139 |
37947
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
140 class safelocalstr(bytes): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
141 """Tagged string denoting it was previously an internal UTF-8 string, |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
142 and can be converted back to UTF-8 losslessly |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
143 |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
144 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
145 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
146 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
147 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
148 """ |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
149 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
150 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
151 def tolocal(s: bytes) -> bytes: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
152 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
153 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
154 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
155 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
156 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
157 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
158 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
159 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
160 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
161 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
162 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
163 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
164 |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
165 >>> u = b'foo: \\xc3\\xa4' # utf-8 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
166 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
167 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
168 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
169 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
170 'foo: \\xc3\\xa4' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
171 >>> u2 = b'foo: \\xc3\\xa1' |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
172 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
173 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
174 2 |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
175 >>> b'foo: ?' in d |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
176 False |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
177 >>> l1 = b'foo: \\xe4' # historical latin1 fallback |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
178 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
179 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
180 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
181 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
182 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
183 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
184 |
33927
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33926
diff
changeset
|
185 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33926
diff
changeset
|
186 return s |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33926
diff
changeset
|
187 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
188 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
189 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
190 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
191 u = s.decode('UTF-8') |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
192 if encoding == b'UTF-8': |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
193 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
194 return s |
43503
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43496
diff
changeset
|
195 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
196 if u == r.decode(_sysstr(encoding)): |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
197 # r is a safe, non-lossy encoding of s |
37947
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
198 return safelocalstr(r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
199 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
200 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
201 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
202 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
203 u = s.decode(_sysstr(fallbackencoding)) |
43503
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43496
diff
changeset
|
204 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
205 if u == r.decode(_sysstr(encoding)): |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
206 # r is a safe, non-lossy encoding of s |
37947
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
207 return safelocalstr(r) |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
208 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
209 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
210 u = s.decode("utf-8", "replace") # last ditch |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
211 # can't round-trip |
43503
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43496
diff
changeset
|
212 return u.encode(_sysstr(encoding), "replace") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
213 except LookupError as k: |
45681
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44452
diff
changeset
|
214 raise error.Abort( |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44452
diff
changeset
|
215 pycompat.bytestr(k), hint=b"please check your locale settings" |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44452
diff
changeset
|
216 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
217 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
218 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
219 def fromlocal(s: bytes) -> bytes: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
220 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
221 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
222 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
223 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
224 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
225 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
226 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
227 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
228 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
229 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
230 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
231 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
232 return s._utf8 |
33927
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33926
diff
changeset
|
233 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33926
diff
changeset
|
234 return s |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
235 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
236 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
237 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
238 return u.encode("utf-8") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
239 except UnicodeDecodeError as inst: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
240 sub = s[max(0, inst.start - 10) : inst.start + 10] |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
241 raise error.Abort( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
242 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
243 ) |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
244 except LookupError as k: |
48007
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
245 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
246 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
247 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
248 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
249 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
250 def unitolocal(u: str) -> bytes: |
31447
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
251 """Convert a unicode string to a byte string of local encoding""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
252 return tolocal(u.encode('utf-8')) |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
253 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
254 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
255 def unifromlocal(s: bytes) -> str: |
31447
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
256 """Convert a byte string of local encoding to a unicode string""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
257 return fromlocal(s).decode('utf-8') |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30622
diff
changeset
|
258 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
259 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
260 def unimethod(bytesfunc: Callable[[Any], bytes]) -> Callable[[Any], str]: |
33022
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
261 """Create a proxy method that forwards __unicode__() and __str__() of |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
262 Python 3 to __bytes__()""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
263 |
33022
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
264 def unifunc(obj): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
265 return unifromlocal(bytesfunc(obj)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
266 |
33022
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
267 return unifunc |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32537
diff
changeset
|
268 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
269 |
31448
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31447
diff
changeset
|
270 # converter functions between native str and byte string. use these if the |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31447
diff
changeset
|
271 # character encoding is not aware (e.g. exception message) or is known to |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31447
diff
changeset
|
272 # be locale dependent (e.g. date formatting.) |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
273 strtolocal = unitolocal |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
274 strfromlocal = unifromlocal |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
275 strmethod = unimethod |
31448
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31447
diff
changeset
|
276 |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
277 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
278 def lower(s: bytes) -> bytes: |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
279 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
280 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
281 return asciilower(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
282 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
283 pass |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
284 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
285 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
286 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
287 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
288 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
289 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
290 lu = u.lower() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
291 if u == lu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
292 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
293 return lu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
294 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
295 return s.lower() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
296 except LookupError as k: |
48007
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
297 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
298 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
299 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
300 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
301 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
302 def upper(s: bytes) -> bytes: |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
303 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
304 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
305 return asciiupper(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
306 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
307 return upperfallback(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
308 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
309 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
310 def upperfallback(s: Any) -> Any: |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
311 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
312 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
313 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
314 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
315 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
316 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
317 uu = u.upper() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
318 if u == uu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
319 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
320 return uu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
321 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
322 return s.upper() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
323 except LookupError as k: |
48007
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
324 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
325 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
326 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
327 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
328 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
329 if not _nativeenviron: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
330 # now encoding and helper functions are available, recreate the environ |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
331 # dict to be exported to other modules |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
332 if pycompat.iswindows: |
47560
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
333 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
334 class WindowsEnviron(dict): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
335 """`os.environ` normalizes environment variables to uppercase on windows""" |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
336 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
337 def get(self, key, default=None): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
338 return super().get(upper(key), default) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
339 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
340 environ = WindowsEnviron() |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
341 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
342 for k, v in os.environ.items(): # re-exports |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
343 environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8')) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
344 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
345 |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
346 DRIVE_RE = re.compile(b'^[a-z]:') |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
347 |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
348 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
349 # returns bytes. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
350 if pycompat.iswindows: |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
351 # Python 3 on Windows issues a DeprecationWarning about using the bytes |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
352 # API when os.getcwdb() is called. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
353 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
354 # Additionally, py3.8+ uppercases the drive letter when calling |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
355 # os.path.realpath(), which is used on ``repo.root``. Since those |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
356 # strings are compared in various places as simple strings, also call |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
357 # realpath here. See https://bugs.python.org/issue40368 |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
358 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
359 # However this is not reliable, so lets explicitly make this drive |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
360 # letter upper case. |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
361 # |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
362 # note: we should consider dropping realpath here since it seems to |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
363 # change the semantic of `getcwd`. |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
364 |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
365 def getcwd(): |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
366 cwd = os.getcwd() # re-exports |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
367 cwd = os.path.realpath(cwd) |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
368 cwd = strtolocal(cwd) |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
369 if DRIVE_RE.match(cwd): |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
370 cwd = cwd[0:1].upper() + cwd[1:] |
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
371 return cwd |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
372 |
39818
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38783
diff
changeset
|
373 else: |
48892
fa2b1a46d92e
encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48875
diff
changeset
|
374 getcwd = os.getcwdb # re-exports |
39818
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38783
diff
changeset
|
375 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
376 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
377 _wide = _sysstr( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
378 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
379 and b"WFA" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
380 or b"WF" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
381 ) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
382 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
383 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
384 def colwidth(s: bytes) -> int: |
43787
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43773
diff
changeset
|
385 """Find the column width of a string for display in the local encoding""" |
43506
9f70512ae2cf
cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents:
43503
diff
changeset
|
386 return ucolwidth(s.decode(_sysstr(encoding), 'replace')) |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
387 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
388 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
389 def ucolwidth(d: Text) -> int: |
43787
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43773
diff
changeset
|
390 """Find the column width of a Unicode string for display""" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
391 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
392 if eaw is not None: |
32537
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32529
diff
changeset
|
393 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
394 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
395 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
396 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
397 def getcols(s: bytes, start: int, c: int) -> bytes: |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
398 """Use colwidth to find a c-column substring of s starting at byte |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
399 index start""" |
49284
d44e3c45f0e4
py3: replace `pycompat.xrange` by `range`
Manuel Jacob <me@manueljacob.de>
parents:
48946
diff
changeset
|
400 for x in range(start + c, len(s)): |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
401 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
402 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
403 return t |
43679
7cf332318f62
encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents:
43637
diff
changeset
|
404 raise ValueError('substring not found') |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
405 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
406 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
407 def trim( |
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
408 s: bytes, |
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
409 width: int, |
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
410 ellipsis: bytes = b'', |
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
411 leftside: bool = False, |
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
412 ) -> bytes: |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
413 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
414 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
415 If 'leftside' is True, left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
416 'ellipsis' is always placed at trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
417 |
34136
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34135
diff
changeset
|
418 >>> from .node import bin |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
419 >>> def bprint(s): |
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
420 ... print(pycompat.sysstr(s)) |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
421 >>> ellipsis = b'+++' |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
422 >>> from . import encoding |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
423 >>> encoding.encoding = b'utf-8' |
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
424 >>> t = b'1234567890' |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
425 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
426 1234567890 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
427 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
428 1234567890 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
429 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
430 12345+++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
431 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
432 +++67890 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
433 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
434 12345678 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
435 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
436 34567890 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
437 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
438 +++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
439 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
440 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
441 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
34135
e9e225f16932
doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents:
34131
diff
changeset
|
442 >>> t = u.encode(pycompat.sysstr(encoding.encoding)) |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
443 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
444 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
445 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
446 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
447 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
448 \xe3\x81\x82\xe3\x81\x84+++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
449 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
450 +++\xe3\x81\x88\xe3\x81\x8a |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
451 >>> bprint(trim(t, 5)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
452 \xe3\x81\x82\xe3\x81\x84 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
453 >>> bprint(trim(t, 5, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
454 \xe3\x81\x88\xe3\x81\x8a |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
455 >>> bprint(trim(t, 4, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
456 +++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
457 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
458 +++ |
34136
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34135
diff
changeset
|
459 >>> t = bin(b'112233445566778899aa') # invalid byte sequence |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
460 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
461 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
462 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
463 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
464 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
465 \x11\x22\x33\x44\x55+++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
466 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
467 +++\x66\x77\x88\x99\xaa |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
468 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
469 \x11\x22\x33\x44\x55\x66\x77\x88 |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
470 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
471 \x33\x44\x55\x66\x77\x88\x99\xaa |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
472 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
473 +++ |
34137
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34136
diff
changeset
|
474 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
475 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
476 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
477 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30030
diff
changeset
|
478 u = s.decode(_sysstr(encoding)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
479 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
480 if len(s) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
481 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
482 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
483 if width <= 0: # no enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
484 return ellipsis[: width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
485 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
486 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
487 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
488 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
489 if ucolwidth(u) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
490 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
491 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
492 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
493 if width <= 0: # no enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
494 return ellipsis[: width + len(ellipsis)] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
495 |
48671
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
496 chars = list(u) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
497 if leftside: |
48671
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
498 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
499 width_so_far = 0 |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
500 for i, c in enumerate(chars): |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
501 width_so_far += ucolwidth(c) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
502 if width_so_far > width: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
503 break |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
504 chars = chars[:i] |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
505 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
506 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
507 u = u''.join(chars).encode(_sysstr(encoding)) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
508 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
509 return ellipsis + u |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48007
diff
changeset
|
510 return u + ellipsis |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
511 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
512 |
48946
642e31cb55f0
py3: use class X: instead of class X(object):
Gregory Szorc <gregory.szorc@gmail.com>
parents:
48892
diff
changeset
|
513 class normcasespecs: |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
514 """what a platform's normcase does to ASCII strings |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
515 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
516 This is specified per platform, and should be consistent with what normcase |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
517 on that platform actually does. |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
518 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
519 lower: normcase lowercases ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
520 upper: normcase uppercases ASCII strings |
24608
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
521 other: the fallback function should always be called |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
522 |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
523 This should be kept in sync with normcase_spec in util.h.""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
524 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
525 lower = -1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
526 upper = 1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
527 other = 0 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
528 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
529 |
51722
43adbe03079b
typing: add type hints to the `charencode` module
Matt Harbison <matt_harbison@yahoo.com>
parents:
51703
diff
changeset
|
530 def jsonescape(s: bytes, paranoid: bool = False) -> bytes: |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
531 """returns a string suitable for JSON |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
532 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
533 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
534 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
535 |
37947
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37946
diff
changeset
|
536 - localstr/safelocalstr objects are converted back to UTF-8 |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
537 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
538 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
539 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
540 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
541 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
542 |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
543 >>> jsonescape(b'this is a test') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
544 'this is a test' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
545 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f') |
27881
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
546 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
547 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
33925
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33924
diff
changeset
|
548 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
549 >>> jsonescape(b'a weird byte: \\xdd') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
550 'a weird byte: \\xed\\xb3\\x9d' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
551 >>> jsonescape(b'utf-8: caf\\xc3\\xa9') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
552 'utf-8: caf\\xc3\\xa9' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
553 >>> jsonescape(b'') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
554 '' |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
555 |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
556 If paranoid, non-ascii and common troublesome characters are also escaped. |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
557 This is suitable for web output. |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
558 |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
559 >>> s = b'escape characters: \\0 \\x0b \\x7f' |
33925
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33924
diff
changeset
|
560 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
561 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
33925
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33924
diff
changeset
|
562 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
563 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
564 'escape boundary: ~ \\\\u007f \\\\u0080' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
565 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
566 'a weird byte: \\\\udcdd' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
567 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
568 'utf-8: caf\\\\u00e9' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
569 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
570 'non-BMP: \\\\ud834\\\\udd1e' |
34131
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33928
diff
changeset
|
571 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
572 '\\\\u003cfoo@example.org\\\\u003e' |
45942
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
573 """ |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
574 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
575 u8chars = toutf8b(s) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
576 try: |
33924
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33852
diff
changeset
|
577 return _jsonescapeu8fast(u8chars, paranoid) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33852
diff
changeset
|
578 except ValueError: |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
579 pass |
33924
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33852
diff
changeset
|
580 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
581 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
582 |
34218
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34216
diff
changeset
|
# Invalid UTF-8 bytes are mapped into the U+DCxx range, so every decode and
# encode below has to let those code points through transparently.
_utf8strict = 'surrogatepass'

# Number of bytes getutf8char() slices out for a character, indexed by the
# high nibble of its first byte.
_utf8len = [
    0, 0, 0, 0, 0, 0, 0, 0,  # 0x0-0x7: ASCII, returned without decoding
    1, 1, 1, 1,  # 0x8-0xB: continuation byte, checked on its own
    2, 2,  # 0xC-0xD: two-byte sequence
    3,  # 0xE: three-byte sequence
    4,  # 0xF: four-byte sequence
]
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
588 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
589 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
def getutf8char(s: bytes, pos: int) -> bytes:
    """Return the complete UTF-8 character of ``s`` that begins at ``pos``.

    The high nibble of the byte at ``pos`` selects, through ``_utf8len``,
    how many bytes are sliced out of ``s``.  An ASCII byte is returned as
    is; anything longer is validated by a trial decode first.

    Raises a UnicodeError if the given location does not start a valid
    utf-8 character.
    """
    lead = s[pos : pos + 1]
    width = _utf8len[ord(lead) >> 4]
    if width == 0:
        # ASCII: a single byte that needs no validation
        return lead

    char = s[pos : pos + width]
    # The decoded value is discarded; the call is made only so that an
    # invalid or truncated sequence raises.
    char.decode("utf-8", _utf8strict)
    return char
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
606 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
607 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
def toutf8b(s: bytes) -> bytes:
    """Encode a local, possibly-binary string as UTF-8b.

    Schemes like JSON and XML have no provision for arbitrary byte
    strings, and Mercurial often doesn't know what encoding its data is
    in.  UTF-8b is the generic way used here to preserve such data: a
    string that is already valid UTF-8 (or ASCII) passes unmodified, and
    every unsupported byte is mapped to the UTF-16 surrogate range,
    uDC00-uDCFF.

    Principles of operation:

    - ASCII and UTF-8 data successfully round-trips and is understood
      by Unicode-oriented clients
    - filenames and file contents in arbitrary other encodings can be
      round-tripped or recovered by clueful clients
    - local strings that have a cached known UTF-8 encoding (aka
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
      Unicode data they want
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
    - because we must preserve UTF-8 bytestring in places such as
      filenames, metadata can't be roundtripped without help

    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
    arbitrary bytes into an internal Unicode format that can be
    re-encoded back into the original.  Here we are exposing the
    internal surrogate encoding as a UTF-8 string.)
    """
    if isinstance(s, localstr):
        # assume that the original UTF-8 sequence would never contain
        # invalid characters in U+DCxx range
        return s._utf8
    if isinstance(s, safelocalstr):
        # already verified that s is non-lossy in legacy encoding, which
        # shouldn't contain characters in U+DCxx range
        return fromlocal(s)
    if isasciistr(s):
        return s

    if b"\xed" not in s:
        # Without a 0xED byte there can be no pre-existing U+DCxx
        # character to re-escape, so valid UTF-8 is returned untouched.
        try:
            s.decode('utf-8', _utf8strict)
        except UnicodeDecodeError:
            pass
        else:
            return s

    # Slow path: walk the string one character at a time.
    s = pycompat.bytestr(s)
    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        try:
            char = getutf8char(s, pos)
        except UnicodeDecodeError:
            char = None

        if char is None or b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf":
            # Either the byte at pos does not start valid UTF-8, or it
            # starts an existing U+DCxx character that has to be
            # re-escaped.  In both cases escape that single byte and
            # advance by one.
            out += unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
            pos += 1
        else:
            out += char
            pos += len(char)
    return bytes(out)
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
674 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41836
diff
changeset
|
675 |
51287
f15cb5111a1e
pytype: move some type comment to proper annotation
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
51285
diff
changeset
|
def fromutf8b(s: bytes) -> bytes:
    """Turn a UTF-8b string back into a local, possibly-binary string.

    The original binary string is returned.  This is a round-trip
    process for strings like filenames, but metadata that was passed
    through tolocal will remain in UTF-8.

    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
    >>> m = b"\\xc3\\xa9\\x99abcd"
    >>> toutf8b(m)
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
    >>> roundtrip(m)
    True
    >>> roundtrip(b"\\xc2\\xc2\\x80")
    True
    >>> roundtrip(b"\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
    True
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
    True
    """
    # fast path - pure ASCII, or no uDxxx prefix anywhere in s
    if isasciistr(s) or b"\xed" not in s:
        return s

    # The unicode type could do this, but some Python builds use UTF-16
    # internally (issue5031), which causes non-BMP code points to be
    # escaped.  Walking the bytes with getutf8char avoids "decoding" the
    # string as a whole.
    s = pycompat.bytestr(s)
    out = bytearray()
    pos = 0
    end = len(s)
    while pos < end:
        char = getutf8char(s, pos)
        pos += len(char)
        if b"\xed\xb0\x80" <= char <= b"\xed\xb3\xbf":
            # unescape U+DCxx characters: the low byte of the code point
            # is the original byte
            codepoint = ord(char.decode("utf-8", _utf8strict))
            char = pycompat.bytechr(codepoint & 0xFF)
        out += char
    return bytes(out)