Mercurial > hg-stable
annotate mercurial/encoding.py @ 48692:f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
`encoding.trim()` iterated over the possible lengths smaller than the
input and created a slice for each. It then calculated the column
width of the result, which is of course O(n), so the overall algorithm
was O(n). This patch rewrites it to iterate over the unicode
characters, keeping track of the length so far. Also, the old
algorithm started from the end of the string, which made it much worse
when the input is large and the limit is small (such as the typical 72
we pass to it).
You can time it by running something like this:
```
time python3 -c 'from mercurial.utils import stringutil; print(stringutil.ellipsis(b"0123456789" * 1000, 5))'
```
That drops from 4.05 s to 83 ms with this patch (and most of that is
of course startup time).
Differential Revision: https://phab.mercurial-scm.org/D12089
author | Martin von Zweigbergk <martinvonz@google.com> |
---|---|
date | Wed, 26 Jan 2022 10:11:01 -0800 |
parents | 28c62f83b652 |
children | 6000f5b25c9b |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
46819
d4ba4d51f85f
contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents:
46319
diff
changeset
|
3 # Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others |
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
8 from __future__ import absolute_import, print_function |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
9 |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
10 import locale |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
11 import os |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
12 import re |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
13 import unicodedata |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
14 |
43089
c59eb1560c44
py3: manually import getattr where it is needed
Gregory Szorc <gregory.szorc@gmail.com>
parents:
43077
diff
changeset
|
15 from .pycompat import getattr |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
16 from . import ( |
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
17 error, |
32411
df448de7cf3b
parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents:
32339
diff
changeset
|
18 policy, |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
19 pycompat, |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
20 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
21 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
22 from .pure import charencode as charencodepure |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
23 |
43793
7b14d649af1b
typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents:
43725
diff
changeset
|
24 if pycompat.TYPE_CHECKING: |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
25 from typing import ( |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
26 Any, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
27 Callable, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
28 List, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
29 Text, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
30 Type, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
31 TypeVar, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
32 Union, |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
33 ) |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
34 |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
35 # keep pyflakes happy |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
36 for t in (Any, Callable, List, Text, Type, Union): |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
37 assert t |
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
38 |
43720
3364a15f61f0
typing: fix forward reference in _Tlocalstr type bound
Yuya Nishihara <yuya@tcha.org>
parents:
43719
diff
changeset
|
39 _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr') |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
40 |
43554
9f70512ae2cf
cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents:
43551
diff
changeset
|
41 charencode = policy.importmod('charencode') |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
42 |
33944
f4433f2713d0
encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents:
33943
diff
changeset
|
43 isasciistr = charencode.isasciistr |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
44 asciilower = charencode.asciilower |
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
45 asciiupper = charencode.asciiupper |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
46 _jsonescapeu8fast = charencode.jsonescapeu8fast |
33782
f5fc54e7e467
encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents:
33038
diff
changeset
|
47 |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
48 _sysstr = pycompat.sysstr |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
49 |
30031
0f6d6fdd3c2a
pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents:
28508
diff
changeset
|
50 if pycompat.ispy3: |
28507
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
51 unichr = chr |
9bcbd9412225
encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents:
28069
diff
changeset
|
52 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
53 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
54 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
55 # sanity. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
56 _ignore = [ |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
57 unichr(int(x, 16)).encode("utf-8") |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
58 for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e " |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
59 b"206a 206b 206c 206d 206e 206f feff".split() |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
60 ] |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
61 # verify the next function will work |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
62 assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore) |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
63 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
64 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
65 def hfsignoreclean(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
66 # type: (bytes) -> bytes |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
67 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
68 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
69 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
70 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
71 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
72 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
73 """ |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
74 if b"\xe2" in s or b"\xef" in s: |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
75 for c in _ignore: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
76 s = s.replace(c, b'') |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
77 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
78 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
79 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
80 # encoding.environ is provided read-only, which may not be used to modify |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
81 # the process environment |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
82 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
83 if not pycompat.ispy3: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
84 environ = os.environ # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
85 elif _nativeenviron: |
32231
cf424dae5dc7
check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents:
32205
diff
changeset
|
86 environ = os.environb # re-exports |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
87 else: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
88 # preferred encoding isn't known yet; use utf-8 to avoid unicode error |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
89 # and recreate it once encoding is settled |
44470
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
90 environ = { |
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
91 k.encode('utf-8'): v.encode('utf-8') |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
92 for k, v in os.environ.items() # re-exports |
44470
9d2b2df2c2ba
cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents:
43807
diff
changeset
|
93 } |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
94 |
39844
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39824
diff
changeset
|
95 _encodingrewrites = { |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
96 b'646': b'ascii', |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
97 b'ANSI_X3.4-1968': b'ascii', |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
98 } |
37883
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
99 # cp65001 is a Windows variant of utf-8, which isn't supported on Python 2. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
100 # No idea if it should be rewritten to the canonical name 'utf-8' on Python 3. |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
101 # https://bugs.python.org/issue13216 |
443029011990
encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
102 if pycompat.iswindows and not pycompat.ispy3: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
103 _encodingrewrites[b'cp65001'] = b'utf-8' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
104 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
105 try: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
106 encoding = environ.get(b"HGENCODING") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
107 if not encoding: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
108 encoding = locale.getpreferredencoding().encode('ascii') or b'ascii' |
39844
9e8fcd2e78c1
encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents:
39824
diff
changeset
|
109 encoding = _encodingrewrites.get(encoding, encoding) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
110 except locale.Error: |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
111 encoding = b'ascii' |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
112 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
113 fallbackencoding = b'ISO-8859-1' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
114 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
115 |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
116 class localstr(bytes): |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
117 """This class allows strings that are unmodified to be |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
118 round-tripped to the local encoding and back""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
119 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
120 def __new__(cls, u, l): |
33832
dabe1f11ae3a
py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents:
33782
diff
changeset
|
121 s = bytes.__new__(cls, l) |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
122 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
123 return s |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
124 |
43793
7b14d649af1b
typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents:
43725
diff
changeset
|
125 if pycompat.TYPE_CHECKING: |
43725
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
126 # pseudo implementation to help pytype see localstr() constructor |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
127 def __init__(self, u, l): |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
128 # type: (bytes, bytes) -> None |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
129 super(localstr, self).__init__(l) |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
130 self._utf8 = u |
da925257a39e
typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents:
43724
diff
changeset
|
131 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
132 def __hash__(self): |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
133 return hash(self._utf8) # avoid collisions in local string space |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
134 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
135 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
136 class safelocalstr(bytes): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
137 """Tagged string denoting it was previously an internal UTF-8 string, |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
138 and can be converted back to UTF-8 losslessly |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
139 |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
140 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
141 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
142 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
143 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
144 """ |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
145 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
146 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
147 def tolocal(s): |
43721
b65fcccd9100
typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43720
diff
changeset
|
148 # type: (bytes) -> bytes |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
149 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
150 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
151 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
152 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
153 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
154 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
155 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
156 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
157 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
158 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
159 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
160 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
161 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
162 >>> u = b'foo: \\xc3\\xa4' # utf-8 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
163 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
164 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
165 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
166 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
167 'foo: \\xc3\\xa4' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
168 >>> u2 = b'foo: \\xc3\\xa1' |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
169 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
170 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
171 2 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
172 >>> b'foo: ?' in d |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
173 False |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
174 >>> l1 = b'foo: \\xe4' # historical latin1 fallback |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
175 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
176 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
177 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
178 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
179 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
180 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
181 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
182 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
183 return s |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
184 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
185 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
186 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
187 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
188 u = s.decode('UTF-8') |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
189 if encoding == b'UTF-8': |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
190 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
191 return s |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
192 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
193 if u == r.decode(_sysstr(encoding)): |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
194 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
195 return safelocalstr(r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
196 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
197 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
198 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
199 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
200 u = s.decode(_sysstr(fallbackencoding)) |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
201 r = u.encode(_sysstr(encoding), "replace") |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
202 if u == r.decode(_sysstr(encoding)): |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
203 # r is a safe, non-lossy encoding of s |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
204 return safelocalstr(r) |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
205 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
206 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
207 u = s.decode("utf-8", "replace") # last ditch |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
208 # can't round-trip |
43551
313e3a279828
cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents:
43544
diff
changeset
|
209 return u.encode(_sysstr(encoding), "replace") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
210 except LookupError as k: |
45681
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
211 raise error.Abort( |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
212 pycompat.bytestr(k), hint=b"please check your locale settings" |
a736ab681b78
errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents:
44470
diff
changeset
|
213 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
214 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
215 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
216 def fromlocal(s): |
43681
7edc07fb890c
encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents:
43554
diff
changeset
|
217 # type: (bytes) -> bytes |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
218 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
219 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
220 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
221 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
222 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
223 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
224 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
225 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
226 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
227 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
228 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
229 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
230 return s._utf8 |
33945
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
231 if isasciistr(s): |
853574db5b12
encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33944
diff
changeset
|
232 return s |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
233 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
234 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
235 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
236 return u.encode("utf-8") |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
237 except UnicodeDecodeError as inst: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
238 sub = s[max(0, inst.start - 10) : inst.start + 10] |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
239 raise error.Abort( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
240 b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
241 ) |
25660
328739ea70c3
global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents:
24608
diff
changeset
|
242 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
243 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
244 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
245 ) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
246 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
247 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
248 def unitolocal(u): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
249 # type: (Text) -> bytes |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
250 """Convert a unicode string to a byte string of local encoding""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
251 return tolocal(u.encode('utf-8')) |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
252 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
253 |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
254 def unifromlocal(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
255 # type: (bytes) -> Text |
31456
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
256 """Convert a byte string of local encoding to a unicode string""" |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
257 return fromlocal(s).decode('utf-8') |
067add650129
encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents:
30627
diff
changeset
|
258 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
259 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
260 def unimethod(bytesfunc): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
261 # type: (Callable[[Any], bytes]) -> Callable[[Any], Text] |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
262 """Create a proxy method that forwards __unicode__() and __str__() of |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
263 Python 3 to __bytes__()""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
264 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
265 def unifunc(obj): |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
266 return unifromlocal(bytesfunc(obj)) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
267 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
268 return unifunc |
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
269 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
270 |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
271 # converter functions between native str and byte string. use these if the |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
272 # character encoding is not aware (e.g. exception message) or is known to |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
273 # be locale dependent (e.g. date formatting.) |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
274 if pycompat.ispy3: |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
275 strtolocal = unitolocal |
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
276 strfromlocal = unifromlocal |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
277 strmethod = unimethod |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
278 else: |
43517
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
279 |
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
280 def strtolocal(s): |
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
281 # type: (str) -> bytes |
43723
7f51bc36194d
typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents:
43722
diff
changeset
|
282 return s # pytype: disable=bad-return-type |
43517
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
283 |
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
284 def strfromlocal(s): |
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
285 # type: (bytes) -> str |
43723
7f51bc36194d
typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents:
43722
diff
changeset
|
286 return s # pytype: disable=bad-return-type |
43517
5f2a8dabb0d8
encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents:
43089
diff
changeset
|
287 |
33038
ce96efec8112
py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents:
32570
diff
changeset
|
288 strmethod = pycompat.identity |
31457
6419cd243017
encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents:
31456
diff
changeset
|
289 |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
290 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
291 def lower(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
292 # type: (bytes) -> bytes |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
293 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
294 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
295 return asciilower(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
296 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
297 pass |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
298 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
299 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
300 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
301 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
302 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
303 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
304 lu = u.lower() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
305 if u == lu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
306 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
307 return lu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
308 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
309 return s.lower() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
310 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
311 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
312 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
313 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
314 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
315 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
316 def upper(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
317 # type: (bytes) -> bytes |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
318 """best-effort encoding-aware case-folding of local string s""" |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
319 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
320 return asciiupper(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
321 except UnicodeDecodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
322 return upperfallback(s) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
323 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
324 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
325 def upperfallback(s): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
326 # type: (Any) -> Any |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
327 try: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
328 if isinstance(s, localstr): |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
329 u = s._utf8.decode("utf-8") |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
330 else: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
331 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
332 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
333 uu = u.upper() |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
334 if u == uu: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
335 return s # preserve localstring |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
336 return uu.encode(_sysstr(encoding)) |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
337 except UnicodeError: |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
338 return s.upper() # we don't know how to fold this except in ASCII |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
339 except LookupError as k: |
48030
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
340 raise error.Abort( |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
341 pycompat.bytestr(k), hint=b"please check your locale settings" |
28c62f83b652
encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents:
47621
diff
changeset
|
342 ) |
47559
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
343 |
53a864a60281
encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents:
46819
diff
changeset
|
344 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
345 if not _nativeenviron: |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
346 # now encoding and helper functions are available, recreate the environ |
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
347 # dict to be exported to other modules |
47560
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
348 if pycompat.iswindows and pycompat.ispy3: |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
349 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
350 class WindowsEnviron(dict): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
351 """`os.environ` normalizes environment variables to uppercase on windows""" |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
352 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
353 def get(self, key, default=None): |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
354 return super().get(upper(key), default) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
355 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
356 environ = WindowsEnviron() |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
357 |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
358 for k, v in os.environ.items(): # re-exports |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
359 environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8')) |
af633293a5bd
windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents:
47559
diff
changeset
|
360 |
30034
e4a6b439acc5
py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents:
30033
diff
changeset
|
361 |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
362 DRIVE_RE = re.compile(b'^[a-z]:') |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
363 |
39823
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
364 if pycompat.ispy3: |
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
365 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which |
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
366 # returns bytes. |
39824
fb628c048d64
py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents:
39823
diff
changeset
|
367 if pycompat.iswindows: |
fb628c048d64
py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents:
39823
diff
changeset
|
368 # Python 3 on Windows issues a DeprecationWarning about using the bytes |
fb628c048d64
py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents:
39823
diff
changeset
|
369 # API when os.getcwdb() is called. |
46319
3dfebba99ef6
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents:
45957
diff
changeset
|
370 # |
3dfebba99ef6
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents:
45957
diff
changeset
|
371 # Additionally, py3.8+ uppercases the drive letter when calling |
3dfebba99ef6
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents:
45957
diff
changeset
|
372 # os.path.realpath(), which is used on ``repo.root``. Since those |
3dfebba99ef6
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents:
45957
diff
changeset
|
373 # strings are compared in various places as simple strings, also call |
3dfebba99ef6
windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents:
45957
diff
changeset
|
374 # realpath here. See https://bugs.python.org/issue40368 |
47621
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
375 # |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
376 # However this is not reliable, so lets explicitly make this drive |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
377 # letter upper case. |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
378 # |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
379 # note: we should consider dropping realpath here since it seems to |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
380 # change the semantic of `getcwd`. |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
381 |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
382 def getcwd(): |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
383 cwd = os.getcwd() # re-exports |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
384 cwd = os.path.realpath(cwd) |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
385 cwd = strtolocal(cwd) |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
386 if DRIVE_RE.match(cwd): |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
387 cwd = cwd[0:1].upper() + cwd[1:] |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
388 return cwd |
d6ee6456bd5f
windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents:
47560
diff
changeset
|
389 |
39824
fb628c048d64
py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents:
39823
diff
changeset
|
390 else: |
fb628c048d64
py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents:
39823
diff
changeset
|
391 getcwd = os.getcwdb # re-exports |
39823
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
392 else: |
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
393 getcwd = os.getcwd # re-exports |
24e493ec2229
py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents:
38823
diff
changeset
|
394 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
395 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
396 _wide = _sysstr( |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
397 environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
398 and b"WFA" |
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
399 or b"WF" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
400 ) |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
401 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
402 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
403 def colwidth(s): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
404 # type: (bytes) -> int |
43807
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43793
diff
changeset
|
405 """Find the column width of a string for display in the local encoding""" |
43554
9f70512ae2cf
cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents:
43551
diff
changeset
|
406 return ucolwidth(s.decode(_sysstr(encoding), 'replace')) |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
407 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
408 |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
409 def ucolwidth(d): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
410 # type: (Text) -> int |
43807
be8552f25cab
cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents:
43793
diff
changeset
|
411 """Find the column width of a Unicode string for display""" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
412 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
413 if eaw is not None: |
32570
044f3d7eb9ae
encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents:
32562
diff
changeset
|
414 return sum([eaw(c) in _wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
415 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
416 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
417 |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
418 def getcols(s, start, c): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
419 # type: (bytes, int, int) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
420 """Use colwidth to find a c-column substring of s starting at byte |
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
421 index start""" |
38823
e7aa113b14f7
global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents:
38739
diff
changeset
|
422 for x in pycompat.xrange(start + c, len(s)): |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
423 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
424 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
425 return t |
43719
7cf332318f62
encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents:
43681
diff
changeset
|
426 raise ValueError('substring not found') |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
427 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
428 |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
429 def trim(s, width, ellipsis=b'', leftside=False): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
430 # type: (bytes, int, bytes, bool) -> bytes |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
431 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
432 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
433 If 'leftside' is True, left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
434 'ellipsis' is always placed at trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
435 |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
436 >>> from .node import bin |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
437 >>> def bprint(s): |
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
438 ... print(pycompat.sysstr(s)) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
439 >>> ellipsis = b'+++' |
27355
b479fc425a81
encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents:
26963
diff
changeset
|
440 >>> from . import encoding |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
441 >>> encoding.encoding = b'utf-8' |
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
442 >>> t = b'1234567890' |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
443 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
444 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
445 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
446 1234567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
447 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
448 12345+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
449 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
450 +++67890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
451 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
452 12345678 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
453 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
454 34567890 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
455 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
456 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
457 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
458 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
459 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
34150
e9e225f16932
doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents:
34146
diff
changeset
|
460 >>> t = u.encode(pycompat.sysstr(encoding.encoding)) |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
461 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
462 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
463 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
464 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
465 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
466 \xe3\x81\x82\xe3\x81\x84+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
467 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
468 +++\xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
469 >>> bprint(trim(t, 5)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
470 \xe3\x81\x82\xe3\x81\x84 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
471 >>> bprint(trim(t, 5, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
472 \xe3\x81\x88\xe3\x81\x8a |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
473 >>> bprint(trim(t, 4, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
474 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
475 >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
476 +++ |
34151
414a3513c2bd
doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents:
34150
diff
changeset
|
477 >>> t = bin(b'112233445566778899aa') # invalid byte sequence |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
478 >>> bprint(trim(t, 12, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
479 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
480 >>> bprint(trim(t, 10, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
481 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
482 >>> bprint(trim(t, 8, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
483 \x11\x22\x33\x44\x55+++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
484 >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
485 +++\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
486 >>> bprint(trim(t, 8)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
487 \x11\x22\x33\x44\x55\x66\x77\x88 |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
488 >>> bprint(trim(t, 8, leftside=True)) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
489 \x33\x44\x55\x66\x77\x88\x99\xaa |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
490 >>> bprint(trim(t, 3, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
491 +++ |
34152
a8994d08e4a2
doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents:
34151
diff
changeset
|
492 >>> bprint(trim(t, 1, ellipsis=ellipsis)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
493 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
494 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
495 try: |
30033
02dbfaa6df0b
py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents:
30031
diff
changeset
|
496 u = s.decode(_sysstr(encoding)) |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
497 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
498 if len(s) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
499 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
500 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
501 if width <= 0: # no enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
502 return ellipsis[: width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
503 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
504 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
505 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
506 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
507 if ucolwidth(u) <= width: # trimming is not needed |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
508 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
509 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
510 width -= len(ellipsis) |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
511 if width <= 0: # no enough room even for ellipsis |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
512 return ellipsis[: width + len(ellipsis)] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
513 |
48692
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
514 chars = list(u) |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
515 if leftside: |
48692
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
516 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
517 width_so_far = 0 |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
518 for i, c in enumerate(chars): |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
519 width_so_far += ucolwidth(c) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
520 if width_so_far > width: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
521 break |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
522 chars = chars[:i] |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
523 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
524 chars.reverse() |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
525 u = u''.join(chars).encode(_sysstr(encoding)) |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
526 if leftside: |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
527 return ellipsis + u |
f1ed5c304f45
encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents:
48030
diff
changeset
|
528 return u + ellipsis |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
529 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
530 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
531 class normcasespecs(object): |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
532 """what a platform's normcase does to ASCII strings |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
533 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
534 This is specified per platform, and should be consistent with what normcase |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
535 on that platform actually does. |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
536 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
537 lower: normcase lowercases ASCII strings |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
538 upper: normcase uppercases ASCII strings |
24608
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
539 other: the fallback function should always be called |
1c533e23ce95
util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents:
24597
diff
changeset
|
540 |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
541 This should be kept in sync with normcase_spec in util.h.""" |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
542 |
24593
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
543 lower = -1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
544 upper = 1 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
545 other = 0 |
f473a1fe5c7c
encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
24578
diff
changeset
|
546 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
547 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
548 def jsonescape(s, paranoid=False): |
43544
2ade00f3b03b
encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents:
43517
diff
changeset
|
549 # type: (Any, Any) -> Any |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
550 """returns a string suitable for JSON |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
551 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
552 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
553 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
554 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
555 - localstr/safelocalstr objects are converted back to UTF-8 |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
556 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
557 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
558 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
559 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
560 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
561 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
562 >>> jsonescape(b'this is a test') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
563 'this is a test' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
564 >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f') |
27881
ffa599f3f503
encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
27699
diff
changeset
|
565 'escape characters: \\\\u0000 \\\\u000b \\\\u007f' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
566 >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\') |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
567 'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
568 >>> jsonescape(b'a weird byte: \\xdd') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
569 'a weird byte: \\xed\\xb3\\x9d' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
570 >>> jsonescape(b'utf-8: caf\\xc3\\xa9') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
571 'utf-8: caf\\xc3\\xa9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
572 >>> jsonescape(b'') |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
573 '' |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
574 |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
575 If paranoid, non-ascii and common troublesome characters are also escaped. |
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
576 This is suitable for web output. |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
577 |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
578 >>> s = b'escape characters: \\0 \\x0b \\x7f' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
579 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
580 >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\' |
33943
2c37f9dabc32
encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents:
33942
diff
changeset
|
581 >>> assert jsonescape(s) == jsonescape(s, paranoid=True) |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
582 >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
583 'escape boundary: ~ \\\\u007f \\\\u0080' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
584 >>> jsonescape(b'a weird byte: \\xdd', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
585 'a weird byte: \\\\udcdd' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
586 >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
587 'utf-8: caf\\\\u00e9' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
588 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
589 'non-BMP: \\\\ud834\\\\udd1e' |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
590 >>> jsonescape(b'<foo@example.org>', paranoid=True) |
28069
b2d24c2898f9
encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents:
28068
diff
changeset
|
591 '\\\\u003cfoo@example.org\\\\u003e' |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
592 """ |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
593 |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
594 u8chars = toutf8b(s) |
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
595 try: |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
596 return _jsonescapeu8fast(u8chars, paranoid) |
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
597 except ValueError: |
28068
9ece901f7a19
encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents:
28067
diff
changeset
|
598 pass |
33942
b9101467d88b
encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents:
33873
diff
changeset
|
599 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
600 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
601 |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
602 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
603 # bytes are mapped to that range. |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
604 if pycompat.ispy3: |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
605 _utf8strict = r'surrogatepass' |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
606 else: |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
607 _utf8strict = r'strict' |
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
608 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
609 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
610 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
611 |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
612 def getutf8char(s, pos): |
43722
83a349aaeba3
typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43721
diff
changeset
|
613 # type: (bytes, int) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
614 """get the next full utf-8 character in the given string, starting at pos |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
615 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
616 Raises a UnicodeError if the given location does not start a valid |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
617 utf-8 character. |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
618 """ |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
619 |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
620 # find how many bytes to attempt decoding from first nibble |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
621 l = _utf8len[ord(s[pos : pos + 1]) >> 4] |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
622 if not l: # ascii |
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
623 return s[pos : pos + 1] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
624 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
625 c = s[pos : pos + l] |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
626 # validate with attempted decode |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
627 c.decode("utf-8", _utf8strict) |
26875
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
628 return c |
cf47bdb2183c
encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents:
25660
diff
changeset
|
629 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
630 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
631 def toutf8b(s): |
43722
83a349aaeba3
typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43721
diff
changeset
|
632 # type: (bytes) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
633 """convert a local, possibly-binary string into UTF-8b |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
634 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
635 This is intended as a generic method to preserve data when working |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
636 with schemes like JSON and XML that have no provision for |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
637 arbitrary byte strings. As Mercurial often doesn't know |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
638 what encoding data is in, we use so-called UTF-8b. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
639 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
640 If a string is already valid UTF-8 (or ASCII), it passes unmodified. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
641 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
642 uDC00-uDCFF. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
643 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
644 Principles of operation: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
645 |
17424
e7cfe3587ea4
fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents:
17236
diff
changeset
|
646 - ASCII and UTF-8 data successfully round-trips and is understood |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
647 by Unicode-oriented clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
648 - filenames and file contents in arbitrary other encodings can have |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
649 be round-tripped or recovered by clueful clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
650 - local strings that have a cached known UTF-8 encoding (aka |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
651 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
652 Unicode data they want |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
653 - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
654 - because we must preserve UTF-8 bytestring in places such as |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
655 filenames, metadata can't be roundtripped without help |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
656 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
657 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
658 arbitrary bytes into an internal Unicode format that can be |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
659 re-encoded back into the original. Here we are exposing the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
660 internal surrogate encoding as a UTF-8 string.) |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
661 """ |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
662 |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
663 if isinstance(s, localstr): |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
664 # assume that the original UTF-8 sequence would never contain |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
665 # invalid characters in U+DCxx range |
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
666 return s._utf8 |
37991
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
667 elif isinstance(s, safelocalstr): |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
668 # already verified that s is non-lossy in legacy encoding, which |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
669 # shouldn't contain characters in U+DCxx range |
3ea3c96ada54
encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents:
37990
diff
changeset
|
670 return fromlocal(s) |
37990
57b0c7221dba
encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents:
36805
diff
changeset
|
671 elif isasciistr(s): |
33946
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
672 return s |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
673 if b"\xed" not in s: |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
674 try: |
34225
aa877860d4d7
py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents:
34223
diff
changeset
|
675 s.decode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
676 return s |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
677 except UnicodeDecodeError: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
678 pass |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
679 |
34223
1c601df9894c
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents:
34207
diff
changeset
|
680 s = pycompat.bytestr(s) |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
681 r = b"" |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
682 pos = 0 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
683 l = len(s) |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
684 while pos < l: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
685 try: |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
686 c = getutf8char(s, pos) |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
687 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
688 # have to re-escape existing U+DCxx characters |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
689 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26879
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
690 pos += 1 |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
691 else: |
a24b98f4e03c
encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents:
26878
diff
changeset
|
692 pos += len(c) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
693 except UnicodeDecodeError: |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
694 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict) |
26878
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
695 pos += 1 |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
696 r += c |
d7e83f106459
encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents:
26877
diff
changeset
|
697 return r |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
698 |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
699 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
700 def fromutf8b(s): |
43721
b65fcccd9100
typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents:
43720
diff
changeset
|
701 # type: (bytes) -> bytes |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
702 """Given a UTF-8b string, return a local, possibly-binary string. |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
703 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
704 return the original binary string. This |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
705 is a round-trip process for strings like filenames, but metadata |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
706 that's was passed through tolocal will remain in UTF-8. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
707 |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
708 >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
709 >>> m = b"\\xc3\\xa9\\x99abcd" |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
710 >>> toutf8b(m) |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
711 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
712 >>> roundtrip(m) |
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
713 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
714 >>> roundtrip(b"\\xc2\\xc2\\x80") |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
715 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
716 >>> roundtrip(b"\\xef\\xbf\\xbd") |
26963
de5ae97ce9f4
encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents:
26879
diff
changeset
|
717 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
718 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
719 True |
34146
0fa781320203
doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents:
33946
diff
changeset
|
720 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
721 True |
45957
89a2afe31e82
formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents:
45681
diff
changeset
|
722 """ |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
723 |
33946
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
724 if isasciistr(s): |
6c119dbfd0c0
encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents:
33945
diff
changeset
|
725 return s |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
726 # fast path - look for uDxxx prefixes in s |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
727 if b"\xed" not in s: |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
728 return s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
729 |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
730 # We could do this with the unicode type but some Python builds |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
731 # use UTF-16 internally (issue5031) which causes non-BMP code |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
732 # points to be escaped. Instead, we use our handy getutf8char |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
733 # helper again to walk the string without "decoding" it. |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
734 |
34223
1c601df9894c
py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents:
34207
diff
changeset
|
735 s = pycompat.bytestr(s) |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
736 r = b"" |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
737 pos = 0 |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
738 l = len(s) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
739 while pos < l: |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
740 c = getutf8char(s, pos) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
741 pos += len(c) |
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
742 # unescape U+DCxx characters |
43077
687b865b95ad
formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents:
43076
diff
changeset
|
743 if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf": |
43076
2372284d9457
formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents:
41841
diff
changeset
|
744 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF) |
27699
c8d3392f76e1
encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents:
27356
diff
changeset
|
745 r += c |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
746 return r |