mercurial/encoding.py
author Anton Shestakov <av6@dwimlabs.net>
Sun, 14 Jan 2024 16:03:08 -0300
branchstable
changeset 51161 c7edfccfc11f
parent 51000 80c243eab724
child 51302 9d3721552b6c
permissions -rw-r--r--
tests: don't use "status" operand of dd in test-censor.t (issue6858) Some implementations don't have this operand, let's just direct stderr into /dev/null, that's pretty cross-platform. Also specify bs=512 (the default for me), because the default might be different on different systems. Other uses of dd in the tests do specify it, so this is more consistent.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     1
# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     2
#
46819
d4ba4d51f85f contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents: 46319
diff changeset
     3
#  Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     4
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
     6
# GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
     7
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
     8
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
     9
import locale
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    10
import os
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
    11
import re
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    12
import unicodedata
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    13
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    14
from . import (
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    15
    error,
32411
df448de7cf3b parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents: 32339
diff changeset
    16
    policy,
30031
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    17
    pycompat,
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    18
)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    19
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    20
from .pure import charencode as charencodepure
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
    21
43793
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43725
diff changeset
    22
if pycompat.TYPE_CHECKING:
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    23
    from typing import (
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    24
        Any,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    25
        Callable,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    26
        List,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    27
        Text,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    28
        Type,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    29
        TypeVar,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    30
        Union,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    31
    )
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    32
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    33
    # keep pyflakes happy
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    34
    for t in (Any, Callable, List, Text, Type, Union):
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    35
        assert t
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    36
43720
3364a15f61f0 typing: fix forward reference in _Tlocalstr type bound
Yuya Nishihara <yuya@tcha.org>
parents: 43719
diff changeset
    37
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    38
43554
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43551
diff changeset
    39
charencode = policy.importmod('charencode')
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    40
33944
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33943
diff changeset
    41
isasciistr = charencode.isasciistr
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    42
asciilower = charencode.asciilower
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    43
asciiupper = charencode.asciiupper
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
    44
_jsonescapeu8fast = charencode.jsonescapeu8fast
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    45
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    46
_sysstr = pycompat.sysstr
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    47
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
    48
unichr = chr
28507
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    49
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    50
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    51
# "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    52
# sanity.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    53
_ignore = [
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    54
    unichr(int(x, 16)).encode("utf-8")
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    55
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    56
    b"206a 206b 206c 206d 206e 206f feff".split()
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    57
]
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    58
# verify the next function will work
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    59
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    60
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    61
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    62
def hfsignoreclean(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    63
    # type: (bytes) -> bytes
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    64
    """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    65
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    66
    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    67
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    68
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    69
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    70
    """
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    71
    if b"\xe2" in s or b"\xef" in s:
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    72
        for c in _ignore:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    73
            s = s.replace(c, b'')
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    74
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    75
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    76
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    77
# encoding.environ is provided read-only, which may not be used to modify
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    78
# the process environment
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
    79
_nativeenviron = os.supports_bytes_environ
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
    80
if _nativeenviron:
32231
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32205
diff changeset
    81
    environ = os.environb  # re-exports
51000
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    82
    if pycompat.sysplatform == b'OpenVMS':
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    83
        # workaround for a bug in VSI 3.10 port
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    84
        # os.environb is only populated with a few Predefined symbols
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    85
        def newget(self, key, default=None):
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    86
            # pytype on linux does not understand OpenVMS special modules
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    87
            import _decc  # pytype: disable=import-error
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    88
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    89
            v = _decc.getenv(key, None)
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    90
            if isinstance(key, bytes):
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    91
                return default if v is None else v.encode('latin-1')
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    92
            else:
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    93
                return default if v is None else v
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    94
80c243eab724 openvms: duck-punch a bugfix into `environb` object
Jean-Francois Pieronne <jf.pieronne@laposte.net>
parents: 50952
diff changeset
    95
        environ.__class__.get = newget
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    96
else:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    97
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    98
    # and recreate it once encoding is settled
44470
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
    99
    environ = {
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
   100
        k.encode('utf-8'): v.encode('utf-8')
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   101
        for k, v in os.environ.items()  # re-exports
44470
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
   102
    }
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   103
39844
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39824
diff changeset
   104
_encodingrewrites = {
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   105
    b'646': b'ascii',
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   106
    b'ANSI_X3.4-1968': b'ascii',
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
   107
}
37883
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   108
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   109
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   110
# https://bugs.python.org/issue13216
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   111
if pycompat.iswindows:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   112
    _encodingrewrites[b'cp65001'] = b'utf-8'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   113
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   114
try:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   115
    encoding = environ.get(b"HGENCODING")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   116
    if not encoding:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   117
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
39844
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39824
diff changeset
   118
        encoding = _encodingrewrites.get(encoding, encoding)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   119
except locale.Error:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   120
    encoding = b'ascii'
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   121
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   122
fallbackencoding = b'ISO-8859-1'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   123
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   124
33832
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33782
diff changeset
   125
class localstr(bytes):
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   126
    """This class allows strings that are unmodified to be
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   127
    round-tripped to the local encoding and back"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   128
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   129
    def __new__(cls, u, l):
33832
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33782
diff changeset
   130
        s = bytes.__new__(cls, l)
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   131
        s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   132
        return s
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   133
43793
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43725
diff changeset
   134
    if pycompat.TYPE_CHECKING:
43725
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   135
        # pseudo implementation to help pytype see localstr() constructor
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   136
        def __init__(self, u, l):
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   137
            # type: (bytes, bytes) -> None
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   138
            super(localstr, self).__init__(l)
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   139
            self._utf8 = u
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   140
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   141
    def __hash__(self):
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   142
        return hash(self._utf8)  # avoid collisions in local string space
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   143
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   144
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   145
class safelocalstr(bytes):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   146
    """Tagged string denoting it was previously an internal UTF-8 string,
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   147
    and can be converted back to UTF-8 losslessly
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   148
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   149
    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   150
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   151
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   152
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   153
    """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   154
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   155
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   156
def tolocal(s):
43721
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43720
diff changeset
   157
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   158
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   159
    Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   160
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   161
    All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   162
    implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   163
    other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   164
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   165
    replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   166
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   167
    The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   168
    strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   169
    round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   170
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   171
    >>> u = b'foo: \\xc3\\xa4' # utf-8
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   172
    >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   173
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   174
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   175
    >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   176
    'foo: \\xc3\\xa4'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   177
    >>> u2 = b'foo: \\xc3\\xa1'
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   178
    >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   179
    >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   180
    2
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   181
    >>> b'foo: ?' in d
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   182
    False
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   183
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   184
    >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   185
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   186
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   187
    >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   188
    'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   189
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   190
33945
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   191
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   192
        return s
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   193
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   194
    try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   195
        try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   196
            # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   197
            u = s.decode('UTF-8')
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   198
            if encoding == b'UTF-8':
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   199
                # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   200
                return s
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   201
            r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   202
            if u == r.decode(_sysstr(encoding)):
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   203
                # r is a safe, non-lossy encoding of s
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   204
                return safelocalstr(r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   205
            return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   206
        except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   207
            # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   208
            try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   209
                u = s.decode(_sysstr(fallbackencoding))
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   210
                r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   211
                if u == r.decode(_sysstr(encoding)):
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   212
                    # r is a safe, non-lossy encoding of s
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   213
                    return safelocalstr(r)
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   214
                return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   215
            except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   216
                u = s.decode("utf-8", "replace")  # last ditch
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   217
                # can't round-trip
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   218
                return u.encode(_sysstr(encoding), "replace")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   219
    except LookupError as k:
45681
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   220
        raise error.Abort(
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   221
            pycompat.bytestr(k), hint=b"please check your locale settings"
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   222
        )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   223
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   224
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   225
def fromlocal(s):
43681
7edc07fb890c encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents: 43554
diff changeset
   226
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   227
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   228
    Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   229
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   230
    We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   231
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   232
    characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   233
    'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   234
    Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   235
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   236
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   237
    # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   238
    if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   239
        return s._utf8
33945
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   240
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   241
        return s
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   242
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   243
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   244
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   245
        return u.encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   246
    except UnicodeDecodeError as inst:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   247
        sub = s[max(0, inst.start - 10) : inst.start + 10]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   248
        raise error.Abort(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   249
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   250
        )
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   251
    except LookupError as k:
48030
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   252
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   253
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   254
        )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   255
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   256
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   257
def unitolocal(u):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   258
    # type: (Text) -> bytes
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   259
    """Convert a unicode string to a byte string of local encoding"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   260
    return tolocal(u.encode('utf-8'))
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   261
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   262
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   263
def unifromlocal(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   264
    # type: (bytes) -> Text
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   265
    """Convert a byte string of local encoding to a unicode string"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   266
    return fromlocal(s).decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   267
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   268
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   269
def unimethod(bytesfunc):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   270
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   271
    """Create a proxy method that forwards __unicode__() and __str__() of
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   272
    Python 3 to __bytes__()"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   273
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   274
    def unifunc(obj):
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   275
        return unifromlocal(bytesfunc(obj))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   276
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   277
    return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   278
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   279
31457
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   280
# converter functions between native str and byte string. use these if the
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   281
# character encoding is not aware (e.g. exception message) or is known to
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   282
# be locale dependent (e.g. date formatting.)
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   283
strtolocal = unitolocal
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   284
strfromlocal = unifromlocal
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   285
strmethod = unimethod
31457
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   286
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   287
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   288
def lower(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   289
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   290
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   291
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   292
        return asciilower(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   293
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   294
        pass
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   295
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   296
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   297
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   298
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   299
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   300
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   301
        lu = u.lower()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   302
        if u == lu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   303
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   304
        return lu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   305
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   306
        return s.lower()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   307
    except LookupError as k:
48030
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   308
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   309
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   310
        )
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   311
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   312
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   313
def upper(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   314
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   315
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   316
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   317
        return asciiupper(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   318
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   319
        return upperfallback(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   320
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   321
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   322
def upperfallback(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   323
    # type: (Any) -> Any
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   324
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   325
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   326
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   327
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   328
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   329
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   330
        uu = u.upper()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   331
        if u == uu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   332
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   333
        return uu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   334
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   335
        return s.upper()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   336
    except LookupError as k:
48030
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   337
        raise error.Abort(
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   338
            pycompat.bytestr(k), hint=b"please check your locale settings"
28c62f83b652 encoding: force a few Errors to bytes before passing to `error.Abort`
Matt Harbison <matt_harbison@yahoo.com>
parents: 47621
diff changeset
   339
        )
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   340
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   341
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   342
if not _nativeenviron:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   343
    # now encoding and helper functions are available, recreate the environ
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   344
    # dict to be exported to other modules
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   345
    if pycompat.iswindows:
47560
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   346
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   347
        class WindowsEnviron(dict):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   348
            """`os.environ` normalizes environment variables to uppercase on windows"""
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   349
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   350
            def get(self, key, default=None):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   351
                return super().get(upper(key), default)
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   352
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   353
        environ = WindowsEnviron()
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   354
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   355
    for k, v in os.environ.items():  # re-exports
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   356
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   357
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   358
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   359
DRIVE_RE = re.compile(b'^[a-z]:')
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   360
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   361
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   362
# returns bytes.
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   363
if pycompat.iswindows:
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   364
    # Python 3 on Windows issues a DeprecationWarning about using the bytes
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   365
    # API when os.getcwdb() is called.
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   366
    #
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   367
    # Additionally, py3.8+ uppercases the drive letter when calling
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   368
    # os.path.realpath(), which is used on ``repo.root``.  Since those
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   369
    # strings are compared in various places as simple strings, also call
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   370
    # realpath here.  See https://bugs.python.org/issue40368
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   371
    #
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   372
    # However this is not reliable, so lets explicitly make this drive
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   373
    # letter upper case.
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   374
    #
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   375
    # note: we should consider dropping realpath here since it seems to
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   376
    # change the semantic of `getcwd`.
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   377
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   378
    def getcwd():
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   379
        cwd = os.getcwd()  # re-exports
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   380
        cwd = os.path.realpath(cwd)
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   381
        cwd = strtolocal(cwd)
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   382
        if DRIVE_RE.match(cwd):
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   383
            cwd = cwd[0:1].upper() + cwd[1:]
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   384
        return cwd
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   385
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   386
39823
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   387
else:
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   388
    getcwd = os.getcwdb  # re-exports
39823
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   389
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   390
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   391
_wide = _sysstr(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   392
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   393
    and b"WFA"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   394
    or b"WF"
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   395
)
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   396
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   397
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   398
def colwidth(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   399
    # type: (bytes) -> int
43807
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43793
diff changeset
   400
    """Find the column width of a string for display in the local encoding"""
43554
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43551
diff changeset
   401
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   402
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   403
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   404
def ucolwidth(d):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   405
    # type: (Text) -> int
43807
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43793
diff changeset
   406
    """Find the column width of a Unicode string for display"""
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   407
    eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   408
    if eaw is not None:
32570
044f3d7eb9ae encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents: 32562
diff changeset
   409
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   410
    return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   411
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   412
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   413
def getcols(s, start, c):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   414
    # type: (bytes, int, int) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   415
    """Use colwidth to find a c-column substring of s starting at byte
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   416
    index start"""
49292
d44e3c45f0e4 py3: replace `pycompat.xrange` by `range`
Manuel Jacob <me@manueljacob.de>
parents: 49037
diff changeset
   417
    for x in range(start + c, len(s)):
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   418
        t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   419
        if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   420
            return t
43719
7cf332318f62 encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
   421
    raise ValueError('substring not found')
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   422
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   423
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   424
def trim(s, width, ellipsis=b'', leftside=False):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   425
    # type: (bytes, int, bytes, bool) -> bytes
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   426
    """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   427
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   428
    If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   429
    'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   430
34151
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34150
diff changeset
   431
    >>> from .node import bin
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   432
    >>> def bprint(s):
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   433
    ...     print(pycompat.sysstr(s))
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   434
    >>> ellipsis = b'+++'
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
   435
    >>> from . import encoding
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   436
    >>> encoding.encoding = b'utf-8'
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   437
    >>> t = b'1234567890'
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   438
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   439
    1234567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   440
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   441
    1234567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   442
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   443
    12345+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   444
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   445
    +++67890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   446
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   447
    12345678
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   448
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   449
    34567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   450
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   451
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   452
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   453
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   454
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
34150
e9e225f16932 doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents: 34146
diff changeset
   455
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   456
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   457
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   458
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   459
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   460
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   461
    \xe3\x81\x82\xe3\x81\x84+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   462
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   463
    +++\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   464
    >>> bprint(trim(t, 5))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   465
    \xe3\x81\x82\xe3\x81\x84
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   466
    >>> bprint(trim(t, 5, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   467
    \xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   468
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   469
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   470
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   471
    +++
34151
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34150
diff changeset
   472
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   473
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   474
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   475
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   476
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   477
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   478
    \x11\x22\x33\x44\x55+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   479
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   480
    +++\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   481
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   482
    \x11\x22\x33\x44\x55\x66\x77\x88
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   483
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   484
    \x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   485
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   486
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   487
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   488
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   489
    """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   490
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   491
        u = s.decode(_sysstr(encoding))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   492
    except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   493
        if len(s) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   494
            return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   495
        width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   496
        if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   497
            return ellipsis[: width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   498
        if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   499
            return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   500
        return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   501
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   502
    if ucolwidth(u) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   503
        return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   504
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   505
    width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   506
    if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   507
        return ellipsis[: width + len(ellipsis)]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   508
48692
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   509
    chars = list(u)
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   510
    if leftside:
48692
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   511
        chars.reverse()
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   512
    width_so_far = 0
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   513
    for i, c in enumerate(chars):
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   514
        width_so_far += ucolwidth(c)
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   515
        if width_so_far > width:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   516
            break
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   517
    chars = chars[:i]
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   518
    if leftside:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   519
        chars.reverse()
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   520
    u = u''.join(chars).encode(_sysstr(encoding))
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   521
    if leftside:
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   522
        return ellipsis + u
f1ed5c304f45 encoding: fix trim() to be O(n) instead of O(n^2)
Martin von Zweigbergk <martinvonz@google.com>
parents: 48030
diff changeset
   523
    return u + ellipsis
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   524
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   525
49037
642e31cb55f0 py3: use class X: instead of class X(object):
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48983
diff changeset
   526
class normcasespecs:
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   527
    """what a platform's normcase does to ASCII strings
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   528
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   529
    This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   530
    on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   531
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   532
    lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   533
    upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   534
    other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   535
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   536
    This should be kept in sync with normcase_spec in util.h."""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   537
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   538
    lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   539
    upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   540
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   541
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   542
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   543
def jsonescape(s, paranoid=False):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   544
    # type: (Any, Any) -> Any
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   545
    """returns a string suitable for JSON
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   546
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   547
    JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   548
    bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   549
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   550
    - localstr/safelocalstr objects are converted back to UTF-8
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   551
    - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   552
    - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   553
    - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   554
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   555
    (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   556
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   557
    >>> jsonescape(b'this is a test')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   558
    'this is a test'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   559
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
27881
ffa599f3f503 encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 27699
diff changeset
   560
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   561
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   562
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   563
    >>> jsonescape(b'a weird byte: \\xdd')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   564
    'a weird byte: \\xed\\xb3\\x9d'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   565
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   566
    'utf-8: caf\\xc3\\xa9'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   567
    >>> jsonescape(b'')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   568
    ''
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   569
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   570
    If paranoid, non-ascii and common troublesome characters are also escaped.
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   571
    This is suitable for web output.
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   572
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   573
    >>> s = b'escape characters: \\0 \\x0b \\x7f'
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   574
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   575
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   576
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   577
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   578
    'escape boundary: ~ \\\\u007f \\\\u0080'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   579
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   580
    'a weird byte: \\\\udcdd'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   581
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   582
    'utf-8: caf\\\\u00e9'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   583
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   584
    'non-BMP: \\\\ud834\\\\udd1e'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   585
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   586
    '\\\\u003cfoo@example.org\\\\u003e'
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   587
    """
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   588
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   589
    u8chars = toutf8b(s)
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   590
    try:
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   591
        return _jsonescapeu8fast(u8chars, paranoid)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   592
    except ValueError:
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   593
        pass
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   594
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   595
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   596
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   597
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   598
# bytes are mapped to that range.
48983
fa2b1a46d92e encoding: remove Python 2 support code
Gregory Szorc <gregory.szorc@gmail.com>
parents: 48966
diff changeset
   599
_utf8strict = r'surrogatepass'
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   600
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   601
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   602
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   603
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   604
def getutf8char(s, pos):
43722
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43721
diff changeset
   605
    # type: (bytes, int) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   606
    """get the next full utf-8 character in the given string, starting at pos
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   607
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   608
    Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   609
    utf-8 character.
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   610
    """
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   611
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   612
    # find how many bytes to attempt decoding from first nibble
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   613
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   614
    if not l:  # ascii
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   615
        return s[pos : pos + 1]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   616
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   617
    c = s[pos : pos + l]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   618
    # validate with attempted decode
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   619
    c.decode("utf-8", _utf8strict)
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   620
    return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   621
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   622
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   623
def toutf8b(s):
43722
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43721
diff changeset
   624
    # type: (bytes) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   625
    """convert a local, possibly-binary string into UTF-8b
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   626
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   627
    This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   628
    with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   629
    arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   630
    what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   631
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   632
    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   633
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   634
    uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   635
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   636
    Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   637
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
   638
    - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   639
      by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   640
    - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   641
      be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   642
    - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   643
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   644
      Unicode data they want
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   645
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   646
    - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   647
      filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   648
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   649
    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   650
    arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   651
    re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   652
    internal surrogate encoding as a UTF-8 string.)
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   653
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   654
37990
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   655
    if isinstance(s, localstr):
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   656
        # assume that the original UTF-8 sequence would never contain
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   657
        # invalid characters in U+DCxx range
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   658
        return s._utf8
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   659
    elif isinstance(s, safelocalstr):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   660
        # already verified that s is non-lossy in legacy encoding, which
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   661
        # shouldn't contain characters in U+DCxx range
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   662
        return fromlocal(s)
37990
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   663
    elif isasciistr(s):
33946
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   664
        return s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   665
    if b"\xed" not in s:
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   666
        try:
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   667
            s.decode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   668
            return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   669
        except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   670
            pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   671
34223
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34207
diff changeset
   672
    s = pycompat.bytestr(s)
50434
95acba2c29f6 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents: 49292
diff changeset
   673
    r = bytearray()
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   674
    pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   675
    l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   676
    while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   677
        try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   678
            c = getutf8char(s, pos)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   679
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   680
                # have to re-escape existing U+DCxx characters
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   681
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   682
                pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   683
            else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   684
                pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   685
        except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   686
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   687
            pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   688
        r += c
50434
95acba2c29f6 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents: 49292
diff changeset
   689
    return bytes(r)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   690
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   691
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   692
def fromutf8b(s):
43721
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43720
diff changeset
   693
    # type: (bytes) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   694
    """Given a UTF-8b string, return a local, possibly-binary string.
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   695
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   696
    return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   697
    is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   698
    that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   699
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   700
    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   701
    >>> m = b"\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   702
    >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   703
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   704
    >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   705
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   706
    >>> roundtrip(b"\\xc2\\xc2\\x80")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   707
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   708
    >>> roundtrip(b"\\xef\\xbf\\xbd")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   709
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   710
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   711
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   712
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   713
    True
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   714
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   715
33946
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   716
    if isasciistr(s):
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   717
        return s
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   718
    # fast path - look for uDxxx prefixes in s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   719
    if b"\xed" not in s:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   720
        return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   721
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   722
    # We could do this with the unicode type but some Python builds
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   723
    # use UTF-16 internally (issue5031) which causes non-BMP code
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   724
    # points to be escaped. Instead, we use our handy getutf8char
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   725
    # helper again to walk the string without "decoding" it.
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   726
34223
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34207
diff changeset
   727
    s = pycompat.bytestr(s)
50434
95acba2c29f6 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents: 49292
diff changeset
   728
    r = bytearray()
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   729
    pos = 0
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   730
    l = len(s)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   731
    while pos < l:
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   732
        c = getutf8char(s, pos)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   733
        pos += len(c)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   734
        # unescape U+DCxx characters
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   735
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   736
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   737
        r += c
50434
95acba2c29f6 encoding: avoid quadratic time complexity when json-encoding non-UTF8 strings
Arseniy Alekseyev <aalekseyev@janestreet.com>
parents: 49292
diff changeset
   738
    return bytes(r)