mercurial/encoding.py
author Pierre-Yves David <pierre-yves.david@octobus.net>
Tue, 24 Aug 2021 22:07:50 +0200
changeset 47939 74a8d5c6fdc6
parent 47621 d6ee6456bd5f
child 48030 28c62f83b652
permissions -rw-r--r--
dirstate: also wrap the new method in `dirstatenonnormalcheck` The goal of this is to make sure we set the data right, so we need to make sure it run after the new method, that we actually call, in addition to the old one, that we no longer call. Differential Revision: https://phab.mercurial-scm.org/D11343
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     1
# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     2
#
46819
d4ba4d51f85f contributor: change mentions of mpm to olivia
Raphaël Gomès <rgomes@octobus.net>
parents: 46319
diff changeset
     3
#  Copyright 2005-2009 Olivia Mackall <olivia@selenic.com> and others
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     4
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
     6
# GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
     7
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
     8
from __future__ import absolute_import, print_function
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
     9
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    10
import locale
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    11
import os
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
    12
import re
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    13
import unicodedata
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    14
43089
c59eb1560c44 py3: manually import getattr where it is needed
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    15
from .pycompat import getattr
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    16
from . import (
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    17
    error,
32411
df448de7cf3b parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents: 32339
diff changeset
    18
    policy,
30031
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    19
    pycompat,
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    20
)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    21
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    22
from .pure import charencode as charencodepure
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
    23
43793
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43725
diff changeset
    24
if pycompat.TYPE_CHECKING:
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    25
    from typing import (
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    26
        Any,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    27
        Callable,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    28
        List,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    29
        Text,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    30
        Type,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    31
        TypeVar,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    32
        Union,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    33
    )
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    34
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    35
    # keep pyflakes happy
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    36
    for t in (Any, Callable, List, Text, Type, Union):
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    37
        assert t
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    38
43720
3364a15f61f0 typing: fix forward reference in _Tlocalstr type bound
Yuya Nishihara <yuya@tcha.org>
parents: 43719
diff changeset
    39
    _Tlocalstr = TypeVar('_Tlocalstr', bound='localstr')
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    40
43554
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43551
diff changeset
    41
charencode = policy.importmod('charencode')
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    42
33944
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33943
diff changeset
    43
isasciistr = charencode.isasciistr
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    44
asciilower = charencode.asciilower
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    45
asciiupper = charencode.asciiupper
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
    46
_jsonescapeu8fast = charencode.jsonescapeu8fast
33782
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33038
diff changeset
    47
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    48
_sysstr = pycompat.sysstr
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
    49
30031
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    50
if pycompat.ispy3:
28507
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    51
    unichr = chr
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    52
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    53
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    54
# "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    55
# sanity.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    56
_ignore = [
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    57
    unichr(int(x, 16)).encode("utf-8")
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    58
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    59
    b"206a 206b 206c 206d 206e 206f feff".split()
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    60
]
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    61
# verify the next function will work
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    62
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    63
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    64
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    65
def hfsignoreclean(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
    66
    # type: (bytes) -> bytes
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    67
    """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    68
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    69
    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    70
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    71
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    72
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    73
    """
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    74
    if b"\xe2" in s or b"\xef" in s:
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    75
        for c in _ignore:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    76
            s = s.replace(c, b'')
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    77
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    78
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    79
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    80
# encoding.environ is provided read-only, which may not be used to modify
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    81
# the process environment
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    82
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    83
if not pycompat.ispy3:
32231
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32205
diff changeset
    84
    environ = os.environ  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    85
elif _nativeenviron:
32231
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32205
diff changeset
    86
    environ = os.environb  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    87
else:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    88
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    89
    # and recreate it once encoding is settled
44470
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
    90
    environ = {
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
    91
        k.encode('utf-8'): v.encode('utf-8')
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
    92
        for k, v in os.environ.items()  # re-exports
44470
9d2b2df2c2ba cleanup: run pyupgrade on our source tree to clean up varying things
Augie Fackler <augie@google.com>
parents: 43807
diff changeset
    93
    }
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    94
39844
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39824
diff changeset
    95
_encodingrewrites = {
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    96
    b'646': b'ascii',
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    97
    b'ANSI_X3.4-1968': b'ascii',
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    98
}
37883
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
    99
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   100
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   101
# https://bugs.python.org/issue13216
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   102
if pycompat.iswindows and not pycompat.ispy3:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   103
    _encodingrewrites[b'cp65001'] = b'utf-8'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   104
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   105
try:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   106
    encoding = environ.get(b"HGENCODING")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   107
    if not encoding:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   108
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
39844
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39824
diff changeset
   109
        encoding = _encodingrewrites.get(encoding, encoding)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   110
except locale.Error:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   111
    encoding = b'ascii'
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   112
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   113
fallbackencoding = b'ISO-8859-1'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   114
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   115
33832
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33782
diff changeset
   116
class localstr(bytes):
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   117
    """This class allows strings that are unmodified to be
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   118
    round-tripped to the local encoding and back"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   119
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   120
    def __new__(cls, u, l):
33832
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33782
diff changeset
   121
        s = bytes.__new__(cls, l)
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   122
        s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   123
        return s
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   124
43793
7b14d649af1b typing: consolidate "if not globals():" trick
Yuya Nishihara <yuya@tcha.org>
parents: 43725
diff changeset
   125
    if pycompat.TYPE_CHECKING:
43725
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   126
        # pseudo implementation to help pytype see localstr() constructor
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   127
        def __init__(self, u, l):
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   128
            # type: (bytes, bytes) -> None
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   129
            super(localstr, self).__init__(l)
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   130
            self._utf8 = u
da925257a39e typing: add pseudo localstr.__init__() to help pytype
Yuya Nishihara <yuya@tcha.org>
parents: 43724
diff changeset
   131
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   132
    def __hash__(self):
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   133
        return hash(self._utf8)  # avoid collisions in local string space
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   134
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   135
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   136
class safelocalstr(bytes):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   137
    """Tagged string denoting it was previously an internal UTF-8 string,
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   138
    and can be converted back to UTF-8 losslessly
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   139
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   140
    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   141
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   142
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   143
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   144
    """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   145
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   146
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   147
def tolocal(s):
43721
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43720
diff changeset
   148
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   149
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   150
    Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   151
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   152
    All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   153
    implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   154
    other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   155
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   156
    replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   157
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   158
    The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   159
    strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   160
    round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   161
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   162
    >>> u = b'foo: \\xc3\\xa4' # utf-8
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   163
    >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   164
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   165
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   166
    >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   167
    'foo: \\xc3\\xa4'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   168
    >>> u2 = b'foo: \\xc3\\xa1'
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   169
    >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   170
    >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   171
    2
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   172
    >>> b'foo: ?' in d
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   173
    False
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   174
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   175
    >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   176
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   177
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   178
    >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   179
    'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   180
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   181
33945
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   182
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   183
        return s
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   184
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   185
    try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   186
        try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   187
            # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   188
            u = s.decode('UTF-8')
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   189
            if encoding == b'UTF-8':
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   190
                # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   191
                return s
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   192
            r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   193
            if u == r.decode(_sysstr(encoding)):
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   194
                # r is a safe, non-lossy encoding of s
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   195
                return safelocalstr(r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   196
            return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   197
        except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   198
            # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   199
            try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   200
                u = s.decode(_sysstr(fallbackencoding))
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   201
                r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   202
                if u == r.decode(_sysstr(encoding)):
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   203
                    # r is a safe, non-lossy encoding of s
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   204
                    return safelocalstr(r)
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   205
                return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   206
            except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   207
                u = s.decode("utf-8", "replace")  # last ditch
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   208
                # can't round-trip
43551
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43544
diff changeset
   209
                return u.encode(_sysstr(encoding), "replace")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   210
    except LookupError as k:
45681
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   211
        raise error.Abort(
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   212
            pycompat.bytestr(k), hint=b"please check your locale settings"
a736ab681b78 errors: stop passing non-strings to Abort's constructor
Martin von Zweigbergk <martinvonz@google.com>
parents: 44470
diff changeset
   213
        )
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   214
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   215
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   216
def fromlocal(s):
43681
7edc07fb890c encoding: fix bad type annotation
Augie Fackler <augie@google.com>
parents: 43554
diff changeset
   217
    # type: (bytes) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   218
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   219
    Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   220
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   221
    We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   222
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   223
    characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   224
    'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   225
    Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   226
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   227
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   228
    # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   229
    if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   230
        return s._utf8
33945
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   231
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33944
diff changeset
   232
        return s
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   233
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   234
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   235
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   236
        return u.encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   237
    except UnicodeDecodeError as inst:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   238
        sub = s[max(0, inst.start - 10) : inst.start + 10]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   239
        raise error.Abort(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   240
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   241
        )
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   242
    except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   243
        raise error.Abort(k, hint=b"please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   244
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   245
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   246
def unitolocal(u):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   247
    # type: (Text) -> bytes
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   248
    """Convert a unicode string to a byte string of local encoding"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   249
    return tolocal(u.encode('utf-8'))
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   250
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   251
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   252
def unifromlocal(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   253
    # type: (bytes) -> Text
31456
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   254
    """Convert a byte string of local encoding to a unicode string"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   255
    return fromlocal(s).decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30627
diff changeset
   256
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   257
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   258
def unimethod(bytesfunc):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   259
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   260
    """Create a proxy method that forwards __unicode__() and __str__() of
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   261
    Python 3 to __bytes__()"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   262
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   263
    def unifunc(obj):
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   264
        return unifromlocal(bytesfunc(obj))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   265
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   266
    return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   267
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   268
31457
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   269
# converter functions between native str and byte string. use these if the
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   270
# character encoding is not aware (e.g. exception message) or is known to
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   271
# be locale dependent (e.g. date formatting.)
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   272
if pycompat.ispy3:
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   273
    strtolocal = unitolocal
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   274
    strfromlocal = unifromlocal
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   275
    strmethod = unimethod
31457
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   276
else:
43517
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   277
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   278
    def strtolocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   279
        # type: (str) -> bytes
43723
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43722
diff changeset
   280
        return s  # pytype: disable=bad-return-type
43517
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   281
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   282
    def strfromlocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   283
        # type: (bytes) -> str
43723
7f51bc36194d typing: suppress error of py2 encoding.strtolocal() and .strfromlocal()
Yuya Nishihara <yuya@tcha.org>
parents: 43722
diff changeset
   284
        return s  # pytype: disable=bad-return-type
43517
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   285
33038
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32570
diff changeset
   286
    strmethod = pycompat.identity
31457
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31456
diff changeset
   287
47559
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   288
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   289
def lower(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   290
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   291
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   292
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   293
        return asciilower(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   294
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   295
        pass
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   296
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   297
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   298
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   299
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   300
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   301
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   302
        lu = u.lower()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   303
        if u == lu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   304
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   305
        return lu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   306
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   307
        return s.lower()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   308
    except LookupError as k:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   309
        raise error.Abort(k, hint=b"please check your locale settings")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   310
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   311
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   312
def upper(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   313
    # type: (bytes) -> bytes
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   314
    """best-effort encoding-aware case-folding of local string s"""
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   315
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   316
        return asciiupper(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   317
    except UnicodeDecodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   318
        return upperfallback(s)
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   319
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   320
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   321
def upperfallback(s):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   322
    # type: (Any) -> Any
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   323
    try:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   324
        if isinstance(s, localstr):
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   325
            u = s._utf8.decode("utf-8")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   326
        else:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   327
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   328
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   329
        uu = u.upper()
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   330
        if u == uu:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   331
            return s  # preserve localstring
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   332
        return uu.encode(_sysstr(encoding))
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   333
    except UnicodeError:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   334
        return s.upper()  # we don't know how to fold this except in ASCII
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   335
    except LookupError as k:
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   336
        raise error.Abort(k, hint=b"please check your locale settings")
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   337
53a864a60281 encoding: move case-related utils up
Raphaël Gomès <rgomes@octobus.net>
parents: 46819
diff changeset
   338
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   339
if not _nativeenviron:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   340
    # now encoding and helper functions are available, recreate the environ
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   341
    # dict to be exported to other modules
47560
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   342
    if pycompat.iswindows and pycompat.ispy3:
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   343
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   344
        class WindowsEnviron(dict):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   345
            """`os.environ` normalizes environment variables to uppercase on windows"""
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   346
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   347
            def get(self, key, default=None):
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   348
                return super().get(upper(key), default)
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   349
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   350
        environ = WindowsEnviron()
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   351
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   352
    for k, v in os.environ.items():  # re-exports
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   353
        environ[tolocal(k.encode('utf-8'))] = tolocal(v.encode('utf-8'))
af633293a5bd windows: replicate the normalizing behavior of os.environ
Raphaël Gomès <rgomes@octobus.net>
parents: 47559
diff changeset
   354
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   355
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   356
DRIVE_RE = re.compile(b'^[a-z]:')
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   357
39823
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   358
if pycompat.ispy3:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   359
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   360
    # returns bytes.
39824
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39823
diff changeset
   361
    if pycompat.iswindows:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39823
diff changeset
   362
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39823
diff changeset
   363
        # API when os.getcwdb() is called.
46319
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45957
diff changeset
   364
        #
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45957
diff changeset
   365
        # Additionally, py3.8+ uppercases the drive letter when calling
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45957
diff changeset
   366
        # os.path.realpath(), which is used on ``repo.root``.  Since those
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45957
diff changeset
   367
        # strings are compared in various places as simple strings, also call
3dfebba99ef6 windows: wrap `os.getcwd()` in `os.path.realpath()` on py3
Matt Harbison <matt_harbison@yahoo.com>
parents: 45957
diff changeset
   368
        # realpath here.  See https://bugs.python.org/issue40368
47621
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   369
        #
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   370
        # However this is not reliable, so lets explicitly make this drive
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   371
        # letter upper case.
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   372
        #
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   373
        # note: we should consider dropping realpath here since it seems to
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   374
        # change the semantic of `getcwd`.
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   375
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   376
        def getcwd():
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   377
            cwd = os.getcwd()  # re-exports
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   378
            cwd = os.path.realpath(cwd)
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   379
            cwd = strtolocal(cwd)
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   380
            if DRIVE_RE.match(cwd):
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   381
                cwd = cwd[0:1].upper() + cwd[1:]
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   382
            return cwd
d6ee6456bd5f windows: enforce upper case drive letter for getcwd in mercurial too
Pierre-Yves David <pierre-yves.david@octobus.net>
parents: 47560
diff changeset
   383
39824
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39823
diff changeset
   384
    else:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39823
diff changeset
   385
        getcwd = os.getcwdb  # re-exports
39823
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   386
else:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   387
    getcwd = os.getcwd  # re-exports
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38823
diff changeset
   388
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   389
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   390
_wide = _sysstr(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   391
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   392
    and b"WFA"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   393
    or b"WF"
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   394
)
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   395
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   396
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   397
def colwidth(s):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   398
    # type: (bytes) -> int
43807
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43793
diff changeset
   399
    """Find the column width of a string for display in the local encoding"""
43554
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43551
diff changeset
   400
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   401
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   402
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   403
def ucolwidth(d):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   404
    # type: (Text) -> int
43807
be8552f25cab cleanup: fix docstring formatting
Matt Harbison <matt_harbison@yahoo.com>
parents: 43793
diff changeset
   405
    """Find the column width of a Unicode string for display"""
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   406
    eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   407
    if eaw is not None:
32570
044f3d7eb9ae encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents: 32562
diff changeset
   408
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   409
    return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   410
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   411
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   412
def getcols(s, start, c):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   413
    # type: (bytes, int, int) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   414
    """Use colwidth to find a c-column substring of s starting at byte
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   415
    index start"""
38823
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
   416
    for x in pycompat.xrange(start + c, len(s)):
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   417
        t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   418
        if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   419
            return t
43719
7cf332318f62 encoding: make getcols() raise exception explicitly
Yuya Nishihara <yuya@tcha.org>
parents: 43681
diff changeset
   420
    raise ValueError('substring not found')
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   421
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   422
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   423
def trim(s, width, ellipsis=b'', leftside=False):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   424
    # type: (bytes, int, bytes, bool) -> bytes
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   425
    """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   426
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   427
    If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   428
    'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   429
34151
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34150
diff changeset
   430
    >>> from .node import bin
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   431
    >>> def bprint(s):
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   432
    ...     print(pycompat.sysstr(s))
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   433
    >>> ellipsis = b'+++'
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
   434
    >>> from . import encoding
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   435
    >>> encoding.encoding = b'utf-8'
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   436
    >>> t = b'1234567890'
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   437
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   438
    1234567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   439
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   440
    1234567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   441
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   442
    12345+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   443
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   444
    +++67890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   445
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   446
    12345678
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   447
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   448
    34567890
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   449
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   450
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   451
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   452
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   453
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
34150
e9e225f16932 doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents: 34146
diff changeset
   454
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   455
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   456
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   457
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   458
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   459
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   460
    \xe3\x81\x82\xe3\x81\x84+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   461
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   462
    +++\xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   463
    >>> bprint(trim(t, 5))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   464
    \xe3\x81\x82\xe3\x81\x84
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   465
    >>> bprint(trim(t, 5, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   466
    \xe3\x81\x88\xe3\x81\x8a
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   467
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   468
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   469
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   470
    +++
34151
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34150
diff changeset
   471
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   472
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   473
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   474
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   475
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   476
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   477
    \x11\x22\x33\x44\x55+++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   478
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   479
    +++\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   480
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   481
    \x11\x22\x33\x44\x55\x66\x77\x88
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   482
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   483
    \x33\x44\x55\x66\x77\x88\x99\xaa
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   484
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   485
    +++
34152
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34151
diff changeset
   486
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   487
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   488
    """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   489
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   490
        u = s.decode(_sysstr(encoding))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   491
    except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   492
        if len(s) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   493
            return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   494
        width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   495
        if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   496
            return ellipsis[: width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   497
        if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   498
            return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   499
        return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   500
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   501
    if ucolwidth(u) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   502
        return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   503
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   504
    width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   505
    if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   506
        return ellipsis[: width + len(ellipsis)]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   507
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   508
    if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   509
        uslice = lambda i: u[i:]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   510
        concat = lambda s: ellipsis + s
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   511
    else:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   512
        uslice = lambda i: u[:-i]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   513
        concat = lambda s: s + ellipsis
38823
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
   514
    for i in pycompat.xrange(1, len(u)):
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   515
        usub = uslice(i)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   516
        if ucolwidth(usub) <= width:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30031
diff changeset
   517
            return concat(usub.encode(_sysstr(encoding)))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   518
    return ellipsis  # no enough room for multi-column characters
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   519
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   520
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   521
class normcasespecs(object):
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   522
    """what a platform's normcase does to ASCII strings
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   523
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   524
    This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   525
    on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   526
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   527
    lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   528
    upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   529
    other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   530
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   531
    This should be kept in sync with normcase_spec in util.h."""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   532
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   533
    lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   534
    upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   535
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   536
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   537
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   538
def jsonescape(s, paranoid=False):
43544
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43517
diff changeset
   539
    # type: (Any, Any) -> Any
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   540
    """returns a string suitable for JSON
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   541
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   542
    JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   543
    bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   544
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   545
    - localstr/safelocalstr objects are converted back to UTF-8
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   546
    - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   547
    - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   548
    - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   549
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   550
    (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   551
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   552
    >>> jsonescape(b'this is a test')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   553
    'this is a test'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   554
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
27881
ffa599f3f503 encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 27699
diff changeset
   555
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   556
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   557
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   558
    >>> jsonescape(b'a weird byte: \\xdd')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   559
    'a weird byte: \\xed\\xb3\\x9d'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   560
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   561
    'utf-8: caf\\xc3\\xa9'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   562
    >>> jsonescape(b'')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   563
    ''
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   564
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   565
    If paranoid, non-ascii and common troublesome characters are also escaped.
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   566
    This is suitable for web output.
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   567
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   568
    >>> s = b'escape characters: \\0 \\x0b \\x7f'
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   569
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   570
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
33943
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33942
diff changeset
   571
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   572
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   573
    'escape boundary: ~ \\\\u007f \\\\u0080'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   574
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   575
    'a weird byte: \\\\udcdd'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   576
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   577
    'utf-8: caf\\\\u00e9'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   578
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   579
    'non-BMP: \\\\ud834\\\\udd1e'
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   580
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   581
    '\\\\u003cfoo@example.org\\\\u003e'
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   582
    """
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   583
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   584
    u8chars = toutf8b(s)
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   585
    try:
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   586
        return _jsonescapeu8fast(u8chars, paranoid)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   587
    except ValueError:
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   588
        pass
33942
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33873
diff changeset
   589
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   590
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   591
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   592
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   593
# bytes are mapped to that range.
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   594
if pycompat.ispy3:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   595
    _utf8strict = r'surrogatepass'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   596
else:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   597
    _utf8strict = r'strict'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   598
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   599
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   600
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   601
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   602
def getutf8char(s, pos):
43722
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43721
diff changeset
   603
    # type: (bytes, int) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   604
    """get the next full utf-8 character in the given string, starting at pos
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   605
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   606
    Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   607
    utf-8 character.
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   608
    """
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   609
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   610
    # find how many bytes to attempt decoding from first nibble
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   611
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   612
    if not l:  # ascii
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   613
        return s[pos : pos + 1]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   614
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   615
    c = s[pos : pos + l]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   616
    # validate with attempted decode
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   617
    c.decode("utf-8", _utf8strict)
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   618
    return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   619
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   620
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   621
def toutf8b(s):
43722
83a349aaeba3 typing: constrain argument/return types of encoding.toutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43721
diff changeset
   622
    # type: (bytes) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   623
    """convert a local, possibly-binary string into UTF-8b
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   624
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   625
    This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   626
    with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   627
    arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   628
    what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   629
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   630
    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   631
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   632
    uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   633
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   634
    Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   635
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
   636
    - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   637
      by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   638
    - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   639
      be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   640
    - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   641
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   642
      Unicode data they want
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   643
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   644
    - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   645
      filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   646
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   647
    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   648
    arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   649
    re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   650
    internal surrogate encoding as a UTF-8 string.)
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   651
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   652
37990
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   653
    if isinstance(s, localstr):
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   654
        # assume that the original UTF-8 sequence would never contain
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   655
        # invalid characters in U+DCxx range
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   656
        return s._utf8
37991
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   657
    elif isinstance(s, safelocalstr):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   658
        # already verified that s is non-lossy in legacy encoding, which
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   659
        # shouldn't contain characters in U+DCxx range
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37990
diff changeset
   660
        return fromlocal(s)
37990
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36805
diff changeset
   661
    elif isasciistr(s):
33946
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   662
        return s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   663
    if b"\xed" not in s:
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   664
        try:
34225
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34223
diff changeset
   665
            s.decode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   666
            return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   667
        except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   668
            pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   669
34223
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34207
diff changeset
   670
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   671
    r = b""
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   672
    pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   673
    l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   674
    while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   675
        try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   676
            c = getutf8char(s, pos)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   677
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   678
                # have to re-escape existing U+DCxx characters
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   679
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   680
                pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   681
            else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   682
                pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   683
        except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   684
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   685
            pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   686
        r += c
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   687
    return r
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   688
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   689
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   690
def fromutf8b(s):
43721
b65fcccd9100 typing: fix argument type of encoding.tolocal() and .fromutf8b()
Yuya Nishihara <yuya@tcha.org>
parents: 43720
diff changeset
   691
    # type: (bytes) -> bytes
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   692
    """Given a UTF-8b string, return a local, possibly-binary string.
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   693
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   694
    return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   695
    is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   696
    that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   697
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   698
    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   699
    >>> m = b"\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   700
    >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   701
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   702
    >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   703
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   704
    >>> roundtrip(b"\\xc2\\xc2\\x80")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   705
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   706
    >>> roundtrip(b"\\xef\\xbf\\xbd")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   707
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   708
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   709
    True
34146
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33946
diff changeset
   710
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   711
    True
45957
89a2afe31e82 formating: upgrade to black 20.8b1
Augie Fackler <raf@durin42.com>
parents: 45681
diff changeset
   712
    """
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   713
33946
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   714
    if isasciistr(s):
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33945
diff changeset
   715
        return s
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   716
    # fast path - look for uDxxx prefixes in s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   717
    if b"\xed" not in s:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   718
        return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   719
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   720
    # We could do this with the unicode type but some Python builds
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   721
    # use UTF-16 internally (issue5031) which causes non-BMP code
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   722
    # points to be escaped. Instead, we use our handy getutf8char
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   723
    # helper again to walk the string without "decoding" it.
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   724
34223
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34207
diff changeset
   725
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   726
    r = b""
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   727
    pos = 0
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   728
    l = len(s)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   729
    while pos < l:
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   730
        c = getutf8char(s, pos)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   731
        pos += len(c)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   732
        # unescape U+DCxx characters
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   733
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41841
diff changeset
   734
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   735
        r += c
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   736
    return r