mercurial/encoding.py
author Gregory Szorc <gregory.szorc@gmail.com>
Thu, 24 Oct 2019 21:22:08 -0700
changeset 43517 24633444ff32
parent 43506 9f70512ae2cf
child 43637 7edc07fb890c
permissions -rw-r--r--
packaging: remove hg.exe.local file <exe>.local files are used by Windows to set up DLL redirection. But these files are ignored if you embed an application manifest in your binary, which we do. So the existence of this file serves no purpose. So we remove it. Differential Revision: https://phab.mercurial-scm.org/D7160
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     1
# encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     2
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     3
#  Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     4
#
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
     5
# This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
     6
# GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
     7
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
     8
from __future__ import absolute_import, print_function
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
     9
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    10
import locale
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    11
import os
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    12
import unicodedata
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    13
43089
c59eb1560c44 py3: manually import getattr where it is needed
Gregory Szorc <gregory.szorc@gmail.com>
parents: 43077
diff changeset
    14
from .pycompat import getattr
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    15
from . import (
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    16
    error,
32372
df448de7cf3b parsers: switch to policy importer
Yuya Nishihara <yuya@tcha.org>
parents: 32299
diff changeset
    17
    policy,
30030
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    18
    pycompat,
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
    19
)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
    20
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    21
from .pure import charencode as charencodepure
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
    22
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    23
if not globals():  # hide this from non-pytype users
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    24
    from typing import (
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    25
        Any,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    26
        Callable,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    27
        List,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    28
        Text,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    29
        Type,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    30
        TypeVar,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    31
        Union,
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    32
    )
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    33
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    34
    # keep pyflakes happy
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    35
    for t in (Any, Callable, List, Text, Type, Union):
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    36
        assert t
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    37
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    38
    _Tlocalstr = TypeVar('_Tlocalstr', bound=localstr)
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    39
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
    40
charencode = policy.importmod('charencode')
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    41
33926
f4433f2713d0 encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara <yuya@tcha.org>
parents: 33925
diff changeset
    42
isasciistr = charencode.isasciistr
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    43
asciilower = charencode.asciilower
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    44
asciiupper = charencode.asciiupper
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
    45
_jsonescapeu8fast = charencode.jsonescapeu8fast
33761
f5fc54e7e467 encoding: drop circular import by proxying through '<policy>.charencode'
Yuya Nishihara <yuya@tcha.org>
parents: 33022
diff changeset
    46
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
    47
_sysstr = pycompat.sysstr
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
    48
30030
0f6d6fdd3c2a pycompat: provide 'ispy3' constant
Yuya Nishihara <yuya@tcha.org>
parents: 28508
diff changeset
    49
if pycompat.ispy3:
28507
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    50
    unichr = chr
9bcbd9412225 encoding: make HFS+ ignore code Python 3 compatible
Gregory Szorc <gregory.szorc@gmail.com>
parents: 28069
diff changeset
    51
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    52
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    53
# "Unicode Subtleties"), so we need to ignore them in some places for
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    54
# sanity.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    55
_ignore = [
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    56
    unichr(int(x, 16)).encode("utf-8")
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    57
    for x in b"200c 200d 200e 200f 202a 202b 202c 202d 202e "
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    58
    b"206a 206b 206c 206d 206e 206f feff".split()
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    59
]
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    60
# verify the next function will work
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    61
assert all(i.startswith((b"\xe2", b"\xef")) for i in _ignore)
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    62
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    63
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    64
def hfsignoreclean(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
    65
    # type: (bytes) -> bytes
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    66
    """Remove codepoints ignored by HFS+ from s.
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    67
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    68
    >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    69
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    70
    >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8'))
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    71
    '.hg'
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    72
    """
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    73
    if b"\xe2" in s or b"\xef" in s:
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    74
        for c in _ignore:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    75
            s = s.replace(c, b'')
23596
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    76
    return s
885bd7c5c7e3 encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents: 22973
diff changeset
    77
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    78
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    79
# encoding.environ is provided read-only, which may not be used to modify
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    80
# the process environment
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    81
_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    82
if not pycompat.ispy3:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
    83
    environ = os.environ  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    84
elif _nativeenviron:
32184
cf424dae5dc7 check-code: ignore re-exports of os.environ in encoding.py
Yuya Nishihara <yuya@tcha.org>
parents: 32156
diff changeset
    85
    environ = os.environb  # re-exports
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    86
else:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    87
    # preferred encoding isn't known yet; use utf-8 to avoid unicode error
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    88
    # and recreate it once encoding is settled
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    89
    environ = dict(
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
    90
        (k.encode('utf-8'), v.encode('utf-8'))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    91
        for k, v in os.environ.items()  # re-exports
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
    92
    )
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
    93
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
    94
_encodingrewrites = {
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    95
    b'646': b'ascii',
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
    96
    b'ANSI_X3.4-1968': b'ascii',
11892
2be70ca17311 encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents: 11297
diff changeset
    97
}
38615
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
    98
# cp65001 is a Windows variant of utf-8, which isn't supported on Python 2.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
    99
# No idea if it should be rewritten to the canonical name 'utf-8' on Python 3.
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   100
# https://bugs.python.org/issue13216
443029011990 encoding: alias cp65001 to utf-8 on Windows
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   101
if pycompat.iswindows and not pycompat.ispy3:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   102
    _encodingrewrites[b'cp65001'] = b'utf-8'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   103
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   104
try:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   105
    encoding = environ.get(b"HGENCODING")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   106
    if not encoding:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   107
        encoding = locale.getpreferredencoding().encode('ascii') or b'ascii'
39839
9e8fcd2e78c1 encoding: remove unnecessary lambdas from _encodingfixers
Martin von Zweigbergk <martinvonz@google.com>
parents: 39819
diff changeset
   108
        encoding = _encodingrewrites.get(encoding, encoding)
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   109
except locale.Error:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   110
    encoding = b'ascii'
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   111
encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   112
fallbackencoding = b'ISO-8859-1'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   113
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   114
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
   115
class localstr(bytes):
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   116
    '''This class allows strings that are unmodified to be
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   117
    round-tripped to the local encoding and back'''
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   118
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   119
    def __new__(cls, u, l):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   120
        # type: (Type[_Tlocalstr], Text, bytes) -> _Tlocalstr
33811
dabe1f11ae3a py3: change encoding.localstr to a subclass of bytes, not str
Yuya Nishihara <yuya@tcha.org>
parents: 33761
diff changeset
   121
        s = bytes.__new__(cls, l)
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   122
        s._utf8 = u
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   123
        return s
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   124
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   125
    def __hash__(self):
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   126
        return hash(self._utf8)  # avoid collisions in local string space
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   127
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   128
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   129
class safelocalstr(bytes):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   130
    """Tagged string denoting it was previously an internal UTF-8 string,
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   131
    and can be converted back to UTF-8 losslessly
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   132
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   133
    >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   134
    >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   135
    >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   136
    >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   137
    """
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   138
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   139
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   140
def tolocal(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   141
    # type: (Text) -> bytes
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   142
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   143
    Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   144
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   145
    All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   146
    implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   147
    other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   148
    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   149
    replace unknown characters.
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   150
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   151
    The localstr class is used to cache the known UTF-8 encoding of
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   152
    strings next to their local representation to allow lossless
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   153
    round-trip conversion back to UTF-8.
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   154
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   155
    >>> u = b'foo: \\xc3\\xa4' # utf-8
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   156
    >>> l = tolocal(u)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   157
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   158
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   159
    >>> fromlocal(l)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   160
    'foo: \\xc3\\xa4'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   161
    >>> u2 = b'foo: \\xc3\\xa1'
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   162
    >>> d = { l: 1, tolocal(u2): 2 }
18378
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   163
    >>> len(d) # no collision
404feac78b8a tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents: 17424
diff changeset
   164
    2
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   165
    >>> b'foo: ?' in d
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   166
    False
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   167
    >>> l1 = b'foo: \\xe4' # historical latin1 fallback
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   168
    >>> l = tolocal(l1)
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   169
    >>> l
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   170
    'foo: ?'
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   171
    >>> fromlocal(l) # magically in utf-8
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   172
    'foo: \\xc3\\xa4'
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   173
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   174
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   175
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   176
        return s
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   177
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   178
    try:
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   179
        try:
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   180
            # make sure string is actually stored in UTF-8
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   181
            u = s.decode('UTF-8')
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   182
            if encoding == b'UTF-8':
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   183
                # fast path
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   184
                return s
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   185
            r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   186
            if u == r.decode(_sysstr(encoding)):
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   187
                # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   188
                return safelocalstr(r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   189
            return localstr(s, r)
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   190
        except UnicodeDecodeError:
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   191
            # we should only get here if we're looking at an ancient changeset
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   192
            try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   193
                u = s.decode(_sysstr(fallbackencoding))
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   194
                r = u.encode(_sysstr(encoding), "replace")
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   195
                if u == r.decode(_sysstr(encoding)):
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   196
                    # r is a safe, non-lossy encoding of s
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   197
                    return safelocalstr(r)
13940
b7b26e54e37a encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents: 13051
diff changeset
   198
                return localstr(u.encode('UTF-8'), r)
16274
5d75eb8568d1 encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents: 16133
diff changeset
   199
            except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   200
                u = s.decode("utf-8", "replace")  # last ditch
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   201
                # can't round-trip
43503
313e3a279828 cleanup: remove pointless r-prefixes on double-quoted strings
Augie Fackler <augie@google.com>
parents: 43496
diff changeset
   202
                return u.encode(_sysstr(encoding), "replace")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   203
    except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   204
        raise error.Abort(k, hint=b"please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   205
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   206
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   207
def fromlocal(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   208
    # type: (bytes) -> Text
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   209
    """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   210
    Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   211
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   212
    We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   213
    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   214
    characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   215
    'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   216
    Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   217
    """
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   218
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   219
    # can we do a lossless round-trip?
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   220
    if isinstance(s, localstr):
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   221
        return s._utf8
33927
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   222
    if isasciistr(s):
853574db5b12 encoding: add fast path of from/tolocal() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33926
diff changeset
   223
        return s
13046
7cc4263e07a9 encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents: 12866
diff changeset
   224
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   225
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   226
        u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   227
        return u.encode("utf-8")
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   228
    except UnicodeDecodeError as inst:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   229
        sub = s[max(0, inst.start - 10) : inst.start + 10]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   230
        raise error.Abort(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   231
            b"decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   232
        )
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   233
    except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   234
        raise error.Abort(k, hint=b"please check your locale settings")
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   235
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   236
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   237
def unitolocal(u):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   238
    # type: (Text) -> bytes
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   239
    """Convert a unicode string to a byte string of local encoding"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   240
    return tolocal(u.encode('utf-8'))
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   241
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   242
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   243
def unifromlocal(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   244
    # type: (bytes) -> Text
31447
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   245
    """Convert a byte string of local encoding to a unicode string"""
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   246
    return fromlocal(s).decode('utf-8')
067add650129 encoding: factor out unicode variants of from/tolocal()
Yuya Nishihara <yuya@tcha.org>
parents: 30622
diff changeset
   247
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   248
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   249
def unimethod(bytesfunc):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   250
    # type: (Callable[[Any], bytes]) -> Callable[[Any], Text]
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   251
    """Create a proxy method that forwards __unicode__() and __str__() of
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   252
    Python 3 to __bytes__()"""
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   253
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   254
    def unifunc(obj):
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   255
        return unifromlocal(bytesfunc(obj))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   256
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   257
    return unifunc
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   258
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   259
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   260
# converter functions between native str and byte string. use these if the
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   261
# character encoding is not aware (e.g. exception message) or is known to
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   262
# be locale dependent (e.g. date formatting.)
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   263
if pycompat.ispy3:
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   264
    strtolocal = unitolocal
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   265
    strfromlocal = unifromlocal
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   266
    strmethod = unimethod
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   267
else:
43469
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   268
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   269
    def strtolocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   270
        # type: (str) -> bytes
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   271
        return s
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   272
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   273
    def strfromlocal(s):
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   274
        # type: (bytes) -> str
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   275
        return s
5f2a8dabb0d8 encoding: define local identify functions with explicit type comments
Augie Fackler <augie@google.com>
parents: 43089
diff changeset
   276
33022
ce96efec8112 py3: add utility to forward __str__() to __bytes__()
Yuya Nishihara <yuya@tcha.org>
parents: 32537
diff changeset
   277
    strmethod = pycompat.identity
31448
6419cd243017 encoding: add converter between native str and byte string
Yuya Nishihara <yuya@tcha.org>
parents: 31447
diff changeset
   278
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   279
if not _nativeenviron:
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   280
    # now encoding and helper functions are available, recreate the environ
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   281
    # dict to be exported to other modules
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   282
    environ = dict(
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
   283
        (tolocal(k.encode('utf-8')), tolocal(v.encode('utf-8')))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   284
        for k, v in os.environ.items()  # re-exports
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   285
    )
30034
e4a6b439acc5 py3: provide encoding.environ which is a dict of bytes
Yuya Nishihara <yuya@tcha.org>
parents: 30033
diff changeset
   286
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   287
if pycompat.ispy3:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   288
    # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   289
    # returns bytes.
39819
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   290
    if pycompat.iswindows:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   291
        # Python 3 on Windows issues a DeprecationWarning about using the bytes
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   292
        # API when os.getcwdb() is called.
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   293
        getcwd = lambda: strtolocal(os.getcwd())  # re-exports
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   294
    else:
fb628c048d64 py3: don't use os.getcwdb() on Windows to avoid DeprecationWarnings
Matt Harbison <matt_harbison@yahoo.com>
parents: 39818
diff changeset
   295
        getcwd = os.getcwdb  # re-exports
39818
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   296
else:
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   297
    getcwd = os.getcwd  # re-exports
24e493ec2229 py3: rename pycompat.getcwd() to encoding.getcwd() (API)
Matt Harbison <matt_harbison@yahoo.com>
parents: 38783
diff changeset
   298
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   299
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   300
_wide = _sysstr(
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   301
    environ.get(b"HGENCODINGAMBIGUOUS", b"narrow") == b"wide"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   302
    and b"WFA"
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   303
    or b"WF"
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   304
)
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   305
12866
eddc20306ab6 encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents: 12770
diff changeset
   306
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   307
def colwidth(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   308
    # type: (bytes) -> int
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   309
    b"Find the column width of a string for display in the local encoding"
43506
9f70512ae2cf cleanup: remove pointless r-prefixes on single-quoted strings
Augie Fackler <augie@google.com>
parents: 43503
diff changeset
   310
    return ucolwidth(s.decode(_sysstr(encoding), 'replace'))
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   311
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   312
15066
24efa83d81cb i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 14951
diff changeset
   313
def ucolwidth(d):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   314
    # type: (Text) -> int
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   315
    b"Find the column width of a Unicode string for display"
14951
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   316
    eaw = getattr(unicodedata, 'east_asian_width', None)
61807854004e encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents: 14069
diff changeset
   317
    if eaw is not None:
32537
044f3d7eb9ae encoding: make sure "wide" variable never be referenced from other modules
Yuya Nishihara <yuya@tcha.org>
parents: 32529
diff changeset
   318
        return sum([eaw(c) in _wide and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   319
    return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
   320
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   321
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   322
def getcols(s, start, c):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   323
    # type: (bytes, int, int) -> bytes
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   324
    '''Use colwidth to find a c-column substring of s starting at byte
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   325
    index start'''
38783
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
   326
    for x in pycompat.xrange(start + c, len(s)):
15143
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   327
        t = s[start:x]
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   328
        if colwidth(t) == c:
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   329
            return t
16c129b0f465 encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents: 15142
diff changeset
   330
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   331
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   332
def trim(s, width, ellipsis=b'', leftside=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   333
    # type: (bytes, int, bytes, bool) -> bytes
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   334
    """Trim string 's' to at most 'width' columns (including 'ellipsis').
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   335
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   336
    If 'leftside' is True, left side of string 's' is trimmed.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   337
    'ellipsis' is always placed at trimmed side.
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   338
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
   339
    >>> from .node import bin
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   340
    >>> def bprint(s):
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   341
    ...     print(pycompat.sysstr(s))
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   342
    >>> ellipsis = b'+++'
27355
b479fc425a81 encoding: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 26963
diff changeset
   343
    >>> from . import encoding
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   344
    >>> encoding.encoding = b'utf-8'
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   345
    >>> t = b'1234567890'
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   346
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   347
    1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   348
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   349
    1234567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   350
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   351
    12345+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   352
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   353
    +++67890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   354
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   355
    12345678
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   356
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   357
    34567890
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   358
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   359
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   360
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   361
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   362
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
34135
e9e225f16932 doctest: pass encoding name as system string
Yuya Nishihara <yuya@tcha.org>
parents: 34131
diff changeset
   363
    >>> t = u.encode(pycompat.sysstr(encoding.encoding))
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   364
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   365
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   366
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   367
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   368
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   369
    \xe3\x81\x82\xe3\x81\x84+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   370
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   371
    +++\xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   372
    >>> bprint(trim(t, 5))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   373
    \xe3\x81\x82\xe3\x81\x84
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   374
    >>> bprint(trim(t, 5, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   375
    \xe3\x81\x88\xe3\x81\x8a
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   376
    >>> bprint(trim(t, 4, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   377
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   378
    >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   379
    +++
34136
414a3513c2bd doctest: do not embed non-ascii characters in docstring
Yuya Nishihara <yuya@tcha.org>
parents: 34135
diff changeset
   380
    >>> t = bin(b'112233445566778899aa') # invalid byte sequence
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   381
    >>> bprint(trim(t, 12, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   382
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   383
    >>> bprint(trim(t, 10, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   384
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   385
    >>> bprint(trim(t, 8, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   386
    \x11\x22\x33\x44\x55+++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   387
    >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   388
    +++\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   389
    >>> bprint(trim(t, 8))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   390
    \x11\x22\x33\x44\x55\x66\x77\x88
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   391
    >>> bprint(trim(t, 8, leftside=True))
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   392
    \x33\x44\x55\x66\x77\x88\x99\xaa
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   393
    >>> bprint(trim(t, 3, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   394
    +++
34137
a8994d08e4a2 doctest: use print_function and convert bytes to unicode where needed
Yuya Nishihara <yuya@tcha.org>
parents: 34136
diff changeset
   395
    >>> bprint(trim(t, 1, ellipsis=ellipsis))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   396
    +
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   397
    """
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   398
    try:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   399
        u = s.decode(_sysstr(encoding))
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   400
    except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   401
        if len(s) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   402
            return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   403
        width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   404
        if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   405
            return ellipsis[: width + len(ellipsis)]
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   406
        if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   407
            return ellipsis + s[-width:]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   408
        return s[:width] + ellipsis
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   409
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   410
    if ucolwidth(u) <= width:  # trimming is not needed
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   411
        return s
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   412
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   413
    width -= len(ellipsis)
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   414
    if width <= 0:  # no enough room even for ellipsis
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   415
        return ellipsis[: width + len(ellipsis)]
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   416
21861
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   417
    if leftside:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   418
        uslice = lambda i: u[i:]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   419
        concat = lambda s: ellipsis + s
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   420
    else:
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   421
        uslice = lambda i: u[:-i]
b515c3a63e96 encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 21856
diff changeset
   422
        concat = lambda s: s + ellipsis
38783
e7aa113b14f7 global: use pycompat.xrange()
Gregory Szorc <gregory.szorc@gmail.com>
parents: 38739
diff changeset
   423
    for i in pycompat.xrange(1, len(u)):
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   424
        usub = uslice(i)
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   425
        if ucolwidth(usub) <= width:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   426
            return concat(usub.encode(_sysstr(encoding)))
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   427
    return ellipsis  # no enough room for multi-column characters
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   428
21856
d24969ee272f encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 18378
diff changeset
   429
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   430
def lower(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   431
    # type: (bytes) -> bytes
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   432
    b"best-effort encoding-aware case-folding of local string s"
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   433
    try:
22779
d9585dda63c3 encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents: 22778
diff changeset
   434
        return asciilower(s)
17235
3745ae495ce5 encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents: 16493
diff changeset
   435
    except UnicodeDecodeError:
16387
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
   436
        pass
c481761033bd encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents: 16274
diff changeset
   437
    try:
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   438
        if isinstance(s, localstr):
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   439
            u = s._utf8.decode("utf-8")
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   440
        else:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   441
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   442
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   443
        lu = u.lower()
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   444
        if u == lu:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   445
            return s  # preserve localstring
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   446
        return lu.encode(_sysstr(encoding))
14069
e38846a79a23 encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents: 13940
diff changeset
   447
    except UnicodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   448
        return s.lower()  # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   449
    except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   450
        raise error.Abort(k, hint=b"please check your locale settings")
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   451
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   452
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   453
def upper(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   454
    # type: (bytes) -> bytes
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   455
    b"best-effort encoding-aware case-folding of local string s"
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   456
    try:
24578
ac08de78de7f encoding: use parsers.asciiupper when available
Siddharth Agarwal <sid0@fb.com>
parents: 23596
diff changeset
   457
        return asciiupper(s)
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
   458
    except UnicodeDecodeError:
24597
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   459
        return upperfallback(s)
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   460
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   461
24597
b4258d5a1600 encoding.upper: factor out fallback code
Siddharth Agarwal <sid0@fb.com>
parents: 24593
diff changeset
   462
def upperfallback(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   463
    # type: (Any) -> Any
17236
9fb8312dbdbd encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents: 17235
diff changeset
   464
    try:
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   465
        if isinstance(s, localstr):
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   466
            u = s._utf8.decode("utf-8")
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   467
        else:
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   468
            u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   469
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   470
        uu = u.upper()
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   471
        if u == uu:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   472
            return s  # preserve localstring
30033
02dbfaa6df0b py3: convert encoding name and mode to str
Yuya Nishihara <yuya@tcha.org>
parents: 30030
diff changeset
   473
        return uu.encode(_sysstr(encoding))
15672
2ebe3d0ce91d i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 15143
diff changeset
   474
    except UnicodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   475
        return s.upper()  # we don't know how to fold this except in ASCII
25660
328739ea70c3 global: mass rewrite to use modern exception syntax
Gregory Szorc <gregory.szorc@gmail.com>
parents: 24608
diff changeset
   476
    except LookupError as k:
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   477
        raise error.Abort(k, hint=b"please check your locale settings")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   478
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   479
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   480
class normcasespecs(object):
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   481
    '''what a platform's normcase does to ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   482
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   483
    This is specified per platform, and should be consistent with what normcase
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   484
    on that platform actually does.
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   485
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   486
    lower: normcase lowercases ASCII strings
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   487
    upper: normcase uppercases ASCII strings
24608
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   488
    other: the fallback function should always be called
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   489
1c533e23ce95 util.h: define an enum for normcase specs
Siddharth Agarwal <sid0@fb.com>
parents: 24597
diff changeset
   490
    This should be kept in sync with normcase_spec in util.h.'''
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   491
24593
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   492
    lower = -1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   493
    upper = 1
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   494
    other = 0
f473a1fe5c7c encoding: define an enum that specifies what normcase does to ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents: 24578
diff changeset
   495
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   496
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   497
def jsonescape(s, paranoid=False):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   498
    # type: (Any, Any) -> Any
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   499
    '''returns a string suitable for JSON
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   500
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   501
    JSON is problematic for us because it doesn't support non-Unicode
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   502
    bytes. To deal with this, we take the following approach:
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   503
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   504
    - localstr/safelocalstr objects are converted back to UTF-8
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   505
    - valid UTF-8/ASCII strings are passed as-is
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   506
    - other strings are converted to UTF-8b surrogate encoding
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   507
    - apply JSON-specified string escaping
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   508
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   509
    (escapes are doubled in these tests)
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   510
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   511
    >>> jsonescape(b'this is a test')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   512
    'this is a test'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   513
    >>> jsonescape(b'escape characters: \\0 \\x0b \\x7f')
27881
ffa599f3f503 encoding: escape U+007F (DEL) character in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 27699
diff changeset
   514
    'escape characters: \\\\u0000 \\\\u000b \\\\u007f'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   515
    >>> jsonescape(b'escape characters: \\b \\t \\n \\f \\r \\" \\\\')
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   516
    'escape characters: \\\\b \\\\t \\\\n \\\\f \\\\r \\\\" \\\\\\\\'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   517
    >>> jsonescape(b'a weird byte: \\xdd')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   518
    'a weird byte: \\xed\\xb3\\x9d'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   519
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   520
    'utf-8: caf\\xc3\\xa9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   521
    >>> jsonescape(b'')
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   522
    ''
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   523
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   524
    If paranoid, non-ascii and common troublesome characters are also escaped.
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   525
    This is suitable for web output.
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   526
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   527
    >>> s = b'escape characters: \\0 \\x0b \\x7f'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   528
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   529
    >>> s = b'escape characters: \\b \\t \\n \\f \\r \\" \\\\'
33925
2c37f9dabc32 encoding: add fast path of jsonescape() (issue5533)
Yuya Nishihara <yuya@tcha.org>
parents: 33924
diff changeset
   530
    >>> assert jsonescape(s) == jsonescape(s, paranoid=True)
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   531
    >>> jsonescape(b'escape boundary: \\x7e \\x7f \\xc2\\x80', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   532
    'escape boundary: ~ \\\\u007f \\\\u0080'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   533
    >>> jsonescape(b'a weird byte: \\xdd', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   534
    'a weird byte: \\\\udcdd'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   535
    >>> jsonescape(b'utf-8: caf\\xc3\\xa9', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   536
    'utf-8: caf\\\\u00e9'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   537
    >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   538
    'non-BMP: \\\\ud834\\\\udd1e'
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   539
    >>> jsonescape(b'<foo@example.org>', paranoid=True)
28069
b2d24c2898f9 encoding: backport paranoid escaping from templatefilters.jsonescape()
Yuya Nishihara <yuya@tcha.org>
parents: 28068
diff changeset
   540
    '\\\\u003cfoo@example.org\\\\u003e'
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   541
    '''
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   542
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   543
    u8chars = toutf8b(s)
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   544
    try:
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   545
        return _jsonescapeu8fast(u8chars, paranoid)
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   546
    except ValueError:
28068
9ece901f7a19 encoding: add option to escape non-ascii characters in JSON
Yuya Nishihara <yuya@tcha.org>
parents: 28067
diff changeset
   547
        pass
33924
b9101467d88b encoding: extract stub for fast JSON escape
Yuya Nishihara <yuya@tcha.org>
parents: 33852
diff changeset
   548
    return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
22426
f6b533e64ed6 encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents: 22425
diff changeset
   549
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   550
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   551
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   552
# bytes are mapped to that range.
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   553
if pycompat.ispy3:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   554
    _utf8strict = r'surrogatepass'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   555
else:
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   556
    _utf8strict = r'strict'
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   557
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   558
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   559
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   560
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   561
def getutf8char(s, pos):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   562
    # type: (Any, Any) -> Any
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   563
    '''get the next full utf-8 character in the given string, starting at pos
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   564
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   565
    Raises a UnicodeError if the given location does not start a valid
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   566
    utf-8 character.
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   567
    '''
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   568
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   569
    # find how many bytes to attempt decoding from first nibble
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   570
    l = _utf8len[ord(s[pos : pos + 1]) >> 4]
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   571
    if not l:  # ascii
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   572
        return s[pos : pos + 1]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   573
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   574
    c = s[pos : pos + l]
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   575
    # validate with attempted decode
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   576
    c.decode("utf-8", _utf8strict)
26875
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   577
    return c
cf47bdb2183c encoding: add getutf8char helper
Matt Mackall <mpm@selenic.com>
parents: 25660
diff changeset
   578
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   579
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   580
def toutf8b(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   581
    # type: (Any) -> Any
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   582
    '''convert a local, possibly-binary string into UTF-8b
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   583
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   584
    This is intended as a generic method to preserve data when working
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   585
    with schemes like JSON and XML that have no provision for
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   586
    arbitrary byte strings. As Mercurial often doesn't know
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   587
    what encoding data is in, we use so-called UTF-8b.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   588
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   589
    If a string is already valid UTF-8 (or ASCII), it passes unmodified.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   590
    Otherwise, unsupported bytes are mapped to UTF-16 surrogate range,
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   591
    uDC00-uDCFF.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   592
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   593
    Principles of operation:
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   594
17424
e7cfe3587ea4 fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents: 17236
diff changeset
   595
    - ASCII and UTF-8 data successfully round-trips and is understood
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   596
      by Unicode-oriented clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   597
    - filenames and file contents in arbitrary other encodings can have
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   598
      be round-tripped or recovered by clueful clients
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   599
    - local strings that have a cached known UTF-8 encoding (aka
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   600
      localstr) get sent as UTF-8 so Unicode-oriented clients get the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   601
      Unicode data they want
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   602
    - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   603
    - because we must preserve UTF-8 bytestring in places such as
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   604
      filenames, metadata can't be roundtripped without help
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   605
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   606
    (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   607
    arbitrary bytes into an internal Unicode format that can be
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   608
    re-encoded back into the original. Here we are exposing the
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   609
    internal surrogate encoding as a UTF-8 string.)
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   610
    '''
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   611
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   612
    if isinstance(s, localstr):
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   613
        # assume that the original UTF-8 sequence would never contain
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   614
        # invalid characters in U+DCxx range
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   615
        return s._utf8
37947
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   616
    elif isinstance(s, safelocalstr):
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   617
        # already verified that s is non-lossy in legacy encoding, which
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   618
        # shouldn't contain characters in U+DCxx range
3ea3c96ada54 encoding: introduce tagging type for non-lossy non-ASCII string
Yuya Nishihara <yuya@tcha.org>
parents: 37946
diff changeset
   619
        return fromlocal(s)
37946
57b0c7221dba encoding: fix toutf8b() to resurrect lossy characters even if "\xed" in it
Yuya Nishihara <yuya@tcha.org>
parents: 36797
diff changeset
   620
    elif isasciistr(s):
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   621
        return s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   622
    if b"\xed" not in s:
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   623
        try:
34218
aa877860d4d7 py3: use 'surrogatepass' error handler to process U+DCxx transparently
Yuya Nishihara <yuya@tcha.org>
parents: 34216
diff changeset
   624
            s.decode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   625
            return s
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   626
        except UnicodeDecodeError:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   627
            pass
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   628
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
   629
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   630
    r = b""
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   631
    pos = 0
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   632
    l = len(s)
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   633
    while pos < l:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   634
        try:
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   635
            c = getutf8char(s, pos)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   636
            if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   637
                # have to re-escape existing U+DCxx characters
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   638
                c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26879
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   639
                pos += 1
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   640
            else:
a24b98f4e03c encoding: re-escape U+DCxx characters in toutf8b input (issue4927)
Matt Mackall <mpm@selenic.com>
parents: 26878
diff changeset
   641
                pos += len(c)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   642
        except UnicodeDecodeError:
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   643
            c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
26878
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   644
            pos += 1
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   645
        r += c
d7e83f106459 encoding: use getutf8char in toutf8b
Matt Mackall <mpm@selenic.com>
parents: 26877
diff changeset
   646
    return r
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   647
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   648
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   649
def fromutf8b(s):
43496
2ade00f3b03b encoding: add comment-based type hints for pytype
Augie Fackler <augie@google.com>
parents: 43469
diff changeset
   650
    # type: (Text) -> bytes
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   651
    '''Given a UTF-8b string, return a local, possibly-binary string.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   652
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   653
    return the original binary string. This
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   654
    is a round-trip process for strings like filenames, but metadata
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   655
    that's was passed through tolocal will remain in UTF-8.
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   656
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   657
    >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   658
    >>> m = b"\\xc3\\xa9\\x99abcd"
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   659
    >>> toutf8b(m)
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   660
    '\\xc3\\xa9\\xed\\xb2\\x99abcd'
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   661
    >>> roundtrip(m)
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   662
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   663
    >>> roundtrip(b"\\xc2\\xc2\\x80")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   664
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   665
    >>> roundtrip(b"\\xef\\xbf\\xbd")
26963
de5ae97ce9f4 encoding: extend test cases for utf8b
Matt Mackall <mpm@selenic.com>
parents: 26879
diff changeset
   666
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   667
    >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   668
    True
34131
0fa781320203 doctest: bulk-replace string literals with b'' for Python 3
Yuya Nishihara <yuya@tcha.org>
parents: 33928
diff changeset
   669
    >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   670
    True
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   671
    '''
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   672
33928
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   673
    if isasciistr(s):
6c119dbfd0c0 encoding: add fast path of from/toutf8b() for ASCII strings
Yuya Nishihara <yuya@tcha.org>
parents: 33927
diff changeset
   674
        return s
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   675
    # fast path - look for uDxxx prefixes in s
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   676
    if b"\xed" not in s:
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   677
        return s
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   678
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   679
    # We could do this with the unicode type but some Python builds
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   680
    # use UTF-16 internally (issue5031) which causes non-BMP code
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   681
    # points to be escaped. Instead, we use our handy getutf8char
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   682
    # helper again to walk the string without "decoding" it.
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   683
34216
1c601df9894c py3: wrap bytes in encoding.from/toutf8b() with bytestr
Yuya Nishihara <yuya@tcha.org>
parents: 34200
diff changeset
   684
    s = pycompat.bytestr(s)
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   685
    r = b""
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   686
    pos = 0
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   687
    l = len(s)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   688
    while pos < l:
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   689
        c = getutf8char(s, pos)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   690
        pos += len(c)
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   691
        # unescape U+DCxx characters
43077
687b865b95ad formatting: byteify all mercurial/ and hgext/ string literals
Augie Fackler <augie@google.com>
parents: 43076
diff changeset
   692
        if b"\xed\xb0\x80" <= c <= b"\xed\xb3\xbf":
43076
2372284d9457 formatting: blacken the codebase
Augie Fackler <augie@google.com>
parents: 41836
diff changeset
   693
            c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
27699
c8d3392f76e1 encoding: handle UTF-16 internal limit with fromutf8b (issue5031)
Matt Mackall <mpm@selenic.com>
parents: 27356
diff changeset
   694
        r += c
16133
84c58da3a1f8 encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents: 15769
diff changeset
   695
    return r