--- a/mercurial/encoding.py Sat Oct 05 10:29:34 2019 -0400
+++ b/mercurial/encoding.py Sun Oct 06 09:45:02 2019 -0400
@@ -17,9 +17,7 @@
pycompat,
)
-from .pure import (
- charencode as charencodepure,
-)
+from .pure import charencode as charencodepure
charencode = policy.importmod(r'charencode')
@@ -36,12 +34,15 @@
# These unicode characters are ignored by HFS+ (Apple Technote 1150,
# "Unicode Subtleties"), so we need to ignore them in some places for
# sanity.
-_ignore = [unichr(int(x, 16)).encode("utf-8") for x in
- "200c 200d 200e 200f 202a 202b 202c 202d 202e "
- "206a 206b 206c 206d 206e 206f feff".split()]
+_ignore = [
+ unichr(int(x, 16)).encode("utf-8")
+ for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
+ "206a 206b 206c 206d 206e 206f feff".split()
+]
# verify the next function will work
assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
+
def hfsignoreclean(s):
"""Remove codepoints ignored by HFS+ from s.
@@ -55,9 +56,10 @@
s = s.replace(c, '')
return s
+
# encoding.environ is provided read-only, which may not be used to modify
# the process environment
-_nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ)
+_nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
if not pycompat.ispy3:
environ = os.environ # re-exports
elif _nativeenviron:
@@ -65,8 +67,10 @@
else:
# preferred encoding isn't known yet; use utf-8 to avoid unicode error
# and recreate it once encoding is settled
- environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8'))
- for k, v in os.environ.items()) # re-exports
+ environ = dict(
+ (k.encode(r'utf-8'), v.encode(r'utf-8'))
+ for k, v in os.environ.items() # re-exports
+ )
_encodingrewrites = {
'646': 'ascii',
@@ -88,15 +92,19 @@
encodingmode = environ.get("HGENCODINGMODE", "strict")
fallbackencoding = 'ISO-8859-1'
+
class localstr(bytes):
'''This class allows strings that are unmodified to be
round-tripped to the local encoding and back'''
+
def __new__(cls, u, l):
s = bytes.__new__(cls, l)
s._utf8 = u
return s
+
def __hash__(self):
- return hash(self._utf8) # avoid collisions in local string space
+ return hash(self._utf8) # avoid collisions in local string space
+
class safelocalstr(bytes):
"""Tagged string denoting it was previously an internal UTF-8 string,
@@ -108,6 +116,7 @@
>>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
"""
+
def tolocal(s):
"""
Convert a string from internal UTF-8 to local encoding
@@ -167,12 +176,13 @@
return safelocalstr(r)
return localstr(u.encode('UTF-8'), r)
except UnicodeDecodeError:
- u = s.decode("utf-8", "replace") # last ditch
+ u = s.decode("utf-8", "replace") # last ditch
# can't round-trip
return u.encode(_sysstr(encoding), r"replace")
except LookupError as k:
raise error.Abort(k, hint="please check your locale settings")
+
def fromlocal(s):
"""
Convert a string from the local character encoding to UTF-8
@@ -194,27 +204,34 @@
u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
return u.encode("utf-8")
except UnicodeDecodeError as inst:
- sub = s[max(0, inst.start - 10):inst.start + 10]
- raise error.Abort("decoding near '%s': %s!"
- % (sub, pycompat.bytestr(inst)))
+ sub = s[max(0, inst.start - 10) : inst.start + 10]
+ raise error.Abort(
+ "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
+ )
except LookupError as k:
raise error.Abort(k, hint="please check your locale settings")
+
def unitolocal(u):
"""Convert a unicode string to a byte string of local encoding"""
return tolocal(u.encode('utf-8'))
+
def unifromlocal(s):
"""Convert a byte string of local encoding to a unicode string"""
return fromlocal(s).decode('utf-8')
+
def unimethod(bytesfunc):
"""Create a proxy method that forwards __unicode__() and __str__() of
Python 3 to __bytes__()"""
+
def unifunc(obj):
return unifromlocal(bytesfunc(obj))
+
return unifunc
+
# converter functions between native str and byte string. use these if the
# character encoding is not aware (e.g. exception message) or is known to
# be locale dependent (e.g. date formatting.)
@@ -230,8 +247,10 @@
if not _nativeenviron:
# now encoding and helper functions are available, recreate the environ
# dict to be exported to other modules
- environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
- for k, v in os.environ.items()) # re-exports
+ environ = dict(
+ (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
+ for k, v in os.environ.items() # re-exports
+ )
if pycompat.ispy3:
# os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
@@ -246,13 +265,16 @@
getcwd = os.getcwd # re-exports
# How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
-_wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide"
- and "WFA" or "WF")
+_wide = _sysstr(
+ environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
+)
+
def colwidth(s):
"Find the column width of a string for display in the local encoding"
return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
+
def ucolwidth(d):
"Find the column width of a Unicode string for display"
eaw = getattr(unicodedata, 'east_asian_width', None)
@@ -260,6 +282,7 @@
return sum([eaw(c) in _wide and 2 or 1 for c in d])
return len(d)
+
def getcols(s, start, c):
'''Use colwidth to find a c-column substring of s starting at byte
index start'''
@@ -268,6 +291,7 @@
if colwidth(t) == c:
return t
+
def trim(s, width, ellipsis='', leftside=False):
"""Trim string 's' to at most 'width' columns (including 'ellipsis').
@@ -336,21 +360,21 @@
try:
u = s.decode(_sysstr(encoding))
except UnicodeDecodeError:
- if len(s) <= width: # trimming is not needed
+ if len(s) <= width: # trimming is not needed
return s
width -= len(ellipsis)
- if width <= 0: # no enough room even for ellipsis
- return ellipsis[:width + len(ellipsis)]
+ if width <= 0: # not enough room even for ellipsis
+ return ellipsis[: width + len(ellipsis)]
if leftside:
return ellipsis + s[-width:]
return s[:width] + ellipsis
- if ucolwidth(u) <= width: # trimming is not needed
+ if ucolwidth(u) <= width: # trimming is not needed
return s
width -= len(ellipsis)
- if width <= 0: # no enough room even for ellipsis
- return ellipsis[:width + len(ellipsis)]
+ if width <= 0: # not enough room even for ellipsis
+ return ellipsis[: width + len(ellipsis)]
if leftside:
uslice = lambda i: u[i:]
@@ -362,7 +386,8 @@
usub = uslice(i)
if ucolwidth(usub) <= width:
return concat(usub.encode(_sysstr(encoding)))
- return ellipsis # no enough room for multi-column characters
+ return ellipsis # not enough room for multi-column characters
+
def lower(s):
"best-effort encoding-aware case-folding of local string s"
@@ -378,13 +403,14 @@
lu = u.lower()
if u == lu:
- return s # preserve localstring
+ return s # preserve localstring
return lu.encode(_sysstr(encoding))
except UnicodeError:
- return s.lower() # we don't know how to fold this except in ASCII
+ return s.lower() # we don't know how to fold this except in ASCII
except LookupError as k:
raise error.Abort(k, hint="please check your locale settings")
+
def upper(s):
"best-effort encoding-aware case-folding of local string s"
try:
@@ -392,6 +418,7 @@
except UnicodeDecodeError:
return upperfallback(s)
+
def upperfallback(s):
try:
if isinstance(s, localstr):
@@ -401,13 +428,14 @@
uu = u.upper()
if u == uu:
- return s # preserve localstring
+ return s # preserve localstring
return uu.encode(_sysstr(encoding))
except UnicodeError:
- return s.upper() # we don't know how to fold this except in ASCII
+ return s.upper() # we don't know how to fold this except in ASCII
except LookupError as k:
raise error.Abort(k, hint="please check your locale settings")
+
class normcasespecs(object):
'''what a platform's normcase does to ASCII strings
@@ -419,10 +447,12 @@
other: the fallback function should always be called
This should be kept in sync with normcase_spec in util.h.'''
+
lower = -1
upper = 1
other = 0
+
def jsonescape(s, paranoid=False):
'''returns a string suitable for JSON
@@ -475,6 +505,7 @@
pass
return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
+
# We need to decode/encode U+DCxx codes transparently since invalid UTF-8
# bytes are mapped to that range.
if pycompat.ispy3:
@@ -484,6 +515,7 @@
_utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
+
def getutf8char(s, pos):
'''get the next full utf-8 character in the given string, starting at pos
@@ -492,15 +524,16 @@
'''
# find how many bytes to attempt decoding from first nibble
- l = _utf8len[ord(s[pos:pos + 1]) >> 4]
- if not l: # ascii
- return s[pos:pos + 1]
+ l = _utf8len[ord(s[pos : pos + 1]) >> 4]
+ if not l: # ascii
+ return s[pos : pos + 1]
- c = s[pos:pos + l]
+ c = s[pos : pos + l]
# validate with attempted decode
c.decode("utf-8", _utf8strict)
return c
+
def toutf8b(s):
'''convert a local, possibly-binary string into UTF-8b
@@ -558,16 +591,17 @@
c = getutf8char(s, pos)
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
# have to re-escape existing U+DCxx characters
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
+ c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
else:
pos += len(c)
except UnicodeDecodeError:
- c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict)
+ c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
pos += 1
r += c
return r
+
def fromutf8b(s):
'''Given a UTF-8b string, return a local, possibly-binary string.
@@ -611,6 +645,6 @@
pos += len(c)
# unescape U+DCxx characters
if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
- c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff)
+ c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
r += c
return r