comparison mercurial/encoding.py @ 43076:2372284d9457

formatting: blacken the codebase

This is using my patch to black (https://github.com/psf/black/pull/826) so we don't un-wrap collection literals.

Done with:

  hg files 'set:**.py - mercurial/thirdparty/** - "contrib/python-zstandard/**"' | xargs black -S

# skip-blame mass-reformatting only

# no-check-commit reformats foo_bar functions

Differential Revision: https://phab.mercurial-scm.org/D6971
author Augie Fackler <augie@google.com>
date Sun, 06 Oct 2019 09:45:02 -0400
parents 25694a78e4a4
children 687b865b95ad
comparison
equal deleted inserted replaced
43075:57875cf423c9 43076:2372284d9457
15 error, 15 error,
16 policy, 16 policy,
17 pycompat, 17 pycompat,
18 ) 18 )
19 19
20 from .pure import ( 20 from .pure import charencode as charencodepure
21 charencode as charencodepure,
22 )
23 21
24 charencode = policy.importmod(r'charencode') 22 charencode = policy.importmod(r'charencode')
25 23
26 isasciistr = charencode.isasciistr 24 isasciistr = charencode.isasciistr
27 asciilower = charencode.asciilower 25 asciilower = charencode.asciilower
34 unichr = chr 32 unichr = chr
35 33
36 # These unicode characters are ignored by HFS+ (Apple Technote 1150, 34 # These unicode characters are ignored by HFS+ (Apple Technote 1150,
37 # "Unicode Subtleties"), so we need to ignore them in some places for 35 # "Unicode Subtleties"), so we need to ignore them in some places for
38 # sanity. 36 # sanity.
39 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in 37 _ignore = [
40 "200c 200d 200e 200f 202a 202b 202c 202d 202e " 38 unichr(int(x, 16)).encode("utf-8")
41 "206a 206b 206c 206d 206e 206f feff".split()] 39 for x in "200c 200d 200e 200f 202a 202b 202c 202d 202e "
40 "206a 206b 206c 206d 206e 206f feff".split()
41 ]
42 # verify the next function will work 42 # verify the next function will work
43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore) 43 assert all(i.startswith(("\xe2", "\xef")) for i in _ignore)
44
44 45
45 def hfsignoreclean(s): 46 def hfsignoreclean(s):
46 """Remove codepoints ignored by HFS+ from s. 47 """Remove codepoints ignored by HFS+ from s.
47 48
48 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) 49 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8'))
53 if "\xe2" in s or "\xef" in s: 54 if "\xe2" in s or "\xef" in s:
54 for c in _ignore: 55 for c in _ignore:
55 s = s.replace(c, '') 56 s = s.replace(c, '')
56 return s 57 return s
57 58
59
58 # encoding.environ is provided read-only, which may not be used to modify 60 # encoding.environ is provided read-only, which may not be used to modify
59 # the process environment 61 # the process environment
60 _nativeenviron = (not pycompat.ispy3 or os.supports_bytes_environ) 62 _nativeenviron = not pycompat.ispy3 or os.supports_bytes_environ
61 if not pycompat.ispy3: 63 if not pycompat.ispy3:
62 environ = os.environ # re-exports 64 environ = os.environ # re-exports
63 elif _nativeenviron: 65 elif _nativeenviron:
64 environ = os.environb # re-exports 66 environ = os.environb # re-exports
65 else: 67 else:
66 # preferred encoding isn't known yet; use utf-8 to avoid unicode error 68 # preferred encoding isn't known yet; use utf-8 to avoid unicode error
67 # and recreate it once encoding is settled 69 # and recreate it once encoding is settled
68 environ = dict((k.encode(r'utf-8'), v.encode(r'utf-8')) 70 environ = dict(
69 for k, v in os.environ.items()) # re-exports 71 (k.encode(r'utf-8'), v.encode(r'utf-8'))
72 for k, v in os.environ.items() # re-exports
73 )
70 74
71 _encodingrewrites = { 75 _encodingrewrites = {
72 '646': 'ascii', 76 '646': 'ascii',
73 'ANSI_X3.4-1968': 'ascii', 77 'ANSI_X3.4-1968': 'ascii',
74 } 78 }
86 except locale.Error: 90 except locale.Error:
87 encoding = 'ascii' 91 encoding = 'ascii'
88 encodingmode = environ.get("HGENCODINGMODE", "strict") 92 encodingmode = environ.get("HGENCODINGMODE", "strict")
89 fallbackencoding = 'ISO-8859-1' 93 fallbackencoding = 'ISO-8859-1'
90 94
95
91 class localstr(bytes): 96 class localstr(bytes):
92 '''This class allows strings that are unmodified to be 97 '''This class allows strings that are unmodified to be
93 round-tripped to the local encoding and back''' 98 round-tripped to the local encoding and back'''
99
94 def __new__(cls, u, l): 100 def __new__(cls, u, l):
95 s = bytes.__new__(cls, l) 101 s = bytes.__new__(cls, l)
96 s._utf8 = u 102 s._utf8 = u
97 return s 103 return s
104
98 def __hash__(self): 105 def __hash__(self):
99 return hash(self._utf8) # avoid collisions in local string space 106 return hash(self._utf8) # avoid collisions in local string space
107
100 108
101 class safelocalstr(bytes): 109 class safelocalstr(bytes):
102 """Tagged string denoting it was previously an internal UTF-8 string, 110 """Tagged string denoting it was previously an internal UTF-8 string,
103 and can be converted back to UTF-8 losslessly 111 and can be converted back to UTF-8 losslessly
104 112
105 >>> assert safelocalstr(b'\\xc3') == b'\\xc3' 113 >>> assert safelocalstr(b'\\xc3') == b'\\xc3'
106 >>> assert b'\\xc3' == safelocalstr(b'\\xc3') 114 >>> assert b'\\xc3' == safelocalstr(b'\\xc3')
107 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0} 115 >>> assert b'\\xc3' in {safelocalstr(b'\\xc3'): 0}
108 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0} 116 >>> assert safelocalstr(b'\\xc3') in {b'\\xc3': 0}
109 """ 117 """
118
110 119
111 def tolocal(s): 120 def tolocal(s):
112 """ 121 """
113 Convert a string from internal UTF-8 to local encoding 122 Convert a string from internal UTF-8 to local encoding
114 123
165 if u == r.decode(_sysstr(encoding)): 174 if u == r.decode(_sysstr(encoding)):
166 # r is a safe, non-lossy encoding of s 175 # r is a safe, non-lossy encoding of s
167 return safelocalstr(r) 176 return safelocalstr(r)
168 return localstr(u.encode('UTF-8'), r) 177 return localstr(u.encode('UTF-8'), r)
169 except UnicodeDecodeError: 178 except UnicodeDecodeError:
170 u = s.decode("utf-8", "replace") # last ditch 179 u = s.decode("utf-8", "replace") # last ditch
171 # can't round-trip 180 # can't round-trip
172 return u.encode(_sysstr(encoding), r"replace") 181 return u.encode(_sysstr(encoding), r"replace")
173 except LookupError as k: 182 except LookupError as k:
174 raise error.Abort(k, hint="please check your locale settings") 183 raise error.Abort(k, hint="please check your locale settings")
184
175 185
176 def fromlocal(s): 186 def fromlocal(s):
177 """ 187 """
178 Convert a string from the local character encoding to UTF-8 188 Convert a string from the local character encoding to UTF-8
179 189
192 202
193 try: 203 try:
194 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) 204 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
195 return u.encode("utf-8") 205 return u.encode("utf-8")
196 except UnicodeDecodeError as inst: 206 except UnicodeDecodeError as inst:
197 sub = s[max(0, inst.start - 10):inst.start + 10] 207 sub = s[max(0, inst.start - 10) : inst.start + 10]
198 raise error.Abort("decoding near '%s': %s!" 208 raise error.Abort(
199 % (sub, pycompat.bytestr(inst))) 209 "decoding near '%s': %s!" % (sub, pycompat.bytestr(inst))
210 )
200 except LookupError as k: 211 except LookupError as k:
201 raise error.Abort(k, hint="please check your locale settings") 212 raise error.Abort(k, hint="please check your locale settings")
213
202 214
203 def unitolocal(u): 215 def unitolocal(u):
204 """Convert a unicode string to a byte string of local encoding""" 216 """Convert a unicode string to a byte string of local encoding"""
205 return tolocal(u.encode('utf-8')) 217 return tolocal(u.encode('utf-8'))
206 218
219
207 def unifromlocal(s): 220 def unifromlocal(s):
208 """Convert a byte string of local encoding to a unicode string""" 221 """Convert a byte string of local encoding to a unicode string"""
209 return fromlocal(s).decode('utf-8') 222 return fromlocal(s).decode('utf-8')
210 223
224
211 def unimethod(bytesfunc): 225 def unimethod(bytesfunc):
212 """Create a proxy method that forwards __unicode__() and __str__() of 226 """Create a proxy method that forwards __unicode__() and __str__() of
213 Python 3 to __bytes__()""" 227 Python 3 to __bytes__()"""
228
214 def unifunc(obj): 229 def unifunc(obj):
215 return unifromlocal(bytesfunc(obj)) 230 return unifromlocal(bytesfunc(obj))
231
216 return unifunc 232 return unifunc
233
217 234
218 # converter functions between native str and byte string. use these if the 235 # converter functions between native str and byte string. use these if the
219 # character encoding is not aware (e.g. exception message) or is known to 236 # character encoding is not aware (e.g. exception message) or is known to
220 # be locale dependent (e.g. date formatting.) 237 # be locale dependent (e.g. date formatting.)
221 if pycompat.ispy3: 238 if pycompat.ispy3:
228 strmethod = pycompat.identity 245 strmethod = pycompat.identity
229 246
230 if not _nativeenviron: 247 if not _nativeenviron:
231 # now encoding and helper functions are available, recreate the environ 248 # now encoding and helper functions are available, recreate the environ
232 # dict to be exported to other modules 249 # dict to be exported to other modules
233 environ = dict((tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8'))) 250 environ = dict(
234 for k, v in os.environ.items()) # re-exports 251 (tolocal(k.encode(r'utf-8')), tolocal(v.encode(r'utf-8')))
252 for k, v in os.environ.items() # re-exports
253 )
235 254
236 if pycompat.ispy3: 255 if pycompat.ispy3:
237 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which 256 # os.getcwd() on Python 3 returns string, but it has os.getcwdb() which
238 # returns bytes. 257 # returns bytes.
239 if pycompat.iswindows: 258 if pycompat.iswindows:
244 getcwd = os.getcwdb # re-exports 263 getcwd = os.getcwdb # re-exports
245 else: 264 else:
246 getcwd = os.getcwd # re-exports 265 getcwd = os.getcwd # re-exports
247 266
248 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. 267 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide.
249 _wide = _sysstr(environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" 268 _wide = _sysstr(
250 and "WFA" or "WF") 269 environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" and "WFA" or "WF"
270 )
271
251 272
252 def colwidth(s): 273 def colwidth(s):
253 "Find the column width of a string for display in the local encoding" 274 "Find the column width of a string for display in the local encoding"
254 return ucolwidth(s.decode(_sysstr(encoding), r'replace')) 275 return ucolwidth(s.decode(_sysstr(encoding), r'replace'))
276
255 277
256 def ucolwidth(d): 278 def ucolwidth(d):
257 "Find the column width of a Unicode string for display" 279 "Find the column width of a Unicode string for display"
258 eaw = getattr(unicodedata, 'east_asian_width', None) 280 eaw = getattr(unicodedata, 'east_asian_width', None)
259 if eaw is not None: 281 if eaw is not None:
260 return sum([eaw(c) in _wide and 2 or 1 for c in d]) 282 return sum([eaw(c) in _wide and 2 or 1 for c in d])
261 return len(d) 283 return len(d)
284
262 285
263 def getcols(s, start, c): 286 def getcols(s, start, c):
264 '''Use colwidth to find a c-column substring of s starting at byte 287 '''Use colwidth to find a c-column substring of s starting at byte
265 index start''' 288 index start'''
266 for x in pycompat.xrange(start + c, len(s)): 289 for x in pycompat.xrange(start + c, len(s)):
267 t = s[start:x] 290 t = s[start:x]
268 if colwidth(t) == c: 291 if colwidth(t) == c:
269 return t 292 return t
293
270 294
271 def trim(s, width, ellipsis='', leftside=False): 295 def trim(s, width, ellipsis='', leftside=False):
272 """Trim string 's' to at most 'width' columns (including 'ellipsis'). 296 """Trim string 's' to at most 'width' columns (including 'ellipsis').
273 297
274 If 'leftside' is True, left side of string 's' is trimmed. 298 If 'leftside' is True, left side of string 's' is trimmed.
334 + 358 +
335 """ 359 """
336 try: 360 try:
337 u = s.decode(_sysstr(encoding)) 361 u = s.decode(_sysstr(encoding))
338 except UnicodeDecodeError: 362 except UnicodeDecodeError:
339 if len(s) <= width: # trimming is not needed 363 if len(s) <= width: # trimming is not needed
340 return s 364 return s
341 width -= len(ellipsis) 365 width -= len(ellipsis)
342 if width <= 0: # no enough room even for ellipsis 366 if width <= 0: # no enough room even for ellipsis
343 return ellipsis[:width + len(ellipsis)] 367 return ellipsis[: width + len(ellipsis)]
344 if leftside: 368 if leftside:
345 return ellipsis + s[-width:] 369 return ellipsis + s[-width:]
346 return s[:width] + ellipsis 370 return s[:width] + ellipsis
347 371
348 if ucolwidth(u) <= width: # trimming is not needed 372 if ucolwidth(u) <= width: # trimming is not needed
349 return s 373 return s
350 374
351 width -= len(ellipsis) 375 width -= len(ellipsis)
352 if width <= 0: # no enough room even for ellipsis 376 if width <= 0: # no enough room even for ellipsis
353 return ellipsis[:width + len(ellipsis)] 377 return ellipsis[: width + len(ellipsis)]
354 378
355 if leftside: 379 if leftside:
356 uslice = lambda i: u[i:] 380 uslice = lambda i: u[i:]
357 concat = lambda s: ellipsis + s 381 concat = lambda s: ellipsis + s
358 else: 382 else:
360 concat = lambda s: s + ellipsis 384 concat = lambda s: s + ellipsis
361 for i in pycompat.xrange(1, len(u)): 385 for i in pycompat.xrange(1, len(u)):
362 usub = uslice(i) 386 usub = uslice(i)
363 if ucolwidth(usub) <= width: 387 if ucolwidth(usub) <= width:
364 return concat(usub.encode(_sysstr(encoding))) 388 return concat(usub.encode(_sysstr(encoding)))
365 return ellipsis # no enough room for multi-column characters 389 return ellipsis # no enough room for multi-column characters
390
366 391
367 def lower(s): 392 def lower(s):
368 "best-effort encoding-aware case-folding of local string s" 393 "best-effort encoding-aware case-folding of local string s"
369 try: 394 try:
370 return asciilower(s) 395 return asciilower(s)
376 else: 401 else:
377 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) 402 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
378 403
379 lu = u.lower() 404 lu = u.lower()
380 if u == lu: 405 if u == lu:
381 return s # preserve localstring 406 return s # preserve localstring
382 return lu.encode(_sysstr(encoding)) 407 return lu.encode(_sysstr(encoding))
383 except UnicodeError: 408 except UnicodeError:
384 return s.lower() # we don't know how to fold this except in ASCII 409 return s.lower() # we don't know how to fold this except in ASCII
385 except LookupError as k: 410 except LookupError as k:
386 raise error.Abort(k, hint="please check your locale settings") 411 raise error.Abort(k, hint="please check your locale settings")
412
387 413
388 def upper(s): 414 def upper(s):
389 "best-effort encoding-aware case-folding of local string s" 415 "best-effort encoding-aware case-folding of local string s"
390 try: 416 try:
391 return asciiupper(s) 417 return asciiupper(s)
392 except UnicodeDecodeError: 418 except UnicodeDecodeError:
393 return upperfallback(s) 419 return upperfallback(s)
420
394 421
395 def upperfallback(s): 422 def upperfallback(s):
396 try: 423 try:
397 if isinstance(s, localstr): 424 if isinstance(s, localstr):
398 u = s._utf8.decode("utf-8") 425 u = s._utf8.decode("utf-8")
399 else: 426 else:
400 u = s.decode(_sysstr(encoding), _sysstr(encodingmode)) 427 u = s.decode(_sysstr(encoding), _sysstr(encodingmode))
401 428
402 uu = u.upper() 429 uu = u.upper()
403 if u == uu: 430 if u == uu:
404 return s # preserve localstring 431 return s # preserve localstring
405 return uu.encode(_sysstr(encoding)) 432 return uu.encode(_sysstr(encoding))
406 except UnicodeError: 433 except UnicodeError:
407 return s.upper() # we don't know how to fold this except in ASCII 434 return s.upper() # we don't know how to fold this except in ASCII
408 except LookupError as k: 435 except LookupError as k:
409 raise error.Abort(k, hint="please check your locale settings") 436 raise error.Abort(k, hint="please check your locale settings")
437
410 438
411 class normcasespecs(object): 439 class normcasespecs(object):
412 '''what a platform's normcase does to ASCII strings 440 '''what a platform's normcase does to ASCII strings
413 441
414 This is specified per platform, and should be consistent with what normcase 442 This is specified per platform, and should be consistent with what normcase
417 lower: normcase lowercases ASCII strings 445 lower: normcase lowercases ASCII strings
418 upper: normcase uppercases ASCII strings 446 upper: normcase uppercases ASCII strings
419 other: the fallback function should always be called 447 other: the fallback function should always be called
420 448
421 This should be kept in sync with normcase_spec in util.h.''' 449 This should be kept in sync with normcase_spec in util.h.'''
450
422 lower = -1 451 lower = -1
423 upper = 1 452 upper = 1
424 other = 0 453 other = 0
454
425 455
426 def jsonescape(s, paranoid=False): 456 def jsonescape(s, paranoid=False):
427 '''returns a string suitable for JSON 457 '''returns a string suitable for JSON
428 458
429 JSON is problematic for us because it doesn't support non-Unicode 459 JSON is problematic for us because it doesn't support non-Unicode
473 return _jsonescapeu8fast(u8chars, paranoid) 503 return _jsonescapeu8fast(u8chars, paranoid)
474 except ValueError: 504 except ValueError:
475 pass 505 pass
476 return charencodepure.jsonescapeu8fallback(u8chars, paranoid) 506 return charencodepure.jsonescapeu8fallback(u8chars, paranoid)
477 507
508
478 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8 509 # We need to decode/encode U+DCxx codes transparently since invalid UTF-8
479 # bytes are mapped to that range. 510 # bytes are mapped to that range.
480 if pycompat.ispy3: 511 if pycompat.ispy3:
481 _utf8strict = r'surrogatepass' 512 _utf8strict = r'surrogatepass'
482 else: 513 else:
483 _utf8strict = r'strict' 514 _utf8strict = r'strict'
484 515
485 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] 516 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
486 517
518
487 def getutf8char(s, pos): 519 def getutf8char(s, pos):
488 '''get the next full utf-8 character in the given string, starting at pos 520 '''get the next full utf-8 character in the given string, starting at pos
489 521
490 Raises a UnicodeError if the given location does not start a valid 522 Raises a UnicodeError if the given location does not start a valid
491 utf-8 character. 523 utf-8 character.
492 ''' 524 '''
493 525
494 # find how many bytes to attempt decoding from first nibble 526 # find how many bytes to attempt decoding from first nibble
495 l = _utf8len[ord(s[pos:pos + 1]) >> 4] 527 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
496 if not l: # ascii 528 if not l: # ascii
497 return s[pos:pos + 1] 529 return s[pos : pos + 1]
498 530
499 c = s[pos:pos + l] 531 c = s[pos : pos + l]
500 # validate with attempted decode 532 # validate with attempted decode
501 c.decode("utf-8", _utf8strict) 533 c.decode("utf-8", _utf8strict)
502 return c 534 return c
535
503 536
504 def toutf8b(s): 537 def toutf8b(s):
505 '''convert a local, possibly-binary string into UTF-8b 538 '''convert a local, possibly-binary string into UTF-8b
506 539
507 This is intended as a generic method to preserve data when working 540 This is intended as a generic method to preserve data when working
556 while pos < l: 589 while pos < l:
557 try: 590 try:
558 c = getutf8char(s, pos) 591 c = getutf8char(s, pos)
559 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": 592 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
560 # have to re-escape existing U+DCxx characters 593 # have to re-escape existing U+DCxx characters
561 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) 594 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
562 pos += 1 595 pos += 1
563 else: 596 else:
564 pos += len(c) 597 pos += len(c)
565 except UnicodeDecodeError: 598 except UnicodeDecodeError:
566 c = unichr(0xdc00 + ord(s[pos])).encode('utf-8', _utf8strict) 599 c = unichr(0xDC00 + ord(s[pos])).encode('utf-8', _utf8strict)
567 pos += 1 600 pos += 1
568 r += c 601 r += c
569 return r 602 return r
603
570 604
571 def fromutf8b(s): 605 def fromutf8b(s):
572 '''Given a UTF-8b string, return a local, possibly-binary string. 606 '''Given a UTF-8b string, return a local, possibly-binary string.
573 607
574 return the original binary string. This 608 return the original binary string. This
609 while pos < l: 643 while pos < l:
610 c = getutf8char(s, pos) 644 c = getutf8char(s, pos)
611 pos += len(c) 645 pos += len(c)
612 # unescape U+DCxx characters 646 # unescape U+DCxx characters
613 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf": 647 if "\xed\xb0\x80" <= c <= "\xed\xb3\xbf":
614 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xff) 648 c = pycompat.bytechr(ord(c.decode("utf-8", _utf8strict)) & 0xFF)
615 r += c 649 r += c
616 return r 650 return r