comparison mercurial/encoding.py @ 45942:89a2afe31e82

formatting: upgrade to black 20.8b1

This required a couple of small tweaks to un-confuse black, but now it works. Big formatting changes come from:

* Dramatically improved collection-splitting logic upstream
* Black having a strong (correct IMO) opinion that """ is better than '''

Differential Revision: https://phab.mercurial-scm.org/D9430
author Augie Fackler <raf@durin42.com>
date Fri, 27 Nov 2020 17:03:29 -0500
parents a736ab681b78
children 3dfebba99ef6
comparison
equal deleted inserted replaced
45941:346af7687c6f 45942:89a2afe31e82
111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict") 111 encodingmode = environ.get(b"HGENCODINGMODE", b"strict")
112 fallbackencoding = b'ISO-8859-1' 112 fallbackencoding = b'ISO-8859-1'
113 113
114 114
115 class localstr(bytes): 115 class localstr(bytes):
116 '''This class allows strings that are unmodified to be 116 """This class allows strings that are unmodified to be
117 round-tripped to the local encoding and back''' 117 round-tripped to the local encoding and back"""
118 118
119 def __new__(cls, u, l): 119 def __new__(cls, u, l):
120 s = bytes.__new__(cls, l) 120 s = bytes.__new__(cls, l)
121 s._utf8 = u 121 s._utf8 = u
122 return s 122 return s
327 return len(d) 327 return len(d)
328 328
329 329
330 def getcols(s, start, c): 330 def getcols(s, start, c):
331 # type: (bytes, int, int) -> bytes 331 # type: (bytes, int, int) -> bytes
332 '''Use colwidth to find a c-column substring of s starting at byte 332 """Use colwidth to find a c-column substring of s starting at byte
333 index start''' 333 index start"""
334 for x in pycompat.xrange(start + c, len(s)): 334 for x in pycompat.xrange(start + c, len(s)):
335 t = s[start:x] 335 t = s[start:x]
336 if colwidth(t) == c: 336 if colwidth(t) == c:
337 return t 337 return t
338 raise ValueError('substring not found') 338 raise ValueError('substring not found')
485 except LookupError as k: 485 except LookupError as k:
486 raise error.Abort(k, hint=b"please check your locale settings") 486 raise error.Abort(k, hint=b"please check your locale settings")
487 487
488 488
489 class normcasespecs(object): 489 class normcasespecs(object):
490 '''what a platform's normcase does to ASCII strings 490 """what a platform's normcase does to ASCII strings
491 491
492 This is specified per platform, and should be consistent with what normcase 492 This is specified per platform, and should be consistent with what normcase
493 on that platform actually does. 493 on that platform actually does.
494 494
495 lower: normcase lowercases ASCII strings 495 lower: normcase lowercases ASCII strings
496 upper: normcase uppercases ASCII strings 496 upper: normcase uppercases ASCII strings
497 other: the fallback function should always be called 497 other: the fallback function should always be called
498 498
499 This should be kept in sync with normcase_spec in util.h.''' 499 This should be kept in sync with normcase_spec in util.h."""
500 500
501 lower = -1 501 lower = -1
502 upper = 1 502 upper = 1
503 other = 0 503 other = 0
504 504
505 505
506 def jsonescape(s, paranoid=False): 506 def jsonescape(s, paranoid=False):
507 # type: (Any, Any) -> Any 507 # type: (Any, Any) -> Any
508 '''returns a string suitable for JSON 508 """returns a string suitable for JSON
509 509
510 JSON is problematic for us because it doesn't support non-Unicode 510 JSON is problematic for us because it doesn't support non-Unicode
511 bytes. To deal with this, we take the following approach: 511 bytes. To deal with this, we take the following approach:
512 512
513 - localstr/safelocalstr objects are converted back to UTF-8 513 - localstr/safelocalstr objects are converted back to UTF-8
545 'utf-8: caf\\\\u00e9' 545 'utf-8: caf\\\\u00e9'
546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True) 546 >>> jsonescape(b'non-BMP: \\xf0\\x9d\\x84\\x9e', paranoid=True)
547 'non-BMP: \\\\ud834\\\\udd1e' 547 'non-BMP: \\\\ud834\\\\udd1e'
548 >>> jsonescape(b'<foo@example.org>', paranoid=True) 548 >>> jsonescape(b'<foo@example.org>', paranoid=True)
549 '\\\\u003cfoo@example.org\\\\u003e' 549 '\\\\u003cfoo@example.org\\\\u003e'
550 ''' 550 """
551 551
552 u8chars = toutf8b(s) 552 u8chars = toutf8b(s)
553 try: 553 try:
554 return _jsonescapeu8fast(u8chars, paranoid) 554 return _jsonescapeu8fast(u8chars, paranoid)
555 except ValueError: 555 except ValueError:
567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4] 567 _utf8len = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 4]
568 568
569 569
570 def getutf8char(s, pos): 570 def getutf8char(s, pos):
571 # type: (bytes, int) -> bytes 571 # type: (bytes, int) -> bytes
572 '''get the next full utf-8 character in the given string, starting at pos 572 """get the next full utf-8 character in the given string, starting at pos
573 573
574 Raises a UnicodeError if the given location does not start a valid 574 Raises a UnicodeError if the given location does not start a valid
575 utf-8 character. 575 utf-8 character.
576 ''' 576 """
577 577
578 # find how many bytes to attempt decoding from first nibble 578 # find how many bytes to attempt decoding from first nibble
579 l = _utf8len[ord(s[pos : pos + 1]) >> 4] 579 l = _utf8len[ord(s[pos : pos + 1]) >> 4]
580 if not l: # ascii 580 if not l: # ascii
581 return s[pos : pos + 1] 581 return s[pos : pos + 1]
586 return c 586 return c
587 587
588 588
589 def toutf8b(s): 589 def toutf8b(s):
590 # type: (bytes) -> bytes 590 # type: (bytes) -> bytes
591 '''convert a local, possibly-binary string into UTF-8b 591 """convert a local, possibly-binary string into UTF-8b
592 592
593 This is intended as a generic method to preserve data when working 593 This is intended as a generic method to preserve data when working
594 with schemes like JSON and XML that have no provision for 594 with schemes like JSON and XML that have no provision for
595 arbitrary byte strings. As Mercurial often doesn't know 595 arbitrary byte strings. As Mercurial often doesn't know
596 what encoding data is in, we use so-called UTF-8b. 596 what encoding data is in, we use so-called UTF-8b.
614 614
615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and 615 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and
616 arbitrary bytes into an internal Unicode format that can be 616 arbitrary bytes into an internal Unicode format that can be
617 re-encoded back into the original. Here we are exposing the 617 re-encoded back into the original. Here we are exposing the
618 internal surrogate encoding as a UTF-8 string.) 618 internal surrogate encoding as a UTF-8 string.)
619 ''' 619 """
620 620
621 if isinstance(s, localstr): 621 if isinstance(s, localstr):
622 # assume that the original UTF-8 sequence would never contain 622 # assume that the original UTF-8 sequence would never contain
623 # invalid characters in U+DCxx range 623 # invalid characters in U+DCxx range
624 return s._utf8 624 return s._utf8
655 return r 655 return r
656 656
657 657
658 def fromutf8b(s): 658 def fromutf8b(s):
659 # type: (bytes) -> bytes 659 # type: (bytes) -> bytes
660 '''Given a UTF-8b string, return a local, possibly-binary string. 660 """Given a UTF-8b string, return a local, possibly-binary string.
661 661
662 return the original binary string. This 662 return the original binary string. This
663 is a round-trip process for strings like filenames, but metadata 663 is a round-trip process for strings like filenames, but metadata
664 that was passed through tolocal will remain in UTF-8. 664 that was passed through tolocal will remain in UTF-8.
665 665
675 True 675 True
676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd") 676 >>> roundtrip(b"\\xef\\xef\\xbf\\xbd")
677 True 677 True
678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80") 678 >>> roundtrip(b"\\xf1\\x80\\x80\\x80\\x80")
679 True 679 True
680 ''' 680 """
681 681
682 if isasciistr(s): 682 if isasciistr(s):
683 return s 683 return s
684 # fast path - look for uDxxx prefixes in s 684 # fast path - look for uDxxx prefixes in s
685 if b"\xed" not in s: 685 if b"\xed" not in s: