comparison mercurial/util.py @ 37083:f99d64e8a4e4

stringutil: move generic string helpers to new module. Per https://phab.mercurial-scm.org/D2903#46738, URL and file path functions are left in place since they are big enough to warrant separate modules.
author Yuya Nishihara <yuya@tcha.org>
date Thu, 22 Mar 2018 21:19:31 +0900
parents 1a1d1c44b570
children f0b6fbea00cf
comparison
equal deleted inserted replaced
37082:1a1d1c44b570 37083:f99d64e8a4e4
15 15
16 from __future__ import absolute_import, print_function 16 from __future__ import absolute_import, print_function
17 17
18 import abc 18 import abc
19 import bz2 19 import bz2
20 import codecs
21 import collections 20 import collections
22 import contextlib 21 import contextlib
23 import errno 22 import errno
24 import gc 23 import gc
25 import hashlib 24 import hashlib
35 import socket 34 import socket
36 import stat 35 import stat
37 import subprocess 36 import subprocess
38 import sys 37 import sys
39 import tempfile 38 import tempfile
40 import textwrap
41 import time 39 import time
42 import traceback 40 import traceback
43 import warnings 41 import warnings
44 import zlib 42 import zlib
45 43
50 node as nodemod, 48 node as nodemod,
51 policy, 49 policy,
52 pycompat, 50 pycompat,
53 urllibcompat, 51 urllibcompat,
54 ) 52 )
55 from .utils import dateutil 53 from .utils import (
54 dateutil,
55 stringutil,
56 )
56 57
57 base85 = policy.importmod(r'base85') 58 base85 = policy.importmod(r'base85')
58 osutil = policy.importmod(r'osutil') 59 osutil = policy.importmod(r'osutil')
59 parsers = policy.importmod(r'parsers') 60 parsers = policy.importmod(r'parsers')
60 61
806 807
def setsockopt(self, *args, **kwargs):
    """Forward a ``setsockopt`` call through the ``_observedcall`` hook."""
    # object.__getattribute__ is invoked directly, which skips any
    # __getattribute__ defined on the instance's own class.
    observedcall = object.__getattribute__(self, r'_observedcall')
    return observedcall(r'setsockopt', *args, **kwargs)
810 811
# Escape table covering every byte value: the default is a \xNN hex escape,
# with dedicated short forms for backslash, CR and LF.
_DATA_ESCAPE_MAP = dict(
    (pycompat.bytechr(i), br'\x%02x' % i) for i in range(256))
_DATA_ESCAPE_MAP[b'\\'] = b'\\\\'
_DATA_ESCAPE_MAP[b'\r'] = br'\r'
_DATA_ESCAPE_MAP[b'\n'] = br'\n'
# Bytes that get escaped: control bytes other than tab (0x09), the
# backslash, DEL and everything with the high bit set.
_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]')

def escapedata(s):
    """Return ``s`` with the bytes matched by _DATA_ESCAPE_RE replaced by
    their entries in _DATA_ESCAPE_MAP.

    A ``bytearray`` argument is converted to ``bytes`` first.
    """
    if isinstance(s, bytearray):
        s = bytes(s)

    def _escape(found):
        return _DATA_ESCAPE_MAP[found.group(0)]

    return _DATA_ESCAPE_RE.sub(_escape, s)
824
825 class baseproxyobserver(object): 812 class baseproxyobserver(object):
826 def _writedata(self, data): 813 def _writedata(self, data):
827 if not self.logdata: 814 if not self.logdata:
828 if self.logdataapis: 815 if self.logdataapis:
829 self.fh.write('\n') 816 self.fh.write('\n')
1564 "filter a string through a command that transforms its input to its output" 1551 "filter a string through a command that transforms its input to its output"
1565 for name, fn in filtertable.iteritems(): 1552 for name, fn in filtertable.iteritems():
1566 if cmd.startswith(name): 1553 if cmd.startswith(name):
1567 return fn(s, cmd[len(name):].lstrip()) 1554 return fn(s, cmd[len(name):].lstrip())
1568 return pipefilter(s, cmd) 1555 return pipefilter(s, cmd)
1569
def binary(s):
    """Tell whether ``s`` is binary data, i.e. non-empty and containing NUL."""
    if not s:
        # None and the empty string are never binary
        return False
    return '\0' in s
1573 1556
1574 def increasingchunks(source, min=1024, max=65536): 1557 def increasingchunks(source, min=1024, max=65536):
1575 '''return no less than min bytes per chunk while data remains, 1558 '''return no less than min bytes per chunk while data remains,
1576 doubling min after each chunk until it reaches max''' 1559 doubling min after each chunk until it reaches max'''
1577 def log2(x): 1560 def log2(x):
2569 return None 2552 return None
2570 2553
2571 b[0:len(res)] = res 2554 b[0:len(res)] = res
2572 return len(res) 2555 return len(res)
2573 2556
def stringmatcher(pattern, casesensitive=True):
    """
    accepts a string, possibly starting with 're:' or 'literal:' prefix.
    returns the matcher name, pattern, and matcher function.
    missing or unknown prefixes are treated as literal matches.

    For 're:' patterns the matcher is the compiled regex's ``search``;
    an invalid regular expression raises error.ParseError. For literal
    patterns the matcher compares with ``==`` and therefore always returns
    a real boolean, including for arguments of a different type.

    helper for tests:
    >>> def test(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])
    >>> def itest(pattern, *tests):
    ...     kind, pattern, matcher = stringmatcher(pattern, casesensitive=False)
    ...     return (kind, pattern, [bool(matcher(t)) for t in tests])

    exact matching (no prefix):
    >>> test(b'abcdefg', b'abc', b'def', b'abcdefg')
    ('literal', 'abcdefg', [False, False, True])

    regex matching ('re:' prefix)
    >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar')
    ('re', 'a.+b', [False, False, True])

    force exact matches ('literal:' prefix)
    >>> test(b'literal:re:foobar', b'foobar', b're:foobar')
    ('literal', 're:foobar', [False, True])

    unknown prefixes are ignored and treated as literals
    >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar')
    ('literal', 'foo:bar', [False, False, True])

    case insensitive regex matches
    >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar')
    ('re', 'A.+b', [False, False, True])

    case insensitive literal matches
    >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg')
    ('literal', 'ABCDEFG', [False, False, True])
    """
    if pattern.startswith('re:'):
        pattern = pattern[3:]
        try:
            flags = 0
            if not casesensitive:
                flags = remod.I
            regex = remod.compile(pattern, flags)
        except remod.error as e:
            raise error.ParseError(_('invalid regular expression: %s')
                                   % e)
        return 're', pattern, regex.search
    elif pattern.startswith('literal:'):
        pattern = pattern[8:]

    # Compare with the == operator instead of handing out the bound
    # pattern.__eq__: the bare method returns NotImplemented (which is
    # truthy) when given an operand of another type.
    match = lambda s: pattern == s

    if not casesensitive:
        ipat = encoding.lower(pattern)
        match = lambda s: ipat == encoding.lower(s)
    return 'literal', pattern, match
2632
def shortuser(user):
    """Return a short representation of a user name or email address.

    Everything from the first '@' on is dropped, then everything up to and
    including the first '<', then everything from the first space on and
    finally everything from the first '.' on.
    """
    at = user.find('@')
    if at >= 0:
        user = user[:at]
    bracket = user.find('<')
    if bracket >= 0:
        user = user[bracket + 1:]
    # the last two separators both truncate at their first occurrence
    for stop in (' ', '.'):
        cut = user.find(stop)
        if cut >= 0:
            user = user[:cut]
    return user
2648
def emailuser(user):
    """Return the user portion of an email address.

    That is the text before the first '@', minus anything up to and
    including the first '<' in that text.
    """
    at = user.find('@')
    local = user if at < 0 else user[:at]
    bracket = local.find('<')
    return local if bracket < 0 else local[bracket + 1:]
2658
def email(author):
    '''get email of author.

    Returns the text after the first '<' (or from the start when there is
    none) up to, but excluding, the first '>' (or to the end when there
    is none).
    '''
    start = author.find('<') + 1  # 0 when no '<' is present
    end = author.find('>')
    if end == -1:
        return author[start:]
    return author[start:end]
2665
def ellipsis(text, maxlength=400):
    """Trim ``text`` to at most ``maxlength`` (default: 400) display
    columns, delegating to encoding.trim with '...' as the ellipsis."""
    trimmed = encoding.trim(text, maxlength, ellipsis='...')
    return trimmed
2669
2670 def unitcountfn(*unittable): 2557 def unitcountfn(*unittable):
2671 '''return a function that renders a readable count of some quantity''' 2558 '''return a function that renders a readable count of some quantity'''
2672 2559
2673 def go(count): 2560 def go(count):
2674 for multiplier, divisor, format in unittable: 2561 for multiplier, divisor, format in unittable:
2748 nativeeolwriter = _crlfwriter 2635 nativeeolwriter = _crlfwriter
2749 else: 2636 else:
2750 tonativeeol = pycompat.identity 2637 tonativeeol = pycompat.identity
2751 fromnativeeol = pycompat.identity 2638 fromnativeeol = pycompat.identity
2752 nativeeolwriter = pycompat.identity 2639 nativeeolwriter = pycompat.identity
2753
def escapestr(s):
    """Return ``s`` backslash-escaped via codecs.escape_encode."""
    # s.encode('string_escape') is not available on Python 3, so the
    # underlying codec function is called directly; it returns a
    # (result, length consumed) pair.
    escaped, _consumed = codecs.escape_encode(s)
    return escaped
2758
def unescapestr(s):
    """Return ``s`` with backslash escapes decoded via codecs.escape_decode."""
    # escape_decode returns a (result, length consumed) pair
    decoded, _consumed = codecs.escape_decode(s)
    return decoded
2761
def forcebytestr(obj):
    """Portably format an arbitrary object (e.g. exception) into a byte
    string.

    pycompat.bytestr() is tried first; if that raises UnicodeEncodeError
    the object is formatted with str() and passed through
    encoding.strtolocal() instead.
    """
    try:
        result = pycompat.bytestr(obj)
    except UnicodeEncodeError:
        # non-ascii string, may be lossy
        localized = encoding.strtolocal(str(obj))
        result = pycompat.bytestr(localized)
    return result
2770
def uirepr(s):
    """Return the byte repr of ``s`` with doubled backslashes collapsed."""
    quoted = pycompat.byterepr(pycompat.bytestr(s))
    # Avoid double backslash in Windows path repr()
    return quoted.replace(b'\\\\', b'\\')
2774
# the TextWrapper subclass is only built on the first call
def _MBTextWrapper(**kwargs):
    """Return a column-width-aware TextWrapper built from ``kwargs``.

    The first call defines the subclass, rebinds the module-level name
    ``_MBTextWrapper`` to that class and returns an instance of it, so
    later calls instantiate the class directly.
    """
    global _MBTextWrapper

    class tw(textwrap.TextWrapper):
        """
        Extend TextWrapper for width-awareness.

        Neither number of 'bytes' in any encoding nor 'characters' is
        appropriate to calculate terminal columns for specified string.

        Original TextWrapper implementation uses built-in 'len()' directly,
        so overriding is needed to use width information of each characters.

        In addition, characters classified into 'ambiguous' width are
        treated as wide in East Asian area, but as narrow in other.

        This requires user decision to determine width of such characters.
        """
        def _cutdown(self, ucstr, space_left):
            # Split ucstr in front of the first character whose cumulative
            # column width exceeds space_left.
            used = 0
            colwidth = encoding.ucolwidth
            for pos, char in enumerate(ucstr):
                used += colwidth(char)
                if used > space_left:
                    return (ucstr[:pos], ucstr[pos:])
            return ucstr, ''

        # overriding of base class
        def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
            space_left = max(width - cur_len, 1)

            if self.break_long_words:
                # put as much of the long word as fits on this line and
                # leave the remainder on the stack
                head, rest = self._cutdown(reversed_chunks[-1], space_left)
                cur_line.append(head)
                reversed_chunks[-1] = rest
            elif not cur_line:
                cur_line.append(reversed_chunks.pop())

        # this overriding code is imported from TextWrapper of Python 2.6
        # to calculate columns of string by 'encoding.ucolwidth()'
        def _wrap_chunks(self, chunks):
            colwidth = encoding.ucolwidth

            lines = []
            if self.width <= 0:
                raise ValueError("invalid width %r (must be > 0)" % self.width)

            # Reversed, so that chunks can be popped cheaply off the end
            # like a stack.
            chunks.reverse()

            while chunks:

                # Chunks collected for the line being built, and their
                # accumulated column width.
                cur_line = []
                cur_len = 0

                # The first line gets initial_indent, all others
                # subsequent_indent.
                indent = (self.subsequent_indent if lines
                          else self.initial_indent)

                # Columns available on this line.
                width = self.width - len(indent)

                # Drop a leading whitespace chunk, except at the very
                # beginning of the text (i.e. no lines started yet).
                if (lines and self.drop_whitespace
                    and chunks[-1].strip() == r''):
                    del chunks[-1]

                while chunks:
                    chunkwidth = colwidth(chunks[-1])

                    # This line is full.
                    if cur_len + chunkwidth > width:
                        break

                    # The chunk still fits on the current line.
                    cur_line.append(chunks.pop())
                    cur_len += chunkwidth

                # The next chunk is too big to fit on *any* line (not
                # just this one).
                if chunks and colwidth(chunks[-1]) > width:
                    self._handle_long_word(chunks, cur_line, cur_len, width)

                # Drop a trailing all-whitespace chunk.
                if (self.drop_whitespace and
                    cur_line and cur_line[-1].strip() == r''):
                    del cur_line[-1]

                # Join the chunks back into a string and add it to the
                # list of all lines (return value).
                if cur_line:
                    lines.append(indent + r''.join(cur_line))

            return lines

    _MBTextWrapper = tw
    return tw(**kwargs)
2878
def wrap(line, width, initindent='', hangindent=''):
    """Wrap ``line`` to ``width`` using _MBTextWrapper.

    ``line``, ``initindent`` (indent of the first line) and ``hangindent``
    (indent of the following lines) are decoded with encoding.encoding
    before wrapping, and the filled text is encoded back with the same
    encoding.
    """
    widest = max(len(hangindent), len(initindent))
    if width <= widest:
        # adjust for weird terminal size
        width = max(78, widest + 1)
    codec = pycompat.sysstr(encoding.encoding)
    errmode = pycompat.sysstr(encoding.encodingmode)
    line = line.decode(codec, errmode)
    initindent = initindent.decode(codec, errmode)
    hangindent = hangindent.decode(codec, errmode)
    wrapper = _MBTextWrapper(width=width,
                             initial_indent=initindent,
                             subsequent_indent=hangindent)
    filled = wrapper.fill(line)
    return filled.encode(codec)
2894 2640
2895 if (pyplatform.python_implementation() == 'CPython' and 2641 if (pyplatform.python_implementation() == 'CPython' and
2896 sys.version_info < (3, 0)): 2642 sys.version_info < (3, 0)):
2897 # There is an issue in CPython that some IO methods do not handle EINTR 2643 # There is an issue in CPython that some IO methods do not handle EINTR
2898 # correctly. The following table shows what CPython version (and functions) 2644 # correctly. The following table shows what CPython version (and functions)
3062 try: 2808 try:
3063 return socket.getservbyname(pycompat.sysstr(port)) 2809 return socket.getservbyname(pycompat.sysstr(port))
3064 except socket.error: 2810 except socket.error:
3065 raise Abort(_("no port number associated with service '%s'") % port) 2811 raise Abort(_("no port number associated with service '%s'") % port)
3066 2812
# spellings accepted by parsebool(), keyed in lower case
_booleans = dict.fromkeys(('1', 'yes', 'true', 'on', 'always'), True)
_booleans.update(dict.fromkeys(('0', 'no', 'false', 'off', 'never'), False))

def parsebool(s):
    """Parse s into a boolean.

    The lookup is case-insensitive. If s is not a valid boolean, returns
    None.
    """
    return _booleans.get(s.lower())
3077
3078 class url(object): 2813 class url(object):
3079 r"""Reliable URL parser. 2814 r"""Reliable URL parser.
3080 2815
3081 This parses URLs and provides attributes for the following 2816 This parses URLs and provides attributes for the following
3082 components: 2817 components:
4339 shortdate = _deprecatedfunc(dateutil.shortdate, '4.6') 4074 shortdate = _deprecatedfunc(dateutil.shortdate, '4.6')
4340 parsetimezone = _deprecatedfunc(dateutil.parsetimezone, '4.6') 4075 parsetimezone = _deprecatedfunc(dateutil.parsetimezone, '4.6')
4341 strdate = _deprecatedfunc(dateutil.strdate, '4.6') 4076 strdate = _deprecatedfunc(dateutil.strdate, '4.6')
4342 parsedate = _deprecatedfunc(dateutil.parsedate, '4.6') 4077 parsedate = _deprecatedfunc(dateutil.parsedate, '4.6')
4343 matchdate = _deprecatedfunc(dateutil.matchdate, '4.6') 4078 matchdate = _deprecatedfunc(dateutil.matchdate, '4.6')
4079
def _deprecatedfunc(func, version):
    """Return a wrapper around ``func`` that warns about its deprecation.

    Each call of the wrapper emits a DeprecationWarning naming ``func``
    and ``version`` and then returns ``func(*args, **kwargs)``. The
    wrapper keeps the ``__name__`` and ``__doc__`` of ``func``.

    NOTE(review): aliases defined before this point in the file already
    call ``_deprecatedfunc``, so an earlier definition presumably exists;
    confirm whether this redefinition is still needed.
    """
    name = getattr(func, '__name__', repr(func))
    module = getattr(func, '__module__', None)
    if module:
        target = '%s.%s' % (module, name)
    else:
        target = name
    msg = ("'util.%s' is deprecated, use '%s'\n"
           "(compatibility will be dropped after Mercurial-%s,"
           " update your code.)") % (name, target, version)

    def wrapped(*args, **kwargs):
        # stacklevel=2 attributes the warning to the caller of the alias
        warnings.warn(msg, DeprecationWarning, stacklevel=2)
        return func(*args, **kwargs)

    wrapped.__name__ = name
    wrapped.__doc__ = getattr(func, '__doc__', None)
    return wrapped
# Aliases for helpers of the stringutil module, each passed through
# _deprecatedfunc with version '4.6'.
4082 escapedata = _deprecatedfunc(stringutil.escapedata, '4.6')
4083 binary = _deprecatedfunc(stringutil.binary, '4.6')
4084 stringmatcher = _deprecatedfunc(stringutil.stringmatcher, '4.6')
4085 shortuser = _deprecatedfunc(stringutil.shortuser, '4.6')
4086 emailuser = _deprecatedfunc(stringutil.emailuser, '4.6')
4087 email = _deprecatedfunc(stringutil.email, '4.6')
4088 ellipsis = _deprecatedfunc(stringutil.ellipsis, '4.6')
4089 escapestr = _deprecatedfunc(stringutil.escapestr, '4.6')
4090 unescapestr = _deprecatedfunc(stringutil.unescapestr, '4.6')
4091 forcebytestr = _deprecatedfunc(stringutil.forcebytestr, '4.6')
4092 uirepr = _deprecatedfunc(stringutil.uirepr, '4.6')
4093 wrap = _deprecatedfunc(stringutil.wrap, '4.6')
4094 parsebool = _deprecatedfunc(stringutil.parsebool, '4.6')