Mercurial > hg-stable
comparison mercurial/util.py @ 37086:f99d64e8a4e4
stringutil: move generic string helpers to new module
Per https://phab.mercurial-scm.org/D2903#46738
URL and file path functions are left in place, since they are big enough to
warrant separate modules of their own.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 22 Mar 2018 21:19:31 +0900 |
parents | 1a1d1c44b570 |
children | f0b6fbea00cf |
comparison
equal
deleted
inserted
replaced
37085:1a1d1c44b570 | 37086:f99d64e8a4e4 |
---|---|
15 | 15 |
16 from __future__ import absolute_import, print_function | 16 from __future__ import absolute_import, print_function |
17 | 17 |
18 import abc | 18 import abc |
19 import bz2 | 19 import bz2 |
20 import codecs | |
21 import collections | 20 import collections |
22 import contextlib | 21 import contextlib |
23 import errno | 22 import errno |
24 import gc | 23 import gc |
25 import hashlib | 24 import hashlib |
35 import socket | 34 import socket |
36 import stat | 35 import stat |
37 import subprocess | 36 import subprocess |
38 import sys | 37 import sys |
39 import tempfile | 38 import tempfile |
40 import textwrap | |
41 import time | 39 import time |
42 import traceback | 40 import traceback |
43 import warnings | 41 import warnings |
44 import zlib | 42 import zlib |
45 | 43 |
50 node as nodemod, | 48 node as nodemod, |
51 policy, | 49 policy, |
52 pycompat, | 50 pycompat, |
53 urllibcompat, | 51 urllibcompat, |
54 ) | 52 ) |
55 from .utils import dateutil | 53 from .utils import ( |
54 dateutil, | |
55 stringutil, | |
56 ) | |
56 | 57 |
57 base85 = policy.importmod(r'base85') | 58 base85 = policy.importmod(r'base85') |
58 osutil = policy.importmod(r'osutil') | 59 osutil = policy.importmod(r'osutil') |
59 parsers = policy.importmod(r'parsers') | 60 parsers = policy.importmod(r'parsers') |
60 | 61 |
806 | 807 |
807 def setsockopt(self, *args, **kwargs): | 808 def setsockopt(self, *args, **kwargs): |
808 return object.__getattribute__(self, r'_observedcall')( | 809 return object.__getattribute__(self, r'_observedcall')( |
809 r'setsockopt', *args, **kwargs) | 810 r'setsockopt', *args, **kwargs) |
810 | 811 |
811 _DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)} | |
812 _DATA_ESCAPE_MAP.update({ | |
813 b'\\': b'\\\\', | |
814 b'\r': br'\r', | |
815 b'\n': br'\n', | |
816 }) | |
817 _DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]') | |
818 | |
819 def escapedata(s): | |
820 if isinstance(s, bytearray): | |
821 s = bytes(s) | |
822 | |
823 return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s) | |
824 | |
825 class baseproxyobserver(object): | 812 class baseproxyobserver(object): |
826 def _writedata(self, data): | 813 def _writedata(self, data): |
827 if not self.logdata: | 814 if not self.logdata: |
828 if self.logdataapis: | 815 if self.logdataapis: |
829 self.fh.write('\n') | 816 self.fh.write('\n') |
1564 "filter a string through a command that transforms its input to its output" | 1551 "filter a string through a command that transforms its input to its output" |
1565 for name, fn in filtertable.iteritems(): | 1552 for name, fn in filtertable.iteritems(): |
1566 if cmd.startswith(name): | 1553 if cmd.startswith(name): |
1567 return fn(s, cmd[len(name):].lstrip()) | 1554 return fn(s, cmd[len(name):].lstrip()) |
1568 return pipefilter(s, cmd) | 1555 return pipefilter(s, cmd) |
1569 | |
1570 def binary(s): | |
1571 """return true if a string is binary data""" | |
1572 return bool(s and '\0' in s) | |
1573 | 1556 |
1574 def increasingchunks(source, min=1024, max=65536): | 1557 def increasingchunks(source, min=1024, max=65536): |
1575 '''return no less than min bytes per chunk while data remains, | 1558 '''return no less than min bytes per chunk while data remains, |
1576 doubling min after each chunk until it reaches max''' | 1559 doubling min after each chunk until it reaches max''' |
1577 def log2(x): | 1560 def log2(x): |
2569 return None | 2552 return None |
2570 | 2553 |
2571 b[0:len(res)] = res | 2554 b[0:len(res)] = res |
2572 return len(res) | 2555 return len(res) |
2573 | 2556 |
2574 def stringmatcher(pattern, casesensitive=True): | |
2575 """ | |
2576 accepts a string, possibly starting with 're:' or 'literal:' prefix. | |
2577 returns the matcher name, pattern, and matcher function. | |
2578 missing or unknown prefixes are treated as literal matches. | |
2579 | |
2580 helper for tests: | |
2581 >>> def test(pattern, *tests): | |
2582 ... kind, pattern, matcher = stringmatcher(pattern) | |
2583 ... return (kind, pattern, [bool(matcher(t)) for t in tests]) | |
2584 >>> def itest(pattern, *tests): | |
2585 ... kind, pattern, matcher = stringmatcher(pattern, casesensitive=False) | |
2586 ... return (kind, pattern, [bool(matcher(t)) for t in tests]) | |
2587 | |
2588 exact matching (no prefix): | |
2589 >>> test(b'abcdefg', b'abc', b'def', b'abcdefg') | |
2590 ('literal', 'abcdefg', [False, False, True]) | |
2591 | |
2592 regex matching ('re:' prefix) | |
2593 >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar') | |
2594 ('re', 'a.+b', [False, False, True]) | |
2595 | |
2596 force exact matches ('literal:' prefix) | |
2597 >>> test(b'literal:re:foobar', b'foobar', b're:foobar') | |
2598 ('literal', 're:foobar', [False, True]) | |
2599 | |
2600 unknown prefixes are ignored and treated as literals | |
2601 >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar') | |
2602 ('literal', 'foo:bar', [False, False, True]) | |
2603 | |
2604 case insensitive regex matches | |
2605 >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar') | |
2606 ('re', 'A.+b', [False, False, True]) | |
2607 | |
2608 case insensitive literal matches | |
2609 >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg') | |
2610 ('literal', 'ABCDEFG', [False, False, True]) | |
2611 """ | |
2612 if pattern.startswith('re:'): | |
2613 pattern = pattern[3:] | |
2614 try: | |
2615 flags = 0 | |
2616 if not casesensitive: | |
2617 flags = remod.I | |
2618 regex = remod.compile(pattern, flags) | |
2619 except remod.error as e: | |
2620 raise error.ParseError(_('invalid regular expression: %s') | |
2621 % e) | |
2622 return 're', pattern, regex.search | |
2623 elif pattern.startswith('literal:'): | |
2624 pattern = pattern[8:] | |
2625 | |
2626 match = pattern.__eq__ | |
2627 | |
2628 if not casesensitive: | |
2629 ipat = encoding.lower(pattern) | |
2630 match = lambda s: ipat == encoding.lower(s) | |
2631 return 'literal', pattern, match | |
2632 | |
2633 def shortuser(user): | |
2634 """Return a short representation of a user name or email address.""" | |
2635 f = user.find('@') | |
2636 if f >= 0: | |
2637 user = user[:f] | |
2638 f = user.find('<') | |
2639 if f >= 0: | |
2640 user = user[f + 1:] | |
2641 f = user.find(' ') | |
2642 if f >= 0: | |
2643 user = user[:f] | |
2644 f = user.find('.') | |
2645 if f >= 0: | |
2646 user = user[:f] | |
2647 return user | |
2648 | |
2649 def emailuser(user): | |
2650 """Return the user portion of an email address.""" | |
2651 f = user.find('@') | |
2652 if f >= 0: | |
2653 user = user[:f] | |
2654 f = user.find('<') | |
2655 if f >= 0: | |
2656 user = user[f + 1:] | |
2657 return user | |
2658 | |
2659 def email(author): | |
2660 '''get email of author.''' | |
2661 r = author.find('>') | |
2662 if r == -1: | |
2663 r = None | |
2664 return author[author.find('<') + 1:r] | |
2665 | |
2666 def ellipsis(text, maxlength=400): | |
2667 """Trim string to at most maxlength (default: 400) columns in display.""" | |
2668 return encoding.trim(text, maxlength, ellipsis='...') | |
2669 | |
2670 def unitcountfn(*unittable): | 2557 def unitcountfn(*unittable): |
2671 '''return a function that renders a readable count of some quantity''' | 2558 '''return a function that renders a readable count of some quantity''' |
2672 | 2559 |
2673 def go(count): | 2560 def go(count): |
2674 for multiplier, divisor, format in unittable: | 2561 for multiplier, divisor, format in unittable: |
2748 nativeeolwriter = _crlfwriter | 2635 nativeeolwriter = _crlfwriter |
2749 else: | 2636 else: |
2750 tonativeeol = pycompat.identity | 2637 tonativeeol = pycompat.identity |
2751 fromnativeeol = pycompat.identity | 2638 fromnativeeol = pycompat.identity |
2752 nativeeolwriter = pycompat.identity | 2639 nativeeolwriter = pycompat.identity |
2753 | |
2754 def escapestr(s): | |
2755 # call underlying function of s.encode('string_escape') directly for | |
2756 # Python 3 compatibility | |
2757 return codecs.escape_encode(s)[0] | |
2758 | |
2759 def unescapestr(s): | |
2760 return codecs.escape_decode(s)[0] | |
2761 | |
2762 def forcebytestr(obj): | |
2763 """Portably format an arbitrary object (e.g. exception) into a byte | |
2764 string.""" | |
2765 try: | |
2766 return pycompat.bytestr(obj) | |
2767 except UnicodeEncodeError: | |
2768 # non-ascii string, may be lossy | |
2769 return pycompat.bytestr(encoding.strtolocal(str(obj))) | |
2770 | |
2771 def uirepr(s): | |
2772 # Avoid double backslash in Windows path repr() | |
2773 return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\') | |
2774 | |
2775 # delay import of textwrap | |
2776 def _MBTextWrapper(**kwargs): | |
2777 class tw(textwrap.TextWrapper): | |
2778 """ | |
2779 Extend TextWrapper for width-awareness. | |
2780 | |
2781 Neither number of 'bytes' in any encoding nor 'characters' is | |
2782 appropriate to calculate terminal columns for specified string. | |
2783 | |
2784 Original TextWrapper implementation uses built-in 'len()' directly, | |
2785 so overriding is needed to use width information of each characters. | |
2786 | |
2787 In addition, characters classified into 'ambiguous' width are | |
2788 treated as wide in East Asian area, but as narrow in other. | |
2789 | |
2790 This requires user decision to determine width of such characters. | |
2791 """ | |
2792 def _cutdown(self, ucstr, space_left): | |
2793 l = 0 | |
2794 colwidth = encoding.ucolwidth | |
2795 for i in xrange(len(ucstr)): | |
2796 l += colwidth(ucstr[i]) | |
2797 if space_left < l: | |
2798 return (ucstr[:i], ucstr[i:]) | |
2799 return ucstr, '' | |
2800 | |
2801 # overriding of base class | |
2802 def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): | |
2803 space_left = max(width - cur_len, 1) | |
2804 | |
2805 if self.break_long_words: | |
2806 cut, res = self._cutdown(reversed_chunks[-1], space_left) | |
2807 cur_line.append(cut) | |
2808 reversed_chunks[-1] = res | |
2809 elif not cur_line: | |
2810 cur_line.append(reversed_chunks.pop()) | |
2811 | |
2812 # this overriding code is imported from TextWrapper of Python 2.6 | |
2813 # to calculate columns of string by 'encoding.ucolwidth()' | |
2814 def _wrap_chunks(self, chunks): | |
2815 colwidth = encoding.ucolwidth | |
2816 | |
2817 lines = [] | |
2818 if self.width <= 0: | |
2819 raise ValueError("invalid width %r (must be > 0)" % self.width) | |
2820 | |
2821 # Arrange in reverse order so items can be efficiently popped | |
2822 # from a stack of chunks. | |
2823 chunks.reverse() | |
2824 | |
2825 while chunks: | |
2826 | |
2827 # Start the list of chunks that will make up the current line. | |
2828 # cur_len is just the length of all the chunks in cur_line. | |
2829 cur_line = [] | |
2830 cur_len = 0 | |
2831 | |
2832 # Figure out which static string will prefix this line. | |
2833 if lines: | |
2834 indent = self.subsequent_indent | |
2835 else: | |
2836 indent = self.initial_indent | |
2837 | |
2838 # Maximum width for this line. | |
2839 width = self.width - len(indent) | |
2840 | |
2841 # First chunk on line is whitespace -- drop it, unless this | |
2842 # is the very beginning of the text (i.e. no lines started yet). | |
2843 if self.drop_whitespace and chunks[-1].strip() == r'' and lines: | |
2844 del chunks[-1] | |
2845 | |
2846 while chunks: | |
2847 l = colwidth(chunks[-1]) | |
2848 | |
2849 # Can at least squeeze this chunk onto the current line. | |
2850 if cur_len + l <= width: | |
2851 cur_line.append(chunks.pop()) | |
2852 cur_len += l | |
2853 | |
2854 # Nope, this line is full. | |
2855 else: | |
2856 break | |
2857 | |
2858 # The current line is full, and the next chunk is too big to | |
2859 # fit on *any* line (not just this one). | |
2860 if chunks and colwidth(chunks[-1]) > width: | |
2861 self._handle_long_word(chunks, cur_line, cur_len, width) | |
2862 | |
2863 # If the last chunk on this line is all whitespace, drop it. | |
2864 if (self.drop_whitespace and | |
2865 cur_line and cur_line[-1].strip() == r''): | |
2866 del cur_line[-1] | |
2867 | |
2868 # Convert current line back to a string and store it in list | |
2869 # of all lines (return value). | |
2870 if cur_line: | |
2871 lines.append(indent + r''.join(cur_line)) | |
2872 | |
2873 return lines | |
2874 | |
2875 global _MBTextWrapper | |
2876 _MBTextWrapper = tw | |
2877 return tw(**kwargs) | |
2878 | |
2879 def wrap(line, width, initindent='', hangindent=''): | |
2880 maxindent = max(len(hangindent), len(initindent)) | |
2881 if width <= maxindent: | |
2882 # adjust for weird terminal size | |
2883 width = max(78, maxindent + 1) | |
2884 line = line.decode(pycompat.sysstr(encoding.encoding), | |
2885 pycompat.sysstr(encoding.encodingmode)) | |
2886 initindent = initindent.decode(pycompat.sysstr(encoding.encoding), | |
2887 pycompat.sysstr(encoding.encodingmode)) | |
2888 hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding), | |
2889 pycompat.sysstr(encoding.encodingmode)) | |
2890 wrapper = _MBTextWrapper(width=width, | |
2891 initial_indent=initindent, | |
2892 subsequent_indent=hangindent) | |
2893 return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding)) | |
2894 | 2640 |
2895 if (pyplatform.python_implementation() == 'CPython' and | 2641 if (pyplatform.python_implementation() == 'CPython' and |
2896 sys.version_info < (3, 0)): | 2642 sys.version_info < (3, 0)): |
2897 # There is an issue in CPython that some IO methods do not handle EINTR | 2643 # There is an issue in CPython that some IO methods do not handle EINTR |
2898 # correctly. The following table shows what CPython version (and functions) | 2644 # correctly. The following table shows what CPython version (and functions) |
3062 try: | 2808 try: |
3063 return socket.getservbyname(pycompat.sysstr(port)) | 2809 return socket.getservbyname(pycompat.sysstr(port)) |
3064 except socket.error: | 2810 except socket.error: |
3065 raise Abort(_("no port number associated with service '%s'") % port) | 2811 raise Abort(_("no port number associated with service '%s'") % port) |
3066 | 2812 |
3067 _booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True, | |
3068 '0': False, 'no': False, 'false': False, 'off': False, | |
3069 'never': False} | |
3070 | |
3071 def parsebool(s): | |
3072 """Parse s into a boolean. | |
3073 | |
3074 If s is not a valid boolean, returns None. | |
3075 """ | |
3076 return _booleans.get(s.lower(), None) | |
3077 | |
3078 class url(object): | 2813 class url(object): |
3079 r"""Reliable URL parser. | 2814 r"""Reliable URL parser. |
3080 | 2815 |
3081 This parses URLs and provides attributes for the following | 2816 This parses URLs and provides attributes for the following |
3082 components: | 2817 components: |
4339 shortdate = _deprecatedfunc(dateutil.shortdate, '4.6') | 4074 shortdate = _deprecatedfunc(dateutil.shortdate, '4.6') |
4340 parsetimezone = _deprecatedfunc(dateutil.parsetimezone, '4.6') | 4075 parsetimezone = _deprecatedfunc(dateutil.parsetimezone, '4.6') |
4341 strdate = _deprecatedfunc(dateutil.strdate, '4.6') | 4076 strdate = _deprecatedfunc(dateutil.strdate, '4.6') |
4342 parsedate = _deprecatedfunc(dateutil.parsedate, '4.6') | 4077 parsedate = _deprecatedfunc(dateutil.parsedate, '4.6') |
4343 matchdate = _deprecatedfunc(dateutil.matchdate, '4.6') | 4078 matchdate = _deprecatedfunc(dateutil.matchdate, '4.6') |
4079 | |
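4080 def _deprecatedfunc(func, version): # TODO: placeholder that returns func unwrapped and ignores version; NOTE(review): _deprecatedfunc is already called above, so this redefines an earlier one - confirm intended | |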
4081 return func | |
4082 escapedata = _deprecatedfunc(stringutil.escapedata, '4.6') | |
4083 binary = _deprecatedfunc(stringutil.binary, '4.6') | |
4084 stringmatcher = _deprecatedfunc(stringutil.stringmatcher, '4.6') | |
4085 shortuser = _deprecatedfunc(stringutil.shortuser, '4.6') | |
4086 emailuser = _deprecatedfunc(stringutil.emailuser, '4.6') | |
4087 email = _deprecatedfunc(stringutil.email, '4.6') | |
4088 ellipsis = _deprecatedfunc(stringutil.ellipsis, '4.6') | |
4089 escapestr = _deprecatedfunc(stringutil.escapestr, '4.6') | |
4090 unescapestr = _deprecatedfunc(stringutil.unescapestr, '4.6') | |
4091 forcebytestr = _deprecatedfunc(stringutil.forcebytestr, '4.6') | |
4092 uirepr = _deprecatedfunc(stringutil.uirepr, '4.6') | |
4093 wrap = _deprecatedfunc(stringutil.wrap, '4.6') | |
4094 parsebool = _deprecatedfunc(stringutil.parsebool, '4.6') |