Mercurial > hg
changeset 37083:f99d64e8a4e4
stringutil: move generic string helpers to new module
Per https://phab.mercurial-scm.org/D2903#46738
URL and file path functions are left in place, since they are big enough to
warrant separate modules of their own.
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Thu, 22 Mar 2018 21:19:31 +0900 |
parents | 1a1d1c44b570 |
children | f0b6fbea00cf |
files | mercurial/util.py mercurial/utils/stringutil.py tests/test-doctest.py |
diffstat | 3 files changed, 309 insertions(+), 269 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/util.py Thu Mar 22 21:32:19 2018 +0900 +++ b/mercurial/util.py Thu Mar 22 21:19:31 2018 +0900 @@ -17,7 +17,6 @@ import abc import bz2 -import codecs import collections import contextlib import errno @@ -37,7 +36,6 @@ import subprocess import sys import tempfile -import textwrap import time import traceback import warnings @@ -52,7 +50,10 @@ pycompat, urllibcompat, ) -from .utils import dateutil +from .utils import ( + dateutil, + stringutil, +) base85 = policy.importmod(r'base85') osutil = policy.importmod(r'osutil') @@ -808,20 +809,6 @@ return object.__getattribute__(self, r'_observedcall')( r'setsockopt', *args, **kwargs) -_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)} -_DATA_ESCAPE_MAP.update({ - b'\\': b'\\\\', - b'\r': br'\r', - b'\n': br'\n', -}) -_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]') - -def escapedata(s): - if isinstance(s, bytearray): - s = bytes(s) - - return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s) - class baseproxyobserver(object): def _writedata(self, data): if not self.logdata: @@ -1567,10 +1554,6 @@ return fn(s, cmd[len(name):].lstrip()) return pipefilter(s, cmd) -def binary(s): - """return true if a string is binary data""" - return bool(s and '\0' in s) - def increasingchunks(source, min=1024, max=65536): '''return no less than min bytes per chunk while data remains, doubling min after each chunk until it reaches max''' @@ -2571,102 +2554,6 @@ b[0:len(res)] = res return len(res) -def stringmatcher(pattern, casesensitive=True): - """ - accepts a string, possibly starting with 're:' or 'literal:' prefix. - returns the matcher name, pattern, and matcher function. - missing or unknown prefixes are treated as literal matches. - - helper for tests: - >>> def test(pattern, *tests): - ... kind, pattern, matcher = stringmatcher(pattern) - ... return (kind, pattern, [bool(matcher(t)) for t in tests]) - >>> def itest(pattern, *tests): - ... 
kind, pattern, matcher = stringmatcher(pattern, casesensitive=False) - ... return (kind, pattern, [bool(matcher(t)) for t in tests]) - - exact matching (no prefix): - >>> test(b'abcdefg', b'abc', b'def', b'abcdefg') - ('literal', 'abcdefg', [False, False, True]) - - regex matching ('re:' prefix) - >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar') - ('re', 'a.+b', [False, False, True]) - - force exact matches ('literal:' prefix) - >>> test(b'literal:re:foobar', b'foobar', b're:foobar') - ('literal', 're:foobar', [False, True]) - - unknown prefixes are ignored and treated as literals - >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar') - ('literal', 'foo:bar', [False, False, True]) - - case insensitive regex matches - >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar') - ('re', 'A.+b', [False, False, True]) - - case insensitive literal matches - >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg') - ('literal', 'ABCDEFG', [False, False, True]) - """ - if pattern.startswith('re:'): - pattern = pattern[3:] - try: - flags = 0 - if not casesensitive: - flags = remod.I - regex = remod.compile(pattern, flags) - except remod.error as e: - raise error.ParseError(_('invalid regular expression: %s') - % e) - return 're', pattern, regex.search - elif pattern.startswith('literal:'): - pattern = pattern[8:] - - match = pattern.__eq__ - - if not casesensitive: - ipat = encoding.lower(pattern) - match = lambda s: ipat == encoding.lower(s) - return 'literal', pattern, match - -def shortuser(user): - """Return a short representation of a user name or email address.""" - f = user.find('@') - if f >= 0: - user = user[:f] - f = user.find('<') - if f >= 0: - user = user[f + 1:] - f = user.find(' ') - if f >= 0: - user = user[:f] - f = user.find('.') - if f >= 0: - user = user[:f] - return user - -def emailuser(user): - """Return the user portion of an email address.""" - f = user.find('@') - if f >= 0: - user = user[:f] - f = user.find('<') - if f >= 0: - user = user[f + 
1:] - return user - -def email(author): - '''get email of author.''' - r = author.find('>') - if r == -1: - r = None - return author[author.find('<') + 1:r] - -def ellipsis(text, maxlength=400): - """Trim string to at most maxlength (default: 400) columns in display.""" - return encoding.trim(text, maxlength, ellipsis='...') - def unitcountfn(*unittable): '''return a function that renders a readable count of some quantity''' @@ -2751,147 +2638,6 @@ fromnativeeol = pycompat.identity nativeeolwriter = pycompat.identity -def escapestr(s): - # call underlying function of s.encode('string_escape') directly for - # Python 3 compatibility - return codecs.escape_encode(s)[0] - -def unescapestr(s): - return codecs.escape_decode(s)[0] - -def forcebytestr(obj): - """Portably format an arbitrary object (e.g. exception) into a byte - string.""" - try: - return pycompat.bytestr(obj) - except UnicodeEncodeError: - # non-ascii string, may be lossy - return pycompat.bytestr(encoding.strtolocal(str(obj))) - -def uirepr(s): - # Avoid double backslash in Windows path repr() - return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\') - -# delay import of textwrap -def _MBTextWrapper(**kwargs): - class tw(textwrap.TextWrapper): - """ - Extend TextWrapper for width-awareness. - - Neither number of 'bytes' in any encoding nor 'characters' is - appropriate to calculate terminal columns for specified string. - - Original TextWrapper implementation uses built-in 'len()' directly, - so overriding is needed to use width information of each characters. - - In addition, characters classified into 'ambiguous' width are - treated as wide in East Asian area, but as narrow in other. - - This requires use decision to determine width of such characters. 
- """ - def _cutdown(self, ucstr, space_left): - l = 0 - colwidth = encoding.ucolwidth - for i in xrange(len(ucstr)): - l += colwidth(ucstr[i]) - if space_left < l: - return (ucstr[:i], ucstr[i:]) - return ucstr, '' - - # overriding of base class - def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): - space_left = max(width - cur_len, 1) - - if self.break_long_words: - cut, res = self._cutdown(reversed_chunks[-1], space_left) - cur_line.append(cut) - reversed_chunks[-1] = res - elif not cur_line: - cur_line.append(reversed_chunks.pop()) - - # this overriding code is imported from TextWrapper of Python 2.6 - # to calculate columns of string by 'encoding.ucolwidth()' - def _wrap_chunks(self, chunks): - colwidth = encoding.ucolwidth - - lines = [] - if self.width <= 0: - raise ValueError("invalid width %r (must be > 0)" % self.width) - - # Arrange in reverse order so items can be efficiently popped - # from a stack of chucks. - chunks.reverse() - - while chunks: - - # Start the list of chunks that will make up the current line. - # cur_len is just the length of all the chunks in cur_line. - cur_line = [] - cur_len = 0 - - # Figure out which static string will prefix this line. - if lines: - indent = self.subsequent_indent - else: - indent = self.initial_indent - - # Maximum width for this line. - width = self.width - len(indent) - - # First chunk on line is whitespace -- drop it, unless this - # is the very beginning of the text (i.e. no lines started yet). - if self.drop_whitespace and chunks[-1].strip() == r'' and lines: - del chunks[-1] - - while chunks: - l = colwidth(chunks[-1]) - - # Can at least squeeze this chunk onto the current line. - if cur_len + l <= width: - cur_line.append(chunks.pop()) - cur_len += l - - # Nope, this line is full. - else: - break - - # The current line is full, and the next chunk is too big to - # fit on *any* line (not just this one). 
- if chunks and colwidth(chunks[-1]) > width: - self._handle_long_word(chunks, cur_line, cur_len, width) - - # If the last chunk on this line is all whitespace, drop it. - if (self.drop_whitespace and - cur_line and cur_line[-1].strip() == r''): - del cur_line[-1] - - # Convert current line back to a string and store it in list - # of all lines (return value). - if cur_line: - lines.append(indent + r''.join(cur_line)) - - return lines - - global _MBTextWrapper - _MBTextWrapper = tw - return tw(**kwargs) - -def wrap(line, width, initindent='', hangindent=''): - maxindent = max(len(hangindent), len(initindent)) - if width <= maxindent: - # adjust for weird terminal size - width = max(78, maxindent + 1) - line = line.decode(pycompat.sysstr(encoding.encoding), - pycompat.sysstr(encoding.encodingmode)) - initindent = initindent.decode(pycompat.sysstr(encoding.encoding), - pycompat.sysstr(encoding.encodingmode)) - hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding), - pycompat.sysstr(encoding.encodingmode)) - wrapper = _MBTextWrapper(width=width, - initial_indent=initindent, - subsequent_indent=hangindent) - return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding)) - if (pyplatform.python_implementation() == 'CPython' and sys.version_info < (3, 0)): # There is an issue in CPython that some IO methods do not handle EINTR @@ -3064,17 +2810,6 @@ except socket.error: raise Abort(_("no port number associated with service '%s'") % port) -_booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True, - '0': False, 'no': False, 'false': False, 'off': False, - 'never': False} - -def parsebool(s): - """Parse s into a boolean. - - If s is not a valid boolean, returns None. - """ - return _booleans.get(s.lower(), None) - class url(object): r"""Reliable URL parser. 
@@ -4341,3 +4076,19 @@ strdate = _deprecatedfunc(dateutil.strdate, '4.6') parsedate = _deprecatedfunc(dateutil.parsedate, '4.6') matchdate = _deprecatedfunc(dateutil.matchdate, '4.6') + +def _deprecatedfunc(func, version): # TODO + return func +escapedata = _deprecatedfunc(stringutil.escapedata, '4.6') +binary = _deprecatedfunc(stringutil.binary, '4.6') +stringmatcher = _deprecatedfunc(stringutil.stringmatcher, '4.6') +shortuser = _deprecatedfunc(stringutil.shortuser, '4.6') +emailuser = _deprecatedfunc(stringutil.emailuser, '4.6') +email = _deprecatedfunc(stringutil.email, '4.6') +ellipsis = _deprecatedfunc(stringutil.ellipsis, '4.6') +escapestr = _deprecatedfunc(stringutil.escapestr, '4.6') +unescapestr = _deprecatedfunc(stringutil.unescapestr, '4.6') +forcebytestr = _deprecatedfunc(stringutil.forcebytestr, '4.6') +uirepr = _deprecatedfunc(stringutil.uirepr, '4.6') +wrap = _deprecatedfunc(stringutil.wrap, '4.6') +parsebool = _deprecatedfunc(stringutil.parsebool, '4.6')
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/utils/stringutil.py Thu Mar 22 21:19:31 2018 +0900 @@ -0,0 +1,288 @@ +# stringutil.py - utility for generic string formatting, parsing, etc. +# +# Copyright 2005 K. Thananchayan <thananck@yahoo.com> +# Copyright 2005-2007 Matt Mackall <mpm@selenic.com> +# Copyright 2006 Vadim Gelfer <vadim.gelfer@gmail.com> +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import codecs +import re as remod +import textwrap + +from ..i18n import _ + +from .. import ( + encoding, + error, + pycompat, +) + +_DATA_ESCAPE_MAP = {pycompat.bytechr(i): br'\x%02x' % i for i in range(256)} +_DATA_ESCAPE_MAP.update({ + b'\\': b'\\\\', + b'\r': br'\r', + b'\n': br'\n', +}) +_DATA_ESCAPE_RE = remod.compile(br'[\x00-\x08\x0a-\x1f\\\x7f-\xff]') + +def escapedata(s): + if isinstance(s, bytearray): + s = bytes(s) + + return _DATA_ESCAPE_RE.sub(lambda m: _DATA_ESCAPE_MAP[m.group(0)], s) + +def binary(s): + """return true if a string is binary data""" + return bool(s and '\0' in s) + +def stringmatcher(pattern, casesensitive=True): + """ + accepts a string, possibly starting with 're:' or 'literal:' prefix. + returns the matcher name, pattern, and matcher function. + missing or unknown prefixes are treated as literal matches. + + helper for tests: + >>> def test(pattern, *tests): + ... kind, pattern, matcher = stringmatcher(pattern) + ... return (kind, pattern, [bool(matcher(t)) for t in tests]) + >>> def itest(pattern, *tests): + ... kind, pattern, matcher = stringmatcher(pattern, casesensitive=False) + ... 
return (kind, pattern, [bool(matcher(t)) for t in tests]) + + exact matching (no prefix): + >>> test(b'abcdefg', b'abc', b'def', b'abcdefg') + ('literal', 'abcdefg', [False, False, True]) + + regex matching ('re:' prefix) + >>> test(b're:a.+b', b'nomatch', b'fooadef', b'fooadefbar') + ('re', 'a.+b', [False, False, True]) + + force exact matches ('literal:' prefix) + >>> test(b'literal:re:foobar', b'foobar', b're:foobar') + ('literal', 're:foobar', [False, True]) + + unknown prefixes are ignored and treated as literals + >>> test(b'foo:bar', b'foo', b'bar', b'foo:bar') + ('literal', 'foo:bar', [False, False, True]) + + case insensitive regex matches + >>> itest(b're:A.+b', b'nomatch', b'fooadef', b'fooadefBar') + ('re', 'A.+b', [False, False, True]) + + case insensitive literal matches + >>> itest(b'ABCDEFG', b'abc', b'def', b'abcdefg') + ('literal', 'ABCDEFG', [False, False, True]) + """ + if pattern.startswith('re:'): + pattern = pattern[3:] + try: + flags = 0 + if not casesensitive: + flags = remod.I + regex = remod.compile(pattern, flags) + except remod.error as e: + raise error.ParseError(_('invalid regular expression: %s') + % e) + return 're', pattern, regex.search + elif pattern.startswith('literal:'): + pattern = pattern[8:] + + match = pattern.__eq__ + + if not casesensitive: + ipat = encoding.lower(pattern) + match = lambda s: ipat == encoding.lower(s) + return 'literal', pattern, match + +def shortuser(user): + """Return a short representation of a user name or email address.""" + f = user.find('@') + if f >= 0: + user = user[:f] + f = user.find('<') + if f >= 0: + user = user[f + 1:] + f = user.find(' ') + if f >= 0: + user = user[:f] + f = user.find('.') + if f >= 0: + user = user[:f] + return user + +def emailuser(user): + """Return the user portion of an email address.""" + f = user.find('@') + if f >= 0: + user = user[:f] + f = user.find('<') + if f >= 0: + user = user[f + 1:] + return user + +def email(author): + '''get email of author.''' + r = 
author.find('>') + if r == -1: + r = None + return author[author.find('<') + 1:r] + +def ellipsis(text, maxlength=400): + """Trim string to at most maxlength (default: 400) columns in display.""" + return encoding.trim(text, maxlength, ellipsis='...') + +def escapestr(s): + # call underlying function of s.encode('string_escape') directly for + # Python 3 compatibility + return codecs.escape_encode(s)[0] + +def unescapestr(s): + return codecs.escape_decode(s)[0] + +def forcebytestr(obj): + """Portably format an arbitrary object (e.g. exception) into a byte + string.""" + try: + return pycompat.bytestr(obj) + except UnicodeEncodeError: + # non-ascii string, may be lossy + return pycompat.bytestr(encoding.strtolocal(str(obj))) + +def uirepr(s): + # Avoid double backslash in Windows path repr() + return pycompat.byterepr(pycompat.bytestr(s)).replace(b'\\\\', b'\\') + +# delay import of textwrap +def _MBTextWrapper(**kwargs): + class tw(textwrap.TextWrapper): + """ + Extend TextWrapper for width-awareness. + + Neither number of 'bytes' in any encoding nor 'characters' is + appropriate to calculate terminal columns for specified string. + + Original TextWrapper implementation uses built-in 'len()' directly, + so overriding is needed to use width information of each characters. + + In addition, characters classified into 'ambiguous' width are + treated as wide in East Asian area, but as narrow in other. + + This requires use decision to determine width of such characters. 
+ """ + def _cutdown(self, ucstr, space_left): + l = 0 + colwidth = encoding.ucolwidth + for i in xrange(len(ucstr)): + l += colwidth(ucstr[i]) + if space_left < l: + return (ucstr[:i], ucstr[i:]) + return ucstr, '' + + # overriding of base class + def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width): + space_left = max(width - cur_len, 1) + + if self.break_long_words: + cut, res = self._cutdown(reversed_chunks[-1], space_left) + cur_line.append(cut) + reversed_chunks[-1] = res + elif not cur_line: + cur_line.append(reversed_chunks.pop()) + + # this overriding code is imported from TextWrapper of Python 2.6 + # to calculate columns of string by 'encoding.ucolwidth()' + def _wrap_chunks(self, chunks): + colwidth = encoding.ucolwidth + + lines = [] + if self.width <= 0: + raise ValueError("invalid width %r (must be > 0)" % self.width) + + # Arrange in reverse order so items can be efficiently popped + # from a stack of chucks. + chunks.reverse() + + while chunks: + + # Start the list of chunks that will make up the current line. + # cur_len is just the length of all the chunks in cur_line. + cur_line = [] + cur_len = 0 + + # Figure out which static string will prefix this line. + if lines: + indent = self.subsequent_indent + else: + indent = self.initial_indent + + # Maximum width for this line. + width = self.width - len(indent) + + # First chunk on line is whitespace -- drop it, unless this + # is the very beginning of the text (i.e. no lines started yet). + if self.drop_whitespace and chunks[-1].strip() == r'' and lines: + del chunks[-1] + + while chunks: + l = colwidth(chunks[-1]) + + # Can at least squeeze this chunk onto the current line. + if cur_len + l <= width: + cur_line.append(chunks.pop()) + cur_len += l + + # Nope, this line is full. + else: + break + + # The current line is full, and the next chunk is too big to + # fit on *any* line (not just this one). 
+ if chunks and colwidth(chunks[-1]) > width: + self._handle_long_word(chunks, cur_line, cur_len, width) + + # If the last chunk on this line is all whitespace, drop it. + if (self.drop_whitespace and + cur_line and cur_line[-1].strip() == r''): + del cur_line[-1] + + # Convert current line back to a string and store it in list + # of all lines (return value). + if cur_line: + lines.append(indent + r''.join(cur_line)) + + return lines + + global _MBTextWrapper + _MBTextWrapper = tw + return tw(**kwargs) + +def wrap(line, width, initindent='', hangindent=''): + maxindent = max(len(hangindent), len(initindent)) + if width <= maxindent: + # adjust for weird terminal size + width = max(78, maxindent + 1) + line = line.decode(pycompat.sysstr(encoding.encoding), + pycompat.sysstr(encoding.encodingmode)) + initindent = initindent.decode(pycompat.sysstr(encoding.encoding), + pycompat.sysstr(encoding.encodingmode)) + hangindent = hangindent.decode(pycompat.sysstr(encoding.encoding), + pycompat.sysstr(encoding.encodingmode)) + wrapper = _MBTextWrapper(width=width, + initial_indent=initindent, + subsequent_indent=hangindent) + return wrapper.fill(line).encode(pycompat.sysstr(encoding.encoding)) + +_booleans = {'1': True, 'yes': True, 'true': True, 'on': True, 'always': True, + '0': False, 'no': False, 'false': False, 'off': False, + 'never': False} + +def parsebool(s): + """Parse s into a boolean. + + If s is not a valid boolean, returns None. + """ + return _booleans.get(s.lower(), None)
--- a/tests/test-doctest.py Thu Mar 22 21:32:19 2018 +0900 +++ b/tests/test-doctest.py Thu Mar 22 21:19:31 2018 +0900 @@ -70,6 +70,7 @@ testmod('mercurial.url') testmod('mercurial.util') testmod('mercurial.util', testtarget='platform') +testmod('mercurial.utils.stringutil') testmod('hgext.convert.convcmd') testmod('hgext.convert.cvsps') testmod('hgext.convert.filemap')