mercurial/match.py
author Durham Goode <durham@fb.com>
Mon, 18 May 2015 16:27:56 -0700
changeset 25214 08703b10c3ae
parent 25213 08a8e9da0ae7
child 25215 4040e06e9b99
permissions -rw-r--r--
match: add optional warn argument Occasionally the matcher will want to print warning messages instead of throwing exceptions (like if it encounters a bad syntax parameter when parsing files). Let's add an optional warn argument that can provide this. The next patch will actually use this argument.

# match.py - filename matching
#
#  Copyright 2008, 2009 Matt Mackall <mpm@selenic.com> and others
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import re
import util, pathutil
from i18n import _

propertycache = util.propertycache

def _rematcher(regex):
    '''Compile regex with util.re and return a callable that matches a
    string against it.

    The compiled object's test_match attribute is returned when it exists,
    otherwise its regular match method is used.'''
    compiled = util.re.compile(regex)
    try:
        # slightly faster, provided by facebook's re2 bindings
        fastmatch = compiled.test_match
    except AttributeError:
        return compiled.match
    else:
        return fastmatch

def _expandsets(kindpats, ctx, listsubrepos):
    '''Split kindpats into the files selected by 'set' patterns and the rest.

    Returns a (fset, other) pair: fset is the set of file names produced by
    evaluating every 'set' pattern against ctx (and, if listsubrepos is true,
    against each subrepo of ctx, prefixed with the subrepo path); other is
    the list of remaining (kind, pat, source) tuples in their original order.

    Aborts if a 'set' pattern is present but no ctx was given.'''
    expanded = set()
    remaining = []

    for kind, pat, source in kindpats:
        if kind != 'set':
            remaining.append((kind, pat, source))
            continue

        if not ctx:
            raise util.Abort("fileset expression with no context")
        expanded.update(ctx.getfileset(pat))

        if not listsubrepos:
            continue
        for subpath in ctx.substate:
            subfiles = ctx.sub(subpath).getfileset(pat)
            expanded.update(subpath + '/' + f for f in subfiles)

    return expanded, remaining

def _kindpatsalwaysmatch(kindpats):
    """Check whether the kindpats match everything, as e.g.
    'relpath:.' does.

    That is the case when every entry has an empty pattern and is of kind
    'relpath' or 'glob' (an empty kindpats list therefore qualifies too).
    """
    return all(pat == '' and kind in ('relpath', 'glob')
               for kind, pat, source in kindpats)

class match(object):
    def __init__(self, root, cwd, patterns, include=[], exclude=[],
                 default='glob', exact=False, auditor=None, ctx=None,
                 listsubrepos=False, warn=None):
        """build an object to match a set of file patterns

        arguments:
        root - the canonical root of the tree you're matching against
        cwd - the current working directory, if relevant
        patterns - patterns to find
        include - patterns to include (unless they are excluded)
        exclude - patterns to exclude (even if they are included)
        default - if a pattern in patterns has no explicit type, assume this one
        exact - patterns are actually filenames (include/exclude still apply)
        auditor - handed to pathutil.canonpath when normalizing 'glob' and
                  'relpath' patterns
        ctx - context used to evaluate 'set:' patterns (required if any
              fileset pattern is given)
        listsubrepos - also evaluate 'set:' patterns in the subrepos of ctx
        warn - optional function used for printing warnings

        a pattern is one of:
        'glob:<glob>' - a glob relative to cwd
        're:<regexp>' - a regular expression
        'path:<path>' - a path relative to repository root
        'relglob:<glob>' - an unrooted glob (*.c matches C files in all dirs)
        'relpath:<path>' - a path relative to cwd
        'relre:<regexp>' - a regexp that needn't match the start of a name
        'set:<fileset>' - a fileset expression
        'listfile:<path>' - a file listing one pattern per line
        'listfile0:<path>' - a file listing NUL-separated patterns
        '<something>' - a pattern of the specified default type
        """

        self._root = root
        self._cwd = cwd
        self._files = [] # exact files and roots of patterns
        # include/exclude always count as patterns
        self._anypats = bool(include or exclude)
        self._always = False
        # consulted by uipath() to choose between cwd- and root-relative paths
        self._pathrestricted = bool(include or exclude or patterns)
        # only stored here; nothing in this class calls it
        self._warn = warn

        # Every function collected here must accept a file name for the
        # matcher as a whole to accept it.
        matchfns = []
        if include:
            # include/exclude patterns always default to 'glob'; their glob
            # regexps get the '(?:/|$)' suffix
            kindpats = self._normalize(include, 'glob', root, cwd, auditor)
            self.includepat, im = _buildmatch(ctx, kindpats, '(?:/|$)',
                                              listsubrepos)
            matchfns.append(im)
        if exclude:
            kindpats = self._normalize(exclude, 'glob', root, cwd, auditor)
            self.excludepat, em = _buildmatch(ctx, kindpats, '(?:/|$)',
                                              listsubrepos)
            # a file passes the exclude test when the exclude matcher rejects
            # it
            matchfns.append(lambda f: not em(f))
        if exact:
            # patterns are plain file names: keep a list as-is (no copy),
            # materialize any other iterable
            if isinstance(patterns, list):
                self._files = patterns
            else:
                self._files = list(patterns)
            matchfns.append(self.exact)
        elif patterns:
            kindpats = self._normalize(patterns, default, root, cwd, auditor)
            # patterns that match everything add no match function, which
            # can leave matchfns empty and turn this into an always-matcher
            if not _kindpatsalwaysmatch(kindpats):
                self._files = _roots(kindpats)
                self._anypats = self._anypats or _anypats(kindpats)
                # glob regexps for explicit patterns get the '$' suffix
                self.patternspat, pm = _buildmatch(ctx, kindpats, '$',
                                                   listsubrepos)
                matchfns.append(pm)

        if not matchfns:
            m = util.always
            self._always = True
        elif len(matchfns) == 1:
            # avoid the wrapper below when there is only one test
            m = matchfns[0]
        else:
            def m(f):
                for matchfn in matchfns:
                    if not matchfn(f):
                        return False
                return True

        self.matchfn = m
        # set form of _files, used by exact(), visitdir() and _dirs
        self._fileroots = set(self._files)

    def __call__(self, fn):
        '''Return the result of the match function for file name fn.'''
        return self.matchfn(fn)
    def __iter__(self):
        '''Iterate over the entries of .files().'''
        for f in self._files:
            yield f

    # Callbacks related to how the matcher is used by dirstate.walk.
    # Subscribers to these events must monkeypatch the matcher object.
    def bad(self, f, msg):
        '''Callback from dirstate.walk for each explicit file that can't be
        found/accessed, with an error message.'''
        pass

    # If an explicitdir is set, it will be called when an explicitly listed
    # directory is visited.
    explicitdir = None

    # If a traversedir is set, it will be called when a directory discovered
    # by recursive traversal is visited.
    traversedir = None

    def abs(self, f):
        '''Convert a repo path back to path that is relative to the root of the
        matcher.'''
        return f

    def rel(self, f):
        '''Convert repo path back to path that is relative to cwd of matcher.'''
        return util.pathto(self._root, self._cwd, f)

    def uipath(self, f):
        '''Convert repo path to a display path.  If patterns or -I/-X were used
        to create this matcher, the display path will be relative to cwd.
        Otherwise it is relative to the root of the repo.'''
        # NOTE: also falls back to abs(f) when rel(f) is a false value
        return (self._pathrestricted and self.rel(f)) or self.abs(f)

    def files(self):
        '''Explicitly listed files or patterns or roots:
        if no patterns or .always(): empty list,
        if exact: list exact files,
        if not .anypats(): list all files and dirs,
        else: optimal roots'''
        return self._files

    @propertycache
    def _dirs(self):
        '''Set built from util.dirs() of the file roots, plus '.'.'''
        return set(util.dirs(self._fileroots)) | set(['.'])

    def visitdir(self, dir):
        '''Return a true value if directory dir should be visited.

        That is the case when there are no file roots, when '.' is a root,
        when dir is itself a root or is in _dirs, or when one of the
        directories yielded by util.finddirs(dir) (presumably the parents of
        dir - confirm against util) is a root.'''
        return (not self._fileroots or '.' in self._fileroots or
                dir in self._fileroots or dir in self._dirs or
                any(parentdir in self._fileroots
                    for parentdir in util.finddirs(dir)))

    def exact(self, f):
        '''Returns True if f is in .files().'''
        return f in self._fileroots

    def anypats(self):
        '''Matcher uses patterns or include/exclude.'''
        return self._anypats

    def always(self):
        '''Matcher will match everything and .files() will be empty
        - optimization might be possible and necessary.'''
        return self._always

    def ispartial(self):
        '''True if the matcher won't always match.

        Although it's just the inverse of _always in this implementation,
        an extension such as narrowhg might make it return something
        slightly different.'''
        return not self._always

    def isexact(self):
        '''True if the match function is this matcher's own exact() method,
        i.e. the matcher was built with exact=True and no include/exclude.'''
        return self.matchfn == self.exact

    def _normalize(self, patterns, default, root, cwd, auditor):
        '''Convert 'kind:pat' from the patterns list to tuples with kind and
        normalized and rooted patterns and with listfiles expanded.

        Returns a list of (kind, pat, source) tuples, where source is the
        path of the listfile the pattern was read from, or '' for patterns
        given directly.  Aborts if a listfile cannot be read.'''
        kindpats = []
        for kind, pat in [_patsplit(p, default) for p in patterns]:
            if kind in ('glob', 'relpath'):
                # cwd-relative kinds are rewritten by pathutil.canonpath
                pat = pathutil.canonpath(root, cwd, pat, auditor)
            elif kind in ('relglob', 'path'):
                pat = util.normpath(pat)
            elif kind in ('listfile', 'listfile0'):
                try:
                    files = util.readfile(pat)
                    # listfile0 entries are NUL-separated, listfile entries
                    # are one per line
                    if kind == 'listfile0':
                        files = files.split('\0')
                    else:
                        files = files.splitlines()
                    # drop empty entries
                    files = [f for f in files if f]
                except EnvironmentError:
                    raise util.Abort(_("unable to read file list (%s)") % pat)
                # normalize the listed patterns recursively and record the
                # listfile path as their source
                for k, p, source in self._normalize(files, default, root, cwd,
                                                    auditor):
                    kindpats.append((k, p, pat))
                continue
            # else: re or relre - which cannot be normalized
            kindpats.append((kind, pat, ''))
        return kindpats

def exact(root, cwd, files):
    '''Return a matcher that treats files as exact file names rather than
    patterns.'''
    exactmatcher = match(root, cwd, files, exact=True)
    return exactmatcher

def always(root, cwd):
    '''Return a matcher built from an empty pattern list.'''
    nopatterns = []
    return match(root, cwd, nopatterns)

class narrowmatcher(match):
    """Restrict an existing matcher to a single subdirectory.

    File names given to this matcher are relative to the subdirectory; they
    are prefixed with the subdirectory path before being handed to the
    wrapped matcher, and the wrapped matcher's files are exposed with that
    prefix stripped:

    >>> m1 = match('root', '', ['a.txt', 'sub/b.txt'])
    >>> m2 = narrowmatcher('sub', m1)
    >>> bool(m2('a.txt'))
    False
    >>> bool(m2('b.txt'))
    True
    >>> bool(m2.matchfn('a.txt'))
    False
    >>> bool(m2.matchfn('b.txt'))
    True
    >>> m2.files()
    ['b.txt']
    >>> m2.exact('b.txt')
    True
    >>> util.pconvert(m2.rel('b.txt'))
    'sub/b.txt'
    >>> def bad(f, msg):
    ...     print "%s: %s" % (f, msg)
    >>> m1.bad = bad
    >>> m2.bad('x.txt', 'No such file')
    sub/x.txt: No such file
    >>> m2.abs('c.txt')
    'sub/c.txt'
    """

    def __init__(self, path, matcher):
        # State copied verbatim from the wrapped matcher.
        self._matcher = matcher
        self._path = path
        self._root = matcher._root
        self._cwd = matcher._cwd
        self._pathrestricted = matcher._pathrestricted
        self._anypats = matcher._anypats

        # Keep only the parent's files that live below path, made relative
        # to path.
        prefix = path + "/"
        skip = len(path) + 1
        self._files = [f[skip:] for f in matcher._files
                       if f.startswith(prefix)]
        self._fileroots = set(self._files)

        # If the parent repo had a path to this subrepo and no patterns are
        # specified, this submatcher always matches.
        alwaysmatches = matcher._always
        if not alwaysmatches and not matcher._anypats:
            alwaysmatches = path in matcher._files
        self._always = alwaysmatches

        def matchinparent(fn):
            return matcher.matchfn(self._path + "/" + fn)
        self.matchfn = matchinparent

    def abs(self, f):
        '''Delegate to the wrapped matcher with the subdirectory prepended.'''
        parentpath = self._path + "/" + f
        return self._matcher.abs(parentpath)

    def bad(self, f, msg):
        '''Forward the callback to the wrapped matcher with the subdirectory
        prepended to f.'''
        parentpath = self._path + "/" + f
        self._matcher.bad(parentpath, msg)

    def rel(self, f):
        '''Delegate to the wrapped matcher with the subdirectory prepended.'''
        parentpath = self._path + "/" + f
        return self._matcher.rel(parentpath)

class icasefsmatcher(match):
    """Matcher for the working directory on case insensitive filesystems.

    The given patterns are passed through the dirstate's normalize function
    so that they use the case found in the filesystem.
    """

    def __init__(self, root, cwd, patterns, include, exclude, default, auditor,
                 ctx, listsubrepos=False):
        # Must be set before the base constructor runs, because that calls
        # our _normalize override.
        self._dsnormalize = ctx.repo().dirstate.normalize

        super(icasefsmatcher, self).__init__(root, cwd, patterns, include,
                                             exclude, default,
                                             auditor=auditor, ctx=ctx,
                                             listsubrepos=listsubrepos)

        # m.exact(file) must be based off of the actual user input, otherwise
        # inexact case matches are treated as exact, and not noted without -v.
        if not self._files:
            return
        self._fileroots = set(_roots(self._kp))

    def _normalize(self, patterns, default, root, cwd, auditor):
        '''Normalize patterns like the base class, then case-normalize every
        non-regex pattern.  The base class result is remembered in self._kp.'''
        base = super(icasefsmatcher, self)
        self._kp = base._normalize(patterns, default, root, cwd, auditor)

        casefixed = []
        for kind, pat, source in self._kp:
            if kind in ('re', 'relre'):
                # regex can't be normalized
                casefixed.append((kind, pat, source))
            else:
                casefixed.append((kind, self._dsnormalize(pat), source))
        return casefixed

def patkind(pattern, default=None):
    '''Return the kind of pattern if it has the form 'kind:pat' with a known
    kind, otherwise return default.'''
    kind, unusedpat = _patsplit(pattern, default)
    return kind

def _patsplit(pattern, default):
    """Return a (kind, pat) pair for the given pattern string.

    If the text before the first ':' is a known pattern kind, it is split off
    and returned together with the remainder. Otherwise the pattern is
    returned unchanged, paired with default."""
    knownkinds = ('re', 'glob', 'path', 'relglob', 'relpath', 'relre',
                  'listfile', 'listfile0', 'set')
    kind, colon, pat = pattern.partition(':')
    if colon and kind in knownkinds:
        return kind, pat
    return default, pattern

def _globre(pat):
    r'''Convert an extended glob string to a regexp string.

    >>> print _globre(r'?')
    .
    >>> print _globre(r'*')
    [^/]*
    >>> print _globre(r'**')
    .*
    >>> print _globre(r'**/a')
    (?:.*/)?a
    >>> print _globre(r'a/**/b')
    a\/(?:.*/)?b
    >>> print _globre(r'[a*?!^][^b][!c]')
    [a*?!^][\^b][^c]
    >>> print _globre(r'{a,b}')
    (?:a|b)
    >>> print _globre(r'.\*\?')
    \.\*\?
    '''
    # i is the index of the next unread character of pat
    i, n = 0, len(pat)
    res = ''
    # number of '{' groups currently open
    group = 0
    escape = util.re.escape
    def peek():
        # next unread character, or False at the end of pat
        return i < n and pat[i]
    while i < n:
        c = pat[i]
        i += 1
        if c not in '*?[{},\\':
            # ordinary character: match it literally
            res += escape(c)
        elif c == '*':
            if peek() == '*':
                i += 1
                if peek() == '/':
                    # '**/' matches any number of leading directories,
                    # including none
                    i += 1
                    res += '(?:.*/)?'
                else:
                    # '**' matches anything, including '/'
                    res += '.*'
            else:
                # a single '*' does not cross a '/'
                res += '[^/]*'
        elif c == '?':
            res += '.'
        elif c == '[':
            # look for the closing ']'; a '!' or ']' directly after the '['
            # belongs to the class and cannot end it
            j = i
            if j < n and pat[j] in '!]':
                j += 1
            while j < n and pat[j] != ']':
                j += 1
            if j >= n:
                # no closing bracket: treat '[' as a literal character
                res += '\\['
            else:
                stuff = pat[i:j].replace('\\','\\\\')
                i = j + 1
                if stuff[0] == '!':
                    # glob negation '!' becomes regexp negation '^'
                    stuff = '^' + stuff[1:]
                elif stuff[0] == '^':
                    # a leading '^' is literal in a glob, so escape it
                    stuff = '\\' + stuff
                res = '%s[%s]' % (res, stuff)
        elif c == '{':
            group += 1
            res += '(?:'
        elif c == '}' and group:
            res += ')'
            group -= 1
        elif c == ',' and group:
            # ',' separates alternatives only inside a '{...}' group
            res += '|'
        elif c == '\\':
            # a backslash escapes the following character; a trailing
            # backslash is matched literally
            p = peek()
            if p:
                i += 1
                res += escape(p)
            else:
                res += escape(c)
        else:
            # '}' or ',' outside of any group: literal
            res += escape(c)
    return res

def _regex(kind, pat, globsuffix):
    '''Return the regular expression string for a (normalized) pattern.

    An empty pattern yields an empty regexp. 're' patterns are returned
    as-is, 'relre' patterns get a leading '.*' unless they start with '^',
    'path' and 'relpath' patterns are escaped and must end at a '/' or at
    the end of the name, and every other kind is treated as a glob with
    globsuffix appended to the translated glob.'''
    if not pat:
        return ''

    # regular expression kinds
    if kind == 're':
        return pat
    if kind == 'relre':
        if pat.startswith('^'):
            return pat
        return '.*' + pat

    # literal path kinds
    if kind in ('path', 'relpath'):
        literal = util.re.escape(pat) + '(?:/|$)'
        if kind == 'path':
            return '^' + literal
        return literal

    # glob kinds
    globbed = _globre(pat) + globsuffix
    if kind == 'relglob':
        return '(?:|.*/)' + globbed
    return globbed

def _buildmatch(ctx, kindpats, globsuffix, listsubrepos):
    '''Build a matcher for kindpats and return it with its regexp string.

    'set' patterns are expanded into a set of file names first; the other
    patterns are compiled into one regexp (globsuffix is appended to the
    regexp of globs). The returned function accepts a name that is in the
    expanded set or that the regexp matches. If only 'set' patterns were
    given, the regexp string is empty.'''
    fileset, regexpats = _expandsets(kindpats, ctx, listsubrepos)
    if not regexpats:
        return "", fileset.__contains__

    regex, regexmatch = _buildregexmatch(regexpats, globsuffix)
    if not fileset:
        return regex, regexmatch
    return regex, lambda f: f in fileset or regexmatch(f)

def _buildregexmatch(kindpats, globsuffix):
    """Combine kindpats into a single regexp and compile it.

    Returns the regexp string and a matcher function. If the combined
    regexp is too large, the pattern list is halved recursively and the
    partial matchers are or-ed together. An invalid pattern aborts with a
    message naming the offending pattern (and its source, if known)."""
    try:
        alternatives = [_regex(k, p, globsuffix) for (k, p, s) in kindpats]
        regex = '(?:%s)' % '|'.join(alternatives)
        if len(regex) > 20000:
            raise OverflowError
        return regex, _rematcher(regex)
    except OverflowError:
        # We're using a Python with a tiny regex engine and we
        # made it explode, so we'll divide the pattern list in two
        # until it works
        count = len(kindpats)
        if count < 2:
            raise
        half = count // 2
        a = _buildregexmatch(kindpats[:half], globsuffix)[1]
        b = _buildregexmatch(kindpats[half:], globsuffix)[1]
        return regex, lambda s: a(s) or b(s)
    except re.error:
        # compile the patterns one by one to find out which one is broken
        for k, p, s in kindpats:
            try:
                _rematcher('(?:%s)' % _regex(k, p, globsuffix))
            except re.error:
                if not s:
                    raise util.Abort(_("invalid pattern (%s): %s") % (k, p))
                raise util.Abort(_("%s: invalid pattern (%s): %s") %
                                 (s, k, p))
        raise util.Abort(_("invalid pattern"))

def _roots(kindpats):
    '''Return one root per pattern: the directory or file below which the
    pattern can match, or '.' if it cannot be narrowed down.

    >>> _roots([('glob', 'g/*', ''), ('glob', 'g', ''), ('glob', 'g*', '')])
    ['g', 'g', '.']
    >>> _roots([('relpath', 'r', ''), ('path', 'p/p', ''), ('path', '', '')])
    ['r', 'p/p', '.']
    >>> _roots([('relglob', 'rg*', ''), ('re', 're/', ''), ('relre', 'rr', '')])
    ['.', '.', '.']
    '''
    roots = []
    for kind, pat, source in kindpats:
        if kind in ('relpath', 'path'):
            roots.append(pat or '.')
        elif kind == 'glob':
            # keep the leading path components that contain no glob syntax
            prefix = []
            for component in pat.split('/'):
                if any(special in component for special in '[{*?'):
                    break
                prefix.append(component)
            roots.append('/'.join(prefix) or '.')
        else:
            # relglob, re, relre
            roots.append('.')
    return roots

def _anypats(kindpats):
    '''Return True if any of the kindpats is a real pattern (glob, regexp
    or fileset) rather than a plain path, and False otherwise.

    A boolean is always returned, so callers that store the result never
    end up with None.'''
    return any(kind in ('glob', 're', 'relglob', 'relre', 'set')
               for kind, pat, source in kindpats)

# Lazily compiled by readpatternfile() the first time a line contains '#'.
_commentre = None

def readpatternfile(filepath, warn):
    '''Parse a pattern file and return the list of patterns it contains.

    Each returned pattern is prefixed with its kind ('relre:' or
    'relglob:'). The kind comes from the most recent valid 'syntax: NAME'
    line (NAME being 're', 'regexp' or 'glob'; 'relre:' is used before any
    such line), unless the pattern line itself starts with a 're:',
    'regexp:', 'glob:', 'relre:' or 'relglob:' prefix, which then applies to
    that line only.

    Comments start at an unescaped '#'; '\\#' stands for a literal '#'.
    Trailing whitespace is stripped and empty lines are skipped.

    warn, if it is a true value, is called with a message for every
    'syntax:' line naming an unknown syntax; such lines are ignored.

    The file is closed even if reading it or calling warn raises.'''
    global _commentre
    syntaxes = {'re': 'relre:', 'regexp': 'relre:', 'glob': 'relglob:'}
    syntax = 'relre:'
    patterns = []

    fp = open(filepath)
    try:
        for line in fp:
            if "#" in line:
                if not _commentre:
                    _commentre = re.compile(r'((^|[^\\])(\\\\)*)#.*')
                # remove comments prefixed by an even number of escapes
                line = _commentre.sub(r'\1', line)
                # fixup properly escaped comments that survived the above
                line = line.replace("\\#", "#")
            line = line.rstrip()
            if not line:
                continue

            if line.startswith('syntax:'):
                s = line[7:].strip()
                try:
                    syntax = syntaxes[s]
                except KeyError:
                    if warn:
                        warn(_("%s: ignoring invalid syntax '%s'\n") %
                             (filepath, s))
                continue

            linesyntax = syntax
            # No recognized prefix is a prefix of another one, so the
            # iteration order of the dict does not matter here. items()
            # (rather than iteritems()) works on Python 2 and 3 alike.
            for s, rels in syntaxes.items():
                if line.startswith(rels):
                    linesyntax = rels
                    line = line[len(rels):]
                    break
                elif line.startswith(s+':'):
                    linesyntax = rels
                    line = line[len(s) + 1:]
                    break
            patterns.append(linesyntax + line)
    finally:
        fp.close()
    return patterns