view hgext/convert/filemap.py @ 25566:15412bba5a68

revset: prefetch all attributes before loop in _revsbetween Python is slow at attributes lookup. No, really, I mean -slow-. prefetching these three methods give use a measurable performance boost. revset #0: 0::tip plain 0) 0.037655 1) 0.034290 91%
author Pierre-Yves David <pierre-yves.david@fb.com>
date Thu, 11 Jun 2015 11:42:46 -0700
parents 3f0744eeaeaf
children d859123e0f47
line wrap: on
line source

# Copyright 2007 Bryan O'Sullivan <bos@serpentine.com>
# Copyright 2007 Alexis S. L. Carvalho <alexis@cecm.usp.br>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import posixpath
import shlex
from mercurial.i18n import _
from mercurial import util, error
from common import SKIPREV, converter_source

def rpairs(path):
    '''Yield tuples with path split at '/', starting with the full path.
    No leading, trailing or double '/', please.
    >>> for x in rpairs('foo/bar/baz'): print x
    ('foo/bar/baz', '')
    ('foo/bar', 'baz')
    ('foo', 'bar/baz')
    ('.', 'foo/bar/baz')
    '''
    i = len(path)
    while i != -1:
        yield path[:i], path[i + 1:]
        i = path.rfind('/', 0, i)
    yield '.', path

def normalize(path):
    ''' We use posixpath.normpath to support cross-platform path format.
    However, it doesn't handle None input. So we wrap it up. '''
    if path is None:
        return None
    return posixpath.normpath(path)

class filemapper(object):
    '''Map and filter filenames when importing.
    A name can be mapped to itself, a new name, or None (omit from new
    repository).'''

    def __init__(self, ui, path=None):
        self.ui = ui
        self.include = {}
        self.exclude = {}
        self.rename = {}
        if path:
            if self.parse(path):
                raise util.Abort(_('errors in filemap'))

    def parse(self, path):
        errs = 0
        def check(name, mapping, listname):
            if not name:
                self.ui.warn(_('%s:%d: path to %s is missing\n') %
                             (lex.infile, lex.lineno, listname))
                return 1
            if name in mapping:
                self.ui.warn(_('%s:%d: %r already in %s list\n') %
                             (lex.infile, lex.lineno, name, listname))
                return 1
            if (name.startswith('/') or
                name.endswith('/') or
                '//' in name):
                self.ui.warn(_('%s:%d: superfluous / in %s %r\n') %
                             (lex.infile, lex.lineno, listname, name))
                return 1
            return 0
        lex = shlex.shlex(open(path), path, True)
        lex.wordchars += '!@#$%^&*()-=+[]{}|;:,./<>?'
        cmd = lex.get_token()
        while cmd:
            if cmd == 'include':
                name = normalize(lex.get_token())
                errs += check(name, self.exclude, 'exclude')
                self.include[name] = name
            elif cmd == 'exclude':
                name = normalize(lex.get_token())
                errs += check(name, self.include, 'include')
                errs += check(name, self.rename, 'rename')
                self.exclude[name] = name
            elif cmd == 'rename':
                src = normalize(lex.get_token())
                dest = normalize(lex.get_token())
                errs += check(src, self.exclude, 'exclude')
                self.rename[src] = dest
            elif cmd == 'source':
                errs += self.parse(normalize(lex.get_token()))
            else:
                self.ui.warn(_('%s:%d: unknown directive %r\n') %
                             (lex.infile, lex.lineno, cmd))
                errs += 1
            cmd = lex.get_token()
        return errs

    def lookup(self, name, mapping):
        name = normalize(name)
        for pre, suf in rpairs(name):
            try:
                return mapping[pre], pre, suf
            except KeyError:
                pass
        return '', name, ''

    def __call__(self, name):
        if self.include:
            inc = self.lookup(name, self.include)[0]
        else:
            inc = name
        if self.exclude:
            exc = self.lookup(name, self.exclude)[0]
        else:
            exc = ''
        if (not self.include and exc) or (len(inc) <= len(exc)):
            return None
        newpre, pre, suf = self.lookup(name, self.rename)
        if newpre:
            if newpre == '.':
                return suf
            if suf:
                if newpre.endswith('/'):
                    return newpre + suf
                return newpre + '/' + suf
            return newpre
        return name

    def active(self):
        return bool(self.include or self.exclude or self.rename)

# This class does two additional things compared to a regular source:
#
# - Filter and rename files.  This is mostly wrapped by the filemapper
#   class above. We hide the original filename in the revision that is
#   returned by getchanges to be able to find things later in getfile.
#
# - Return only revisions that matter for the files we're interested in.
#   This involves rewriting the parents of the original revision to
#   create a graph that is restricted to those revisions.
#
#   This set of revisions includes not only revisions that directly
#   touch files we're interested in, but also merges that merge two
#   or more interesting revisions.

class filemap_source(converter_source):
    def __init__(self, ui, baseconverter, filemap):
        super(filemap_source, self).__init__(ui)
        self.base = baseconverter
        self.filemapper = filemapper(ui, filemap)
        self.commits = {}
        # if a revision rev has parent p in the original revision graph, then
        # rev will have parent self.parentmap[p] in the restricted graph.
        self.parentmap = {}
        # self.wantedancestors[rev] is the set of all ancestors of rev that
        # are in the restricted graph.
        self.wantedancestors = {}
        self.convertedorder = None
        self._rebuilt = False
        self.origparents = {}
        self.children = {}
        self.seenchildren = {}

    def before(self):
        self.base.before()

    def after(self):
        self.base.after()

    def setrevmap(self, revmap):
        # rebuild our state to make things restartable
        #
        # To avoid calling getcommit for every revision that has already
        # been converted, we rebuild only the parentmap, delaying the
        # rebuild of wantedancestors until we need it (i.e. until a
        # merge).
        #
        # We assume the order argument lists the revisions in
        # topological order, so that we can infer which revisions were
        # wanted by previous runs.
        self._rebuilt = not revmap
        seen = {SKIPREV: SKIPREV}
        dummyset = set()
        converted = []
        for rev in revmap.order:
            mapped = revmap[rev]
            wanted = mapped not in seen
            if wanted:
                seen[mapped] = rev
                self.parentmap[rev] = rev
            else:
                self.parentmap[rev] = seen[mapped]
            self.wantedancestors[rev] = dummyset
            arg = seen[mapped]
            if arg == SKIPREV:
                arg = None
            converted.append((rev, wanted, arg))
        self.convertedorder = converted
        return self.base.setrevmap(revmap)

    def rebuild(self):
        if self._rebuilt:
            return True
        self._rebuilt = True
        self.parentmap.clear()
        self.wantedancestors.clear()
        self.seenchildren.clear()
        for rev, wanted, arg in self.convertedorder:
            if rev not in self.origparents:
                try:
                    self.origparents[rev] = self.getcommit(rev).parents
                except error.RepoLookupError:
                    self.ui.debug("unknown revmap source: %s\n" % rev)
                    continue
            if arg is not None:
                self.children[arg] = self.children.get(arg, 0) + 1

        for rev, wanted, arg in self.convertedorder:
            try:
                parents = self.origparents[rev]
            except KeyError:
                continue # unknown revmap source
            if wanted:
                self.mark_wanted(rev, parents)
            else:
                self.mark_not_wanted(rev, arg)
            self._discard(arg, *parents)

        return True

    def getheads(self):
        return self.base.getheads()

    def getcommit(self, rev):
        # We want to save a reference to the commit objects to be able
        # to rewrite their parents later on.
        c = self.commits[rev] = self.base.getcommit(rev)
        for p in c.parents:
            self.children[p] = self.children.get(p, 0) + 1
        return c

    def _cachedcommit(self, rev):
        if rev in self.commits:
            return self.commits[rev]
        return self.base.getcommit(rev)

    def _discard(self, *revs):
        for r in revs:
            if r is None:
                continue
            self.seenchildren[r] = self.seenchildren.get(r, 0) + 1
            if self.seenchildren[r] == self.children[r]:
                self.wantedancestors.pop(r, None)
                self.parentmap.pop(r, None)
                del self.seenchildren[r]
                if self._rebuilt:
                    del self.children[r]

    def wanted(self, rev, i):
        # Return True if we're directly interested in rev.
        #
        # i is an index selecting one of the parents of rev (if rev
        # has no parents, i is None).  getchangedfiles will give us
        # the list of files that are different in rev and in the parent
        # indicated by i.  If we're interested in any of these files,
        # we're interested in rev.
        try:
            files = self.base.getchangedfiles(rev, i)
        except NotImplementedError:
            raise util.Abort(_("source repository doesn't support --filemap"))
        for f in files:
            if self.filemapper(f):
                return True
        return False

    def mark_not_wanted(self, rev, p):
        # Mark rev as not interesting and update data structures.

        if p is None:
            # A root revision. Use SKIPREV to indicate that it doesn't
            # map to any revision in the restricted graph.  Put SKIPREV
            # in the set of wanted ancestors to simplify code elsewhere
            self.parentmap[rev] = SKIPREV
            self.wantedancestors[rev] = set((SKIPREV,))
            return

        # Reuse the data from our parent.
        self.parentmap[rev] = self.parentmap[p]
        self.wantedancestors[rev] = self.wantedancestors[p]

    def mark_wanted(self, rev, parents):
        # Mark rev ss wanted and update data structures.

        # rev will be in the restricted graph, so children of rev in
        # the original graph should still have rev as a parent in the
        # restricted graph.
        self.parentmap[rev] = rev

        # The set of wanted ancestors of rev is the union of the sets
        # of wanted ancestors of its parents. Plus rev itself.
        wrev = set()
        for p in parents:
            if p in self.wantedancestors:
                wrev.update(self.wantedancestors[p])
            else:
                self.ui.warn(_('warning: %s parent %s is missing\n') %
                             (rev, p))
        wrev.add(rev)
        self.wantedancestors[rev] = wrev

    def getchanges(self, rev, full):
        parents = self.commits[rev].parents
        if len(parents) > 1:
            self.rebuild()

        # To decide whether we're interested in rev we:
        #
        # - calculate what parents rev will have if it turns out we're
        #   interested in it.  If it's going to have more than 1 parent,
        #   we're interested in it.
        #
        # - otherwise, we'll compare it with the single parent we found.
        #   If any of the files we're interested in is different in the
        #   the two revisions, we're interested in rev.

        # A parent p is interesting if its mapped version (self.parentmap[p]):
        # - is not SKIPREV
        # - is still not in the list of parents (we don't want duplicates)
        # - is not an ancestor of the mapped versions of the other parents or
        #   there is no parent in the same branch than the current revision.
        mparents = []
        knownparents = set()
        branch = self.commits[rev].branch
        hasbranchparent = False
        for i, p1 in enumerate(parents):
            mp1 = self.parentmap[p1]
            if mp1 == SKIPREV or mp1 in knownparents:
                continue
            isancestor = any(p2 for p2 in parents
                                  if p1 != p2 and mp1 != self.parentmap[p2]
                                  and mp1 in self.wantedancestors[p2])
            if not isancestor and not hasbranchparent and len(parents) > 1:
                # This could be expensive, avoid unnecessary calls.
                if self._cachedcommit(p1).branch == branch:
                    hasbranchparent = True
            mparents.append((p1, mp1, i, isancestor))
            knownparents.add(mp1)
        # Discard parents ancestors of other parents if there is a
        # non-ancestor one on the same branch than current revision.
        if hasbranchparent:
            mparents = [p for p in mparents if not p[3]]
        wp = None
        if mparents:
            wp = max(p[2] for p in mparents)
            mparents = [p[1] for p in mparents]
        elif parents:
            wp = 0

        self.origparents[rev] = parents

        closed = False
        if 'close' in self.commits[rev].extra:
            # A branch closing revision is only useful if one of its
            # parents belong to the branch being closed
            pbranches = [self._cachedcommit(p).branch for p in mparents]
            if branch in pbranches:
                closed = True

        if len(mparents) < 2 and not closed and not self.wanted(rev, wp):
            # We don't want this revision.
            # Update our state and tell the convert process to map this
            # revision to the same revision its parent as mapped to.
            p = None
            if parents:
                p = parents[wp]
            self.mark_not_wanted(rev, p)
            self.convertedorder.append((rev, False, p))
            self._discard(*parents)
            return self.parentmap[rev]

        # We want this revision.
        # Rewrite the parents of the commit object
        self.commits[rev].parents = mparents
        self.mark_wanted(rev, parents)
        self.convertedorder.append((rev, True, None))
        self._discard(*parents)

        # Get the real changes and do the filtering/mapping. To be
        # able to get the files later on in getfile, we hide the
        # original filename in the rev part of the return value.
        changes, copies, cleanp2 = self.base.getchanges(rev, full)
        files = {}
        ncleanp2 = set(cleanp2)
        for f, r in changes:
            newf = self.filemapper(f)
            if newf and (newf != f or newf not in files):
                files[newf] = (f, r)
                if newf != f:
                    ncleanp2.discard(f)
        files = sorted(files.items())

        ncopies = {}
        for c in copies:
            newc = self.filemapper(c)
            if newc:
                newsource = self.filemapper(copies[c])
                if newsource:
                    ncopies[newc] = newsource

        return files, ncopies, ncleanp2

    def getfile(self, name, rev):
        realname, realrev = rev
        return self.base.getfile(realname, realrev)

    def gettags(self):
        return self.base.gettags()

    def hasnativeorder(self):
        return self.base.hasnativeorder()

    def lookuprev(self, rev):
        return self.base.lookuprev(rev)

    def getbookmarks(self):
        return self.base.getbookmarks()

    def converted(self, rev, sinkrev):
        self.base.converted(rev, sinkrev)