mercurial/similar.py
author Angel Ezquerra <angel.ezquerra@gmail.com>
Thu, 13 Dec 2012 23:37:53 +0100
changeset 18109 9e3910db4e78
parent 16683 525fdb738975
child 27359 a56c47ed3885
permissions -rw-r--r--
subrepo: append subrepo path to subrepo error messages This change appends the subrepo path to subrepo errors. That is, when there is an error performing an operation a subrepo, rather than displaying a message such as: pushing subrepo MYSUBREPO to PATH searching for changes abort: push creates new remote head HEADHASH! hint: did you forget to merge? use push -f to force mercurial will show: pushing subrepo MYSUBREPO to PATH searching for changes abort: push creates new remote head HEADHASH! (in subrepo MYSUBREPO) hint: did you forget to merge? use push -f to force The rationale for this change is that the current error messages make it hard for TortoiseHg (and similar tools) to tell the user which subrepo caused the push failure. The "(in subrepo MYSUBREPO)" message has been added to those subrepo methods were it made sense (by using a decorator). We avoid appending "(in subrepo XXX)" multiple times when subrepos are nexted by throwing a "SubrepoAbort" exception after the extra message is appended. The decorator will then "ignore" (i.e. just re-raise) the exception and never add the message again. A small drawback of this method is that part of the exception trace is lost when the exception is catched and re-raised by the annotatesubrepoerror decorator. Also, because the state() function already printed the subrepo path when it threw an error, that error has been changed to avoid duplicating the subrepo path in the error message. Note that I have also updated several subrepo related tests to reflect these changes.

# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from i18n import _
import util
import mdiff
import bdiff

def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.
    '''
    numfiles = len(added) + len(removed)

    # Get hashes of removed files.
    hashes = {}
    for i, fctx in enumerate(removed):
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
        h = util.sha1(fctx.data()).digest()
        hashes[h] = fctx

    # For each added file, see if it corresponds to a removed file.
    for i, fctx in enumerate(added):
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
                total=numfiles)
        h = util.sha1(fctx.data()).digest()
        if h in hashes:
            yield (hashes[h], fctx)

    # Done
    repo.ui.progress(_('searching for exact renames'), None)

def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.
    '''
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(_('searching for similar files'), i,
                         total=len(removed))

        # lazily load text
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def score(text):
            orig, lines = data()
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            matches = bdiff.blocks(text, orig)
            for x1, x2, y1, y2 in matches:
                for line in lines[y1:y2]:
                    equal += len(line)

            lengths = len(text) + len(orig)
            return equal * 2.0 / lengths

        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = score(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(_('searching'), None)

    for dest, v in copies.iteritems():
        source, score = v
        yield source, dest, score

def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples'''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = set([workingctx[fp] for fp in added
            if workingctx[fp].size() > 0])
    removedfiles = set([parentctx[fp] for fp in removed
            if fp in parentctx and parentctx[fp].size() > 0])

    # Find exact matches.
    for (a, b) in _findexactmatches(repo,
            sorted(addedfiles), sorted(removedfiles)):
        addedfiles.remove(b)
        yield (a.path(), b.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        for (a, b, score) in _findsimilarmatches(repo,
                sorted(addedfiles), sorted(removedfiles), threshold):
            yield (a.path(), b.path(), score)