mercurial/similar.py
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
Sat, 24 Dec 2011 19:05:35 +0900
branchstable
changeset 15723 1581da01d5c4
parent 11085 0c8646292ca4
child 16683 525fdb738975
permissions -rw-r--r--
windows: use normalized path as path to subrepo path to subrepo is used to identify or check location of subrepo. it should be normalized (in "/" delimiter form), because it is also used with narrowmatcher which uses only normalized path even on Windows environment. this patch applies "util.pconvert()" on path to subrepo (called "subpath") to normalize it. for this patch, referers of below were checked. - subrepo.state() - subrepo.itersubrepos() - subrepo.subrepo() - context.sub() - context.substate() typical usecase is: for subpath in ctx.substate: sub = ctx.sub(subpath) ... ctx.substate[subpath] .... in this case, normalization has no side effect, because keys given from substate are used as key itself. other cases shown below also seem to require subpath to be normalized. - path components are joined by "/", in "commands.forget()": for subpath in ctx.substate: subforget[subpath + '/' + fsub] = (fsub, sub) - normalized "file" is used to check below condition, in "commands.revert()", "localrepository.commit()", and "localrepository._checknested()" file in ctx.substate - substate.keys() is passed to dirstate.walk()/status() which use only normalized pathes

# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from i18n import _
import util
import mdiff
import bdiff

def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.
    '''
    numfiles = len(added) + len(removed)

    # Get hashes of removed files.
    hashes = {}
    for i, fctx in enumerate(removed):
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
        h = util.sha1(fctx.data()).digest()
        hashes[h] = fctx

    # For each added file, see if it corresponds to a removed file.
    for i, fctx in enumerate(added):
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
                total=numfiles)
        h = util.sha1(fctx.data()).digest()
        if h in hashes:
            yield (hashes[h], fctx)

    # Done
    repo.ui.progress(_('searching for exact renames'), None)

def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.
    '''
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(_('searching for similar files'), i, total=len(removed))

        # lazily load text
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def score(text):
            orig, lines = data()
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            matches = bdiff.blocks(text, orig)
            for x1, x2, y1, y2 in matches:
                for line in lines[y1:y2]:
                    equal += len(line)

            lengths = len(text) + len(orig)
            return equal * 2.0 / lengths

        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = score(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(_('searching'), None)

    for dest, v in copies.iteritems():
        source, score = v
        yield source, dest, score

def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples'''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = set([workingctx[fp] for fp in added
            if workingctx[fp].size() > 0])
    removedfiles = set([parentctx[fp] for fp in removed
            if fp in parentctx and parentctx[fp].size() > 0])

    # Find exact matches.
    for (a, b) in _findexactmatches(repo,
            sorted(addedfiles), sorted(removedfiles)):
        addedfiles.remove(b)
        yield (a.path(), b.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        for (a, b, score) in _findsimilarmatches(repo,
                sorted(addedfiles), sorted(removedfiles), threshold):
            yield (a.path(), b.path(), score)