view mercurial/similar.py @ 23845:0a7fd54d4e60

revset: introduce "_parsealiasdecl" to parse alias declarations strictly This patch introduces "_parsealiasdecl" to parse alias declarations strictly. For example, "_parsealiasdecl" can detect problems below, which current implementation can't. - un-closed parenthesis causes being treated as "alias symbol" because all of declarations not in "func(....)" style are recognized as "alias symbol". for example, "foo($1, $2" is treated as the alias symbol. - alias symbol/function names aren't examined whether they are valid as symbol or not for example, "foo bar" can be treated as the alias symbol, but of course such invalid symbol can't be referred in revset. - just splitting argument list by "," causes overlooking syntax problems in the declaration for example, all of invalid declarations below are overlooked: - foo("bar") => taking one argument named as '"bar"' - foo("unclosed) => taking one argument named as '"unclosed' - foo(bar::baz) => taking one argument named as 'bar::baz' - foo(bar($1)) => taking one argument named as 'bar($1)' To decrease complication of patch, current implementation for alias declarations is replaced by "_parsealiasdecl" in the subsequent patch. This patch just introduces it. This patch defines "_parsealiasdecl" not as a method of "revsetalias" class but as a one of "revset" module, because of ease of testing by doctest. This patch factors some helper functions for "tree" out, because: - direct accessing like "if tree[0] == 'func' and len(tree) > 1" decreases readability - subsequent patch (and also existing code paths, in the future) can use them for readability This patch also factors "_tokenizealias" out, because it can be used also for parsing alias definitions strictly.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sat, 10 Jan 2015 23:18:11 +0900
parents 525fdb738975
children a56c47ed3885
line wrap: on
line source

# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from i18n import _
import util
import mdiff
import bdiff

def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.
    '''
    numfiles = len(added) + len(removed)

    # Get hashes of removed files.
    hashes = {}
    for i, fctx in enumerate(removed):
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
        h = util.sha1(fctx.data()).digest()
        hashes[h] = fctx

    # For each added file, see if it corresponds to a removed file.
    for i, fctx in enumerate(added):
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
                total=numfiles)
        h = util.sha1(fctx.data()).digest()
        if h in hashes:
            yield (hashes[h], fctx)

    # Done
    repo.ui.progress(_('searching for exact renames'), None)

def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.
    '''
    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(_('searching for similar files'), i,
                         total=len(removed))

        # lazily load text
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def score(text):
            orig, lines = data()
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            matches = bdiff.blocks(text, orig)
            for x1, x2, y1, y2 in matches:
                for line in lines[y1:y2]:
                    equal += len(line)

            lengths = len(text) + len(orig)
            return equal * 2.0 / lengths

        for a in added:
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = score(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(_('searching'), None)

    for dest, v in copies.iteritems():
        source, score = v
        yield source, dest, score

def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples'''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    addedfiles = set([workingctx[fp] for fp in added
            if workingctx[fp].size() > 0])
    removedfiles = set([parentctx[fp] for fp in removed
            if fp in parentctx and parentctx[fp].size() > 0])

    # Find exact matches.
    for (a, b) in _findexactmatches(repo,
            sorted(addedfiles), sorted(removedfiles)):
        addedfiles.remove(b)
        yield (a.path(), b.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        for (a, b, score) in _findsimilarmatches(repo,
                sorted(addedfiles), sorted(removedfiles), threshold):
            yield (a.path(), b.path(), score)