view mercurial/similar.py @ 23971:6becb9dbca25 stable

merge: mark .hgsubstate as possibly dirty before submerge for consistency Before this patch, a failure while updating subrepos may leave ".hgsubstate" in an inconsistent state. For example: 1. the dirstate entry for ".hgsubstate" of the parent repo is filled with a valid size/date (via "hg status" or so) 2. "hg update" is invoked at the parent repo 3. ".hgsubstate" of the parent repo is updated on the filesystem as a part of the "g"(et) action in "merge.applyupdates" 4. it is assumed that the size/date of ".hgsubstate" on the filesystem are unchanged from the ones at (1); this condition is not hard to satisfy, because just changing hash ids (all ids have the same length) in ".hgsubstate" doesn't change its file size 5. "subrepo.submerge()" is invoked to update subrepos 6. a failure while updating one of the subrepos raises an exception (e.g. "untracked file differs") 7. "hg update" is aborted without updating the dirstate of the parent repo; the dirstate entry for ".hgsubstate" still holds the size/date from (1) Then, ".hgsubstate" of the parent repo is unexpectedly treated as "CLEAN", because updating ".hgsubstate" at (3) doesn't change its size/date on the filesystem: see the assumption at (4). This inconsistent ".hgsubstate" status causes unexpected behavior, for example: - "hg revert" forgets to revert ".hgsubstate" - "hg update" mistakenly concludes that (not yet updated) subrepos diverge (and then shows a prompt asking the user to confirm a decision) To avoid the inconsistent ".hgsubstate" status above, this patch marks ".hgsubstate" as possibly dirty before the "submerge" invocation. A "normallookup"-ed (= dirty) dirstate should be written out, even if processing is aborted by a failure. For safety, this patch also marks ".hgsubstate" as possibly dirty before "submerge" when it is removed or merged while merging. This should prevent Mercurial from mistaking an inconsistent ".hgsubstate" for a clean one.
To satisfy conditions at (1) and (4) above, this patch uses "hg status --config debug.dirstate.delaywrite=2" (to fill valid size/date into dirstate) and "touch" (to fix date of the file).
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Fri, 30 Jan 2015 04:59:05 +0900
parents 525fdb738975
children a56c47ed3885
line wrap: on
line source

# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from i18n import _
import util
import mdiff
import bdiff

def _findexactmatches(repo, added, removed):
    '''find renamed files that have no changes

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after) tuples of exact matches.

    Two files match when the SHA-1 digests of their data() are equal. When
    several removed files share the same content, the one appearing last in
    removed is the one reported. A removed file may be reported as the
    source of more than one added file.
    '''
    topic = _('searching for exact renames')
    nremoved = len(removed)
    total = len(added) + nremoved

    # Index the removed files by the digest of their content.
    bydigest = {}
    for pos, oldfctx in enumerate(removed):
        repo.ui.progress(topic, pos, total=total)
        bydigest[util.sha1(oldfctx.data()).digest()] = oldfctx

    # Look every added file up in that index; progress continues counting
    # from where the removed files left off.
    for pos, newfctx in enumerate(added):
        repo.ui.progress(topic, nremoved + pos, total=total)
        oldfctx = bydigest.get(util.sha1(newfctx.data()).digest())
        if oldfctx is not None:
            yield (oldfctx, newfctx)

    # Done
    repo.ui.progress(topic, None)

def _findsimilarmatches(repo, added, removed, threshold):
    '''find potentially renamed files based on similar file content

    Takes a list of new filectxs and a list of removed filectxs, and yields
    (before, after, score) tuples of partial matches.

    Every removed file is scored against every added file. The score is
    twice the number of bytes in the matching lines divided by the combined
    length of both texts. For each added file only the best-scoring removed
    file is kept, and only if its score is at least threshold; on a tie the
    removed file seen last wins.
    '''
    # Use a single topic for every progress call so that the final call
    # (position None) refers to the same topic the loop has been updating.
    topic = _('searching for similar files')
    total = len(removed)

    copies = {}
    for i, r in enumerate(removed):
        repo.ui.progress(topic, i, total=total)

        # lazily load text
        # NOTE(review): relies on util.cachefunc memoizing the result so the
        # removed file is read and split once per r -- confirm in util.
        @util.cachefunc
        def data():
            orig = r.data()
            return orig, mdiff.splitnewlines(orig)

        def score(text):
            orig, lines = data()
            # bdiff.blocks() returns blocks of matching lines
            # count the number of bytes in each
            equal = 0
            matches = bdiff.blocks(text, orig)
            for x1, x2, y1, y2 in matches:
                # y1:y2 indexes the lines of orig (the second argument)
                for line in lines[y1:y2]:
                    equal += len(line)

            lengths = len(text) + len(orig)
            return equal * 2.0 / lengths

        for a in added:
            # An added file without a candidate yet must reach threshold.
            bestscore = copies.get(a, (None, threshold))[1]
            myscore = score(a.data())
            if myscore >= bestscore:
                copies[a] = (r, myscore)
    repo.ui.progress(topic, None)

    # Named bestscore (not score) to avoid rebinding the helper above.
    for dest, (source, bestscore) in copies.iteritems():
        yield source, dest, bestscore

def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    added and removed are iterables of file paths; added paths are looked up
    in repo[None] and removed paths in repo['.']. Removed paths that are not
    present in repo['.'] are ignored.

    before and after are the paths of the removed and the added file.
    Exact content matches are yielded first with a score of 1.0. When
    threshold is below 1.0, the added files that had no exact match are then
    compared by content similarity against all removed files.
    '''
    parentctx = repo['.']
    workingctx = repo[None]

    # Zero length files will be frequently unrelated to each other, and
    # tracking the deletion/addition of such a file will probably cause more
    # harm than good. We strip them out here to avoid matching them later on.
    # Each file context is built once and reused for both the size check
    # and the resulting set.
    addedfiles = set()
    for fp in added:
        fctx = workingctx[fp]
        if fctx.size() > 0:
            addedfiles.add(fctx)

    removedfiles = set()
    for fp in removed:
        if fp not in parentctx:
            continue
        fctx = parentctx[fp]
        if fctx.size() > 0:
            removedfiles.add(fctx)

    # Find exact matches.
    for (a, b) in _findexactmatches(repo,
            sorted(addedfiles), sorted(removedfiles)):
        # An exactly matched added file is not considered again below.
        addedfiles.remove(b)
        yield (a.path(), b.path(), 1.0)

    # If the user requested similar files to be matched, search for them also.
    if threshold < 1.0:
        for (a, b, score) in _findsimilarmatches(repo,
                sorted(addedfiles), sorted(removedfiles), threshold):
            yield (a.path(), b.path(), score)