mercurial/similar.py
author Pulkit Goyal <7895pulkit@gmail.com>
Fri, 13 May 2016 03:18:04 +0530
changeset 29158 7c0297bfe8bf
parent 28468 0d6b3630b9a3
child 29337 f72d0c2148da
permissions -rw-r--r--
py3: make raise statement python3 compatible In python3 raise error, message has been changed to raise error(message) In additional to that nodes.SkipNode is changed to nodes.SkipNode() so that it creates an instance directly.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     1
# similar.py - mechanisms for finding similar files
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     2
#
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     3
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     4
#
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     5
# This software may be used and distributed according to the terms of the
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     6
# GNU General Public License version 2 or any later version.
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     7
27359
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
     8
from __future__ import absolute_import
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
     9
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    10
from .i18n import _
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    11
from . import (
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    12
    bdiff,
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    13
    mdiff,
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    14
    util,
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    15
)
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    16
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    17
def _findexactmatches(repo, added, removed):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    18
    '''find renamed files that have no changes
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    19
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    20
    Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    21
    (before, after) tuples of exact matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    22
    '''
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    23
    numfiles = len(added) + len(removed)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    24
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    25
    # Get hashes of removed files.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    26
    hashes = {}
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    27
    for i, fctx in enumerate(removed):
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    28
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    29
                         unit=_('files'))
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    30
        h = util.sha1(fctx.data()).digest()
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    31
        hashes[h] = fctx
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    32
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    33
    # For each added file, see if it corresponds to a removed file.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    34
    for i, fctx in enumerate(added):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    35
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    36
                total=numfiles, unit=_('files'))
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    37
        h = util.sha1(fctx.data()).digest()
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    38
        if h in hashes:
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    39
            yield (hashes[h], fctx)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    40
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    41
    # Done
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    42
    repo.ui.progress(_('searching for exact renames'), None)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    43
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    44
def _findsimilarmatches(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    45
    '''find potentially renamed files based on similar file content
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    46
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    47
    Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    48
    (before, after, score) tuples of partial matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    49
    '''
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    50
    copies = {}
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    51
    for i, r in enumerate(removed):
16683
525fdb738975 cleanup: eradicate long lines
Brodie Rao <brodie@sf.io>
parents: 11085
diff changeset
    52
        repo.ui.progress(_('searching for similar files'), i,
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    53
                         total=len(removed), unit=_('files'))
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    54
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    55
        # lazily load text
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    56
        @util.cachefunc
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    57
        def data():
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    58
            orig = r.data()
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    59
            return orig, mdiff.splitnewlines(orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    60
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    61
        def score(text):
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    62
            orig, lines = data()
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    63
            # bdiff.blocks() returns blocks of matching lines
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    64
            # count the number of bytes in each
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    65
            equal = 0
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    66
            matches = bdiff.blocks(text, orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    67
            for x1, x2, y1, y2 in matches:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    68
                for line in lines[y1:y2]:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    69
                    equal += len(line)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    70
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    71
            lengths = len(text) + len(orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    72
            return equal * 2.0 / lengths
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    73
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    74
        for a in added:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    75
            bestscore = copies.get(a, (None, threshold))[1]
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    76
            myscore = score(a.data())
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    77
            if myscore >= bestscore:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    78
                copies[a] = (r, myscore)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    79
    repo.ui.progress(_('searching'), None)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    80
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    81
    for dest, v in copies.iteritems():
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    82
        source, score = v
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    83
        yield source, dest, score
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    84
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    85
def findrenames(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    86
    '''find renamed files -- yields (before, after, score) tuples'''
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    87
    parentctx = repo['.']
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    88
    workingctx = repo[None]
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    89
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    90
    # Zero length files will be frequently unrelated to each other, and
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    91
    # tracking the deletion/addition of such a file will probably cause more
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    92
    # harm than good. We strip them out here to avoid matching them later on.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    93
    addedfiles = set([workingctx[fp] for fp in added
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    94
            if workingctx[fp].size() > 0])
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    95
    removedfiles = set([parentctx[fp] for fp in removed
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    96
            if fp in parentctx and parentctx[fp].size() > 0])
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    97
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    98
    # Find exact matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    99
    for (a, b) in _findexactmatches(repo,
11085
0c8646292ca4 fix coding style
Benoit Boissinot <benoit.boissinot@ens-lyon.org>
parents: 11060
diff changeset
   100
            sorted(addedfiles), sorted(removedfiles)):
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   101
        addedfiles.remove(b)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   102
        yield (a.path(), b.path(), 1.0)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   103
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   104
    # If the user requested similar files to be matched, search for them also.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   105
    if threshold < 1.0:
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   106
        for (a, b, score) in _findsimilarmatches(repo,
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   107
                sorted(addedfiles), sorted(removedfiles), threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   108
            yield (a.path(), b.path(), score)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   109