mercurial/similar.py
author Augie Fackler <augie@google.com>
Sat, 17 Feb 2018 22:38:53 -0500
changeset 36275 0cacd4df6eb0
parent 32202 ded48ad55146
child 38348 cd196be26cb7
permissions -rw-r--r--
webcommands: use pycompat.strkwargs as needed Differential Revision: https://phab.mercurial-scm.org/D2305
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     1
# similar.py - mechanisms for finding similar files
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     2
#
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     3
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     4
#
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     5
# This software may be used and distributed according to the terms of the
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     6
# GNU General Public License version 2 or any later version.
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
     7
27359
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
     8
from __future__ import absolute_import
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
     9
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    10
from .i18n import _
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    11
from . import (
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    12
    mdiff,
a56c47ed3885 similar: use absolute_import
Gregory Szorc <gregory.szorc@gmail.com>
parents: 16683
diff changeset
    13
)
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    14
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    15
def _findexactmatches(repo, added, removed):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    16
    '''find renamed files that have no changes
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    17
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    18
    Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    19
    (before, after) tuples of exact matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    20
    '''
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    21
    numfiles = len(added) + len(removed)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    22
31584
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    23
    # Build table of removed files: {hash(fctx.data()): [fctx, ...]}.
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    24
    # We use hash() to discard fctx.data() from memory.
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    25
    hashes = {}
31584
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    26
    for i, fctx in enumerate(removed):
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    27
        repo.ui.progress(_('searching for exact renames'), i, total=numfiles,
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    28
                         unit=_('files'))
31584
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    29
        h = hash(fctx.data())
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    30
        if h not in hashes:
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    31
            hashes[h] = [fctx]
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    32
        else:
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    33
            hashes[h].append(fctx)
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    34
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    35
    # For each added file, see if it corresponds to a removed file.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    36
    for i, fctx in enumerate(added):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    37
        repo.ui.progress(_('searching for exact renames'), i + len(removed),
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    38
                total=numfiles, unit=_('files'))
31210
e1d035905b2e similar: compare between actual file contents for exact identity
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 30809
diff changeset
    39
        adata = fctx.data()
31584
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    40
        h = hash(adata)
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    41
        for rfctx in hashes.get(h, []):
31210
e1d035905b2e similar: compare between actual file contents for exact identity
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 30809
diff changeset
    42
            # compare between actual file contents for exact identity
e1d035905b2e similar: compare between actual file contents for exact identity
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 30809
diff changeset
    43
            if adata == rfctx.data():
e1d035905b2e similar: compare between actual file contents for exact identity
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 30809
diff changeset
    44
                yield (rfctx, fctx)
31584
985a98c6bad0 similar: use cheaper hash() function to test exact matches
Yuya Nishihara <yuya@tcha.org>
parents: 31583
diff changeset
    45
                break
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    46
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    47
    # Done
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    48
    repo.ui.progress(_('searching for exact renames'), None)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    49
30805
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    50
def _ctxdata(fctx):
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    51
    # lazily load text
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    52
    orig = fctx.data()
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    53
    return orig, mdiff.splitnewlines(orig)
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    54
30809
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    55
def _score(fctx, otherdata):
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    56
    orig, lines = otherdata
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    57
    text = fctx.data()
32202
ded48ad55146 bdiff: proxy through mdiff module
Yuya Nishihara <yuya@tcha.org>
parents: 31584
diff changeset
    58
    # mdiff.blocks() returns blocks of matching lines
30805
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    59
    # count the number of bytes in each
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    60
    equal = 0
32202
ded48ad55146 bdiff: proxy through mdiff module
Yuya Nishihara <yuya@tcha.org>
parents: 31584
diff changeset
    61
    matches = mdiff.blocks(text, orig)
30805
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    62
    for x1, x2, y1, y2 in matches:
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    63
        for line in lines[y1:y2]:
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    64
            equal += len(line)
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    65
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    66
    lengths = len(text) + len(orig)
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    67
    return equal * 2.0 / lengths
0ae287eb6a4f similar: move score function to module level
Sean Farley <sean@farley.io>
parents: 30791
diff changeset
    68
30809
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    69
def score(fctx1, fctx2):
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    70
    return _score(fctx1, _ctxdata(fctx2))
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    71
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    72
def _findsimilarmatches(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    73
    '''find potentially renamed files based on similar file content
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    74
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    75
    Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    76
    (before, after, score) tuples of partial matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
    77
    '''
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    78
    copies = {}
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    79
    for i, r in enumerate(removed):
16683
525fdb738975 cleanup: eradicate long lines
Brodie Rao <brodie@sf.io>
parents: 11085
diff changeset
    80
        repo.ui.progress(_('searching for similar files'), i,
28468
0d6b3630b9a3 similar: specify unit for ui.progress when operating on files
Anton Shestakov <av6@dwimlabs.net>
parents: 27359
diff changeset
    81
                         total=len(removed), unit=_('files'))
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    82
30809
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    83
        data = None
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    84
        for a in added:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    85
            bestscore = copies.get(a, (None, threshold))[1]
30809
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    86
            if data is None:
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    87
                data = _ctxdata(r)
8614546154cb similar: remove caching from the module level
Pierre-Yves David <pierre-yves.david@ens-lyon.org>
parents: 30805
diff changeset
    88
            myscore = _score(a, data)
31583
2efd9771323e similar: take the first match instead of the last
Yuya Nishihara <yuya@tcha.org>
parents: 31582
diff changeset
    89
            if myscore > bestscore:
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    90
                copies[a] = (r, myscore)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    91
    repo.ui.progress(_('searching'), None)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    92
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    93
    for dest, v in copies.iteritems():
30791
ada160a8cfd8 similar: rename local variable to not collide with previous
Sean Farley <sean@farley.io>
parents: 29341
diff changeset
    94
        source, bscore = v
ada160a8cfd8 similar: rename local variable to not collide with previous
Sean Farley <sean@farley.io>
parents: 29341
diff changeset
    95
        yield source, dest, bscore
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
    96
31582
2e254165a37c similar: do not look up and create filectx more than once
Yuya Nishihara <yuya@tcha.org>
parents: 31581
diff changeset
    97
def _dropempty(fctxs):
2e254165a37c similar: do not look up and create filectx more than once
Yuya Nishihara <yuya@tcha.org>
parents: 31581
diff changeset
    98
    return [x for x in fctxs if x.size() > 0]
2e254165a37c similar: do not look up and create filectx more than once
Yuya Nishihara <yuya@tcha.org>
parents: 31581
diff changeset
    99
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   100
def findrenames(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   101
    '''find renamed files -- yields (before, after, score) tuples'''
31581
b1528d195a13 similar: use common names for changectx variables
Yuya Nishihara <yuya@tcha.org>
parents: 31580
diff changeset
   102
    wctx = repo[None]
b1528d195a13 similar: use common names for changectx variables
Yuya Nishihara <yuya@tcha.org>
parents: 31580
diff changeset
   103
    pctx = wctx.p1()
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
   104
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   105
    # Zero length files will be frequently unrelated to each other, and
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   106
    # tracking the deletion/addition of such a file will probably cause more
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   107
    # harm than good. We strip them out here to avoid matching them later on.
31582
2e254165a37c similar: do not look up and create filectx more than once
Yuya Nishihara <yuya@tcha.org>
parents: 31581
diff changeset
   108
    addedfiles = _dropempty(wctx[fp] for fp in sorted(added))
2e254165a37c similar: do not look up and create filectx more than once
Yuya Nishihara <yuya@tcha.org>
parents: 31581
diff changeset
   109
    removedfiles = _dropempty(pctx[fp] for fp in sorted(removed) if fp in pctx)
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   110
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   111
    # Find exact matches.
31580
d3e2af4e0128 similar: get rid of quadratic addedfiles.remove()
Yuya Nishihara <yuya@tcha.org>
parents: 31579
diff changeset
   112
    matchedfiles = set()
d3e2af4e0128 similar: get rid of quadratic addedfiles.remove()
Yuya Nishihara <yuya@tcha.org>
parents: 31579
diff changeset
   113
    for (a, b) in _findexactmatches(repo, addedfiles, removedfiles):
d3e2af4e0128 similar: get rid of quadratic addedfiles.remove()
Yuya Nishihara <yuya@tcha.org>
parents: 31579
diff changeset
   114
        matchedfiles.add(b)
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   115
        yield (a.path(), b.path(), 1.0)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   116
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   117
    # If the user requested similar files to be matched, search for them also.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   118
    if threshold < 1.0:
31580
d3e2af4e0128 similar: get rid of quadratic addedfiles.remove()
Yuya Nishihara <yuya@tcha.org>
parents: 31579
diff changeset
   119
        addedfiles = [x for x in addedfiles if x not in matchedfiles]
31579
3a383caa97f4 similar: sort files not by object id but by path for stable result
Yuya Nishihara <yuya@tcha.org>
parents: 31210
diff changeset
   120
        for (a, b, score) in _findsimilarmatches(repo, addedfiles,
3a383caa97f4 similar: sort files not by object id but by path for stable result
Yuya Nishihara <yuya@tcha.org>
parents: 31210
diff changeset
   121
                                                 removedfiles, threshold):
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
   122
            yield (a.path(), b.path(), score)