mercurial/similar.py
changeset 11059 ef4aa90b1e58
child 11060 e6df01776e08
equal deleted inserted replaced
11058:f6dcbeb5babe 11059:ef4aa90b1e58
       
     1 # similar.py - mechanisms for finding similar files
       
     2 #
       
     3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
       
     4 #
       
     5 # This software may be used and distributed according to the terms of the
       
     6 # GNU General Public License version 2 or any later version.
       
     7 
       
     8 from i18n import _
       
     9 import util
       
    10 import mdiff
       
    11 import bdiff
       
    12 
       
    13 def findrenames(repo, added, removed, threshold):
       
    14     '''find renamed files -- yields (before, after, score) tuples'''
       
    15     copies = {}
       
    16     ctx = repo['.']
       
    17     for i, r in enumerate(removed):
       
    18         repo.ui.progress(_('searching'), i, total=len(removed))
       
    19         if r not in ctx:
       
    20             continue
       
    21         fctx = ctx.filectx(r)
       
    22 
       
    23         # lazily load text
       
    24         @util.cachefunc
       
    25         def data():
       
    26             orig = fctx.data()
       
    27             return orig, mdiff.splitnewlines(orig)
       
    28 
       
    29         def score(text):
       
    30             if not len(text):
       
    31                 return 0.0
       
    32             if not fctx.cmp(text):
       
    33                 return 1.0
       
    34             if threshold == 1.0:
       
    35                 return 0.0
       
    36             orig, lines = data()
       
    37             # bdiff.blocks() returns blocks of matching lines
       
    38             # count the number of bytes in each
       
    39             equal = 0
       
    40             matches = bdiff.blocks(text, orig)
       
    41             for x1, x2, y1, y2 in matches:
       
    42                 for line in lines[y1:y2]:
       
    43                     equal += len(line)
       
    44 
       
    45             lengths = len(text) + len(orig)
       
    46             return equal * 2.0 / lengths
       
    47 
       
    48         for a in added:
       
    49             bestscore = copies.get(a, (None, threshold))[1]
       
    50             myscore = score(repo.wread(a))
       
    51             if myscore >= bestscore:
       
    52                 copies[a] = (r, myscore)
       
    53     repo.ui.progress(_('searching'), None)
       
    54 
       
    55     for dest, v in copies.iteritems():
       
    56         source, score = v
       
    57         yield source, dest, score
       
    58 
       
    59