mercurial/similar.py
changeset 30805 0ae287eb6a4f
parent 30791 ada160a8cfd8
child 30809 8614546154cb
comparison 30804:4227f80f72b2 30805:0ae287eb6a4f
@@ -41,43 +41,45 @@
             yield (hashes[h], fctx)
 
     # Done
     repo.ui.progress(_('searching for exact renames'), None)
 
+@util.cachefunc
+def _ctxdata(fctx):
+    # lazily load text
+    orig = fctx.data()
+    return orig, mdiff.splitnewlines(orig)
+
+@util.cachefunc
+def score(fctx1, fctx2):
+    text = fctx1.data()
+    orig, lines = _ctxdata(fctx2)
+    # bdiff.blocks() returns blocks of matching lines
+    # count the number of bytes in each
+    equal = 0
+    matches = bdiff.blocks(text, orig)
+    for x1, x2, y1, y2 in matches:
+        for line in lines[y1:y2]:
+            equal += len(line)
+
+    lengths = len(text) + len(orig)
+    return equal * 2.0 / lengths
+
 def _findsimilarmatches(repo, added, removed, threshold):
     '''find potentially renamed files based on similar file content
 
     Takes a list of new filectxs and a list of removed filectxs, and yields
     (before, after, score) tuples of partial matches.
     '''
     copies = {}
     for i, r in enumerate(removed):
         repo.ui.progress(_('searching for similar files'), i,
                          total=len(removed), unit=_('files'))
 
-        # lazily load text
-        @util.cachefunc
-        def data():
-            orig = r.data()
-            return orig, mdiff.splitnewlines(orig)
-
-        def score(text):
-            orig, lines = data()
-            # bdiff.blocks() returns blocks of matching lines
-            # count the number of bytes in each
-            equal = 0
-            matches = bdiff.blocks(text, orig)
-            for x1, x2, y1, y2 in matches:
-                for line in lines[y1:y2]:
-                    equal += len(line)
-
-            lengths = len(text) + len(orig)
-            return equal * 2.0 / lengths
-
         for a in added:
             bestscore = copies.get(a, (None, threshold))[1]
-            myscore = score(a.data())
+            myscore = score(a, r)
             if myscore >= bestscore:
                 copies[a] = (r, myscore)
     repo.ui.progress(_('searching'), None)
 
     for dest, v in copies.iteritems():
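The refactored score(fctx1, fctx2) boils down to the classic similarity ratio 2.0 * matching_bytes / (len(text) + len(orig)), with bdiff.blocks() supplying the runs of identical lines and mdiff.splitnewlines() providing the per-line byte counts. The following standalone sketch illustrates the same metric outside Mercurial; difflib.SequenceMatcher stands in for the C-level bdiff.blocks(), and the names similarity/_splitlines plus the sample data are purely illustrative, not part of Mercurial's API.

import difflib

def _splitlines(data):
    # rough stand-in for mdiff.splitnewlines(): split, keeping line endings
    return data.splitlines(True)

def similarity(text, orig):
    # Count the bytes covered by runs of identical lines on the "orig" side,
    # mirroring the loop in score(), then normalise by the combined size.
    textlines = _splitlines(text)
    origlines = _splitlines(orig)
    equal = 0
    matcher = difflib.SequenceMatcher(None, textlines, origlines)
    for block in matcher.get_matching_blocks():
        for line in origlines[block.b:block.b + block.size]:
            equal += len(line)
    return equal * 2.0 / (len(text) + len(orig))

# Two of three lines match: 2 * 8 matching bytes over 27 total bytes.
print(similarity(b'one\ntwo\nthree\n', b'one\ntwo\nfour\n'))  # ~0.593

A result of 1.0 means identical content and 0.0 means nothing in common; _findsimilarmatches() keeps a candidate pair only when the score reaches the threshold, which corresponds to the --similarity percentage of hg addremove scaled down to the 0..1 range.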