|
# similar.py - mechanisms for finding similar files
#
# Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.
|
7 |
|
8 from i18n import _ |
|
9 import util |
|
10 import mdiff |
|
11 import bdiff |
|
12 |
|
def findrenames(repo, added, removed, threshold):
    '''find renamed files -- yields (before, after, score) tuples

    Each name in removed that is present in repo['.'] is compared with
    the content repo.wread() returns for every name in added.  For each
    added file, the removed file with the highest score of at least
    threshold is kept; on a tie the removed file examined last wins.

    The score is 0.0 for empty added content, 1.0 when the removed
    file's cmp() returns a false value for that content, and otherwise
    twice the number of bytes in matching lines divided by the combined
    length of both texts.  With threshold == 1.0 anything that is not
    such an exact match scores 0.0 and no diff is computed.
    '''
    renames = {}
    parent = repo['.']
    for pos, oldname in enumerate(removed):
        repo.ui.progress(_('searching'), pos, total=len(removed))
        if oldname not in parent:
            continue
        oldfctx = parent.filectx(oldname)

        # lazily load the removed file's text: it is only requested once
        # a diff is actually needed (util.cachefunc presumably memoizes
        # the result across the added files -- confirm in util)
        @util.cachefunc
        def oldtext():
            raw = oldfctx.data()
            return raw, mdiff.splitnewlines(raw)

        def similarity(newtext):
            if len(newtext) == 0:
                return 0.0
            if not oldfctx.cmp(newtext):
                return 1.0
            if threshold == 1.0:
                return 0.0
            raw, rawlines = oldtext()
            # bdiff.blocks() returns blocks of matching lines;
            # add up the bytes of the matching lines on the old side
            blocks = bdiff.blocks(newtext, raw)
            samebytes = sum(len(line)
                            for n1, n2, o1, o2 in blocks
                            for line in rawlines[o1:o2])
            return samebytes * 2.0 / (len(newtext) + len(raw))

        for newname in added:
            # the score to beat is the best one recorded so far for this
            # added file, or the caller's threshold if there is none yet
            if newname in renames:
                floor = renames[newname][1]
            else:
                floor = threshold
            rating = similarity(repo.wread(newname))
            if rating >= floor:
                renames[newname] = (oldname, rating)
    repo.ui.progress(_('searching'), None)

    for newname, (oldname, rating) in renames.iteritems():
        yield oldname, newname, rating
|
58 |
|
59 |