annotate mercurial/similar.py @ 16774:69af967b6d6f

httpclient: update to c5abd358e543 of httpplus
author Augie Fackler <raf@durin42.com>
date Fri, 18 May 2012 17:05:17 -0500
parents 525fdb738975
children a56c47ed3885
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
1 # similar.py - mechanisms for finding similar files
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
2 #
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
3 # Copyright 2005-2007 Matt Mackall <mpm@selenic.com>
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
4 #
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
5 # This software may be used and distributed according to the terms of the
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
6 # GNU General Public License version 2 or any later version.
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
7
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
8 from i18n import _
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
9 import util
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
10 import mdiff
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
11 import bdiff
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
12
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
13 def _findexactmatches(repo, added, removed):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
14 '''find renamed files that have no changes
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
15
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
16 Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
17 (before, after) tuples of exact matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
18 '''
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
19 numfiles = len(added) + len(removed)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
20
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
21 # Get hashes of removed files.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
22 hashes = {}
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
23 for i, fctx in enumerate(removed):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
24 repo.ui.progress(_('searching for exact renames'), i, total=numfiles)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
25 h = util.sha1(fctx.data()).digest()
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
26 hashes[h] = fctx
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
27
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
28 # For each added file, see if it corresponds to a removed file.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
29 for i, fctx in enumerate(added):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
30 repo.ui.progress(_('searching for exact renames'), i + len(removed),
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
31 total=numfiles)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
32 h = util.sha1(fctx.data()).digest()
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
33 if h in hashes:
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
34 yield (hashes[h], fctx)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
35
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
36 # Done
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
37 repo.ui.progress(_('searching for exact renames'), None)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
38
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
39 def _findsimilarmatches(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
40 '''find potentially renamed files based on similar file content
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
41
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
42 Takes a list of new filectxs and a list of removed filectxs, and yields
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
43 (before, after, score) tuples of partial matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
44 '''
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
45 copies = {}
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
46 for i, r in enumerate(removed):
16683
525fdb738975 cleanup: eradicate long lines
Brodie Rao <brodie@sf.io>
parents: 11085
diff changeset
47 repo.ui.progress(_('searching for similar files'), i,
525fdb738975 cleanup: eradicate long lines
Brodie Rao <brodie@sf.io>
parents: 11085
diff changeset
48 total=len(removed))
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
49
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
50 # lazily load text
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
51 @util.cachefunc
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
52 def data():
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
53 orig = r.data()
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
54 return orig, mdiff.splitnewlines(orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
55
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
56 def score(text):
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
57 orig, lines = data()
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
58 # bdiff.blocks() returns blocks of matching lines
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
59 # count the number of bytes in each
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
60 equal = 0
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
61 matches = bdiff.blocks(text, orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
62 for x1, x2, y1, y2 in matches:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
63 for line in lines[y1:y2]:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
64 equal += len(line)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
65
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
66 lengths = len(text) + len(orig)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
67 return equal * 2.0 / lengths
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
68
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
69 for a in added:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
70 bestscore = copies.get(a, (None, threshold))[1]
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
71 myscore = score(a.data())
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
72 if myscore >= bestscore:
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
73 copies[a] = (r, myscore)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
74 repo.ui.progress(_('searching'), None)
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
75
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
76 for dest, v in copies.iteritems():
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
77 source, score = v
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
78 yield source, dest, score
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
79
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
80 def findrenames(repo, added, removed, threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
81 '''find renamed files -- yields (before, after, score) tuples'''
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
82 parentctx = repo['.']
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
83 workingctx = repo[None]
11059
ef4aa90b1e58 Move 'findrenames' code into its own file.
David Greenaway <hg-dev@davidgreenaway.com>
parents:
diff changeset
84
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
85 # Zero length files will be frequently unrelated to each other, and
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
86 # tracking the deletion/addition of such a file will probably cause more
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
87 # harm than good. We strip them out here to avoid matching them later on.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
88 addedfiles = set([workingctx[fp] for fp in added
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
89 if workingctx[fp].size() > 0])
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
90 removedfiles = set([parentctx[fp] for fp in removed
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
91 if fp in parentctx and parentctx[fp].size() > 0])
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
92
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
93 # Find exact matches.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
94 for (a, b) in _findexactmatches(repo,
11085
0c8646292ca4 fix coding style
Benoit Boissinot <benoit.boissinot@ens-lyon.org>
parents: 11060
diff changeset
95 sorted(addedfiles), sorted(removedfiles)):
11060
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
96 addedfiles.remove(b)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
97 yield (a.path(), b.path(), 1.0)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
98
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
99 # If the user requested similar files to be matched, search for them also.
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
100 if threshold < 1.0:
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
101 for (a, b, score) in _findsimilarmatches(repo,
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
102 sorted(addedfiles), sorted(removedfiles), threshold):
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
103 yield (a.path(), b.path(), score)
e6df01776e08 findrenames: Optimise "addremove -s100" by matching files by their SHA1 hashes.
David Greenaway <hg-dev@davidgreenaway.com>
parents: 11059
diff changeset
104