diff mercurial/similar.py @ 31585:3a383caa97f4

similar: sort files not by object id but by path for stable result Perhaps the original implementation would want to sort added/removed files alphabetically, but actually it did sort fctx objects by memory location. This patch removes the use of set()s in order to preserve the order of added/removed files. addedfiles.remove() becomes quadratic, but its cost appears not dominant. Anyway, the quadratic behavior will be eliminated by the next patch. Benchmark with 50k added/removed files, on tmpfs: $ mkdir src $ for n in `seq 0 49`; do > mkdir `printf src/%02d $n` > done $ for n in `seq 0 49999`; do > f=`printf src/%02d/%05d $(($n/1000)) $n` > dd if=/dev/urandom of=$f bs=8k count=1 status=none > done $ hg ci -qAm 'add 50k files of random content' $ mv src dest $ hg addremove --dry-run --time -q original: real 16.550 secs (user 15.000+0.000 sys 1.540+0.000) this patch: real 16.730 secs (user 15.280+0.000 sys 1.440+0.000)
author Yuya Nishihara <yuya@tcha.org>
date Sun, 15 Mar 2015 18:58:56 +0900
parents e1d035905b2e
children d3e2af4e0128
line wrap: on
line diff
--- a/mercurial/similar.py	Sun Mar 12 01:34:17 2017 -0800
+++ b/mercurial/similar.py	Sun Mar 15 18:58:56 2015 +0900
@@ -101,19 +101,18 @@
     # Zero length files will be frequently unrelated to each other, and
     # tracking the deletion/addition of such a file will probably cause more
     # harm than good. We strip them out here to avoid matching them later on.
-    addedfiles = set([workingctx[fp] for fp in added
-            if workingctx[fp].size() > 0])
-    removedfiles = set([parentctx[fp] for fp in removed
-            if fp in parentctx and parentctx[fp].size() > 0])
+    addedfiles = [workingctx[fp] for fp in sorted(added)
+                  if workingctx[fp].size() > 0]
+    removedfiles = [parentctx[fp] for fp in sorted(removed)
+                    if fp in parentctx and parentctx[fp].size() > 0]
 
     # Find exact matches.
-    for (a, b) in _findexactmatches(repo,
-            sorted(addedfiles), sorted(removedfiles)):
+    for (a, b) in _findexactmatches(repo, addedfiles[:], removedfiles):
         addedfiles.remove(b)
         yield (a.path(), b.path(), 1.0)
 
     # If the user requested similar files to be matched, search for them also.
     if threshold < 1.0:
-        for (a, b, score) in _findsimilarmatches(repo,
-                sorted(addedfiles), sorted(removedfiles), threshold):
+        for (a, b, score) in _findsimilarmatches(repo, addedfiles,
+                                                 removedfiles, threshold):
             yield (a.path(), b.path(), score)