convert: use a priority queue for sorting commits, to make sorting faster
authorArseniy Alekseyev <aalekseyev@janestreet.com>
Thu, 23 Feb 2023 23:25:28 +0100
changeset 50220 02fe65f74be5
parent 50219 829aa604d71a
child 50221 e9d92faa08f8
convert: use a priority queue for sorting commits, to make sorting faster To achieve this, we turn commit sorters into classes so they can encapsulate state. This reduces the sorting time from ~30s to ~10s on a 500k-commit prefix of a repo I tried to convert. (and probably reduces the time to sort the whole repo from many tens of minutes to minutes, but I didn't try that again) The date caching gets removed because priority queue already caches the key.
hgext/convert/convcmd.py
--- a/hgext/convert/convcmd.py	Tue Feb 21 13:26:07 2023 -0500
+++ b/hgext/convert/convcmd.py	Thu Feb 23 23:25:28 2023 +0100
@@ -6,6 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 import collections
+import heapq
 import os
 import shutil
 
@@ -198,6 +199,59 @@
         self.progress.complete()
 
 
+# Sorters are used by the `toposort` function to maintain a set of revisions
+# which can be converted immediately and pick one
+class branchsorter:
+    """If the previously converted revision has a child in the
+    eligible revisions list, pick it. Return the list head
+    otherwise. Branch sort attempts to minimize branch
+    switching, which is harmful for Mercurial backend
+    compression.
+    """
+
+    def __init__(self, parents):
+        self.nodes = []
+        self.parents = parents
+        self.prev = None
+
+    def picknext(self):
+        next = self.nodes[0]
+        for n in self.nodes:
+            if self.prev in self.parents[n]:
+                next = n
+                break
+        self.prev = next
+        self.nodes.remove(next)
+        return next
+
+    def insert(self, node):
+        self.nodes.insert(0, node)
+
+    def __len__(self):
+        return self.nodes.__len__()
+
+
+class keysorter:
+    """Key-based sort, ties broken by insertion order"""
+
+    def __init__(self, keyfn):
+        self.heap = []
+        self.keyfn = keyfn
+        self.counter = 0
+
+    def picknext(self):
+        return heapq.heappop(self.heap)[2]
+
+    def insert(self, node):
+        counter = self.counter
+        self.counter = counter + 1
+        key = self.keyfn(node)
+        heapq.heappush(self.heap, (key, counter, node))
+
+    def __len__(self):
+        return self.heap.__len__()
+
+
 class converter:
     def __init__(self, ui, source, dest, revmapfile, opts):
 
@@ -364,37 +418,10 @@
 
             return children, roots
 
-        # Sort functions are supposed to take a list of revisions which
-        # can be converted immediately and pick one
-
-        def makebranchsorter():
-            """If the previously converted revision has a child in the
-            eligible revisions list, pick it. Return the list head
-            otherwise. Branch sort attempts to minimize branch
-            switching, which is harmful for Mercurial backend
-            compression.
-            """
-            prev = [None]
-
-            def picknext(nodes):
-                next = nodes[0]
-                for n in nodes:
-                    if prev[0] in parents[n]:
-                        next = n
-                        break
-                prev[0] = next
-                return next
-
-            return picknext
-
         def makesourcesorter():
             """Source specific sort."""
             keyfn = lambda n: self.commitcache[n].sortkey
-
-            def picknext(nodes):
-                return sorted(nodes, key=keyfn)[0]
-
-            return picknext
+            return keysorter(keyfn)
 
         def makeclosesorter():
             """Close order sort."""
@@ -402,44 +429,36 @@
                 b'close' not in self.commitcache[n].extra,
                 self.commitcache[n].sortkey,
             )
-
-            def picknext(nodes):
-                return sorted(nodes, key=keyfn)[0]
-
-            return picknext
+            return keysorter(keyfn)
 
         def makedatesorter():
             """Sort revisions by date."""
-            dates = {}
 
             def getdate(n):
-                if n not in dates:
-                    dates[n] = dateutil.parsedate(self.commitcache[n].date)
-                return dates[n]
+                return dateutil.parsedate(self.commitcache[n].date)
 
-            def picknext(nodes):
-                return min([(getdate(n), n) for n in nodes])[1]
-
-            return picknext
+            return keysorter(getdate)
 
         if sortmode == b'branchsort':
-            picknext = makebranchsorter()
+            sorter = branchsorter(parents)
         elif sortmode == b'datesort':
-            picknext = makedatesorter()
+            sorter = makedatesorter()
         elif sortmode == b'sourcesort':
-            picknext = makesourcesorter()
+            sorter = makesourcesorter()
         elif sortmode == b'closesort':
-            picknext = makeclosesorter()
+            sorter = makeclosesorter()
         else:
             raise error.Abort(_(b'unknown sort mode: %s') % sortmode)
 
-        children, actives = mapchildren(parents)
+        children, roots = mapchildren(parents)
+
+        for node in roots:
+            sorter.insert(node)
 
         s = []
         pendings = {}
-        while actives:
-            n = picknext(actives)
-            actives.remove(n)
+        while sorter:
+            n = sorter.picknext()
             s.append(n)
 
             # Update dependents list
@@ -455,7 +474,7 @@
                     )
                 if not pendings[c]:
                     # Parents are converted, node is eligible
-                    actives.insert(0, c)
+                    sorter.insert(c)
                     pendings[c] = None
 
         if len(s) != len(parents):