convert: use a priority queue for sorting commits, to make sorting faster
To achieve this, we turn commit sorters into classes so they can
encapsulate state.
This reduces the sorting time from ~30s to ~10s on a 500k-commit
prefix of a repo I tried to convert. (and probably reduces the time
to sort the whole repo from many tens of minutes to minutes, but I
didn't try that again)
The date caching gets removed because priority queue already
caches the key.
--- a/hgext/convert/convcmd.py Tue Feb 21 13:26:07 2023 -0500
+++ b/hgext/convert/convcmd.py Thu Feb 23 23:25:28 2023 +0100
@@ -6,6 +6,7 @@
# GNU General Public License version 2 or any later version.
import collections
+import heapq
import os
import shutil
@@ -198,6 +199,59 @@
self.progress.complete()
+# Sorters are used by the `toposort` function to maintain a set of revisions
+# which can be converted immediately and pick one
+class branchsorter:
+ """If the previously converted revision has a child in the
+ eligible revisions list, pick it. Return the list head
+ otherwise. Branch sort attempts to minimize branch
+ switching, which is harmful for Mercurial backend
+ compression.
+ """
+
+ def __init__(self, parents):
+ self.nodes = []
+ self.parents = parents
+ self.prev = None
+
+ def picknext(self):
+ next = self.nodes[0]
+ for n in self.nodes:
+ if self.prev in self.parents[n]:
+ next = n
+ break
+ self.prev = next
+ self.nodes.remove(next)
+ return next
+
+ def insert(self, node):
+ self.nodes.insert(0, node)
+
+ def __len__(self):
+ return self.nodes.__len__()
+
+
+class keysorter:
+ """Key-based sort, ties broken by insertion order"""
+
+ def __init__(self, keyfn):
+ self.heap = []
+ self.keyfn = keyfn
+ self.counter = 0
+
+ def picknext(self):
+ return heapq.heappop(self.heap)[2]
+
+ def insert(self, node):
+ counter = self.counter
+ self.counter = counter + 1
+ key = self.keyfn(node)
+ heapq.heappush(self.heap, (key, counter, node))
+
+ def __len__(self):
+ return self.heap.__len__()
+
+
class converter:
def __init__(self, ui, source, dest, revmapfile, opts):
@@ -364,37 +418,10 @@
return children, roots
- # Sort functions are supposed to take a list of revisions which
- # can be converted immediately and pick one
-
- def makebranchsorter():
- """If the previously converted revision has a child in the
- eligible revisions list, pick it. Return the list head
- otherwise. Branch sort attempts to minimize branch
- switching, which is harmful for Mercurial backend
- compression.
- """
- prev = [None]
-
- def picknext(nodes):
- next = nodes[0]
- for n in nodes:
- if prev[0] in parents[n]:
- next = n
- break
- prev[0] = next
- return next
-
- return picknext
-
def makesourcesorter():
"""Source specific sort."""
keyfn = lambda n: self.commitcache[n].sortkey
-
- def picknext(nodes):
- return sorted(nodes, key=keyfn)[0]
-
- return picknext
+ return keysorter(keyfn)
def makeclosesorter():
"""Close order sort."""
@@ -402,44 +429,36 @@
b'close' not in self.commitcache[n].extra,
self.commitcache[n].sortkey,
)
-
- def picknext(nodes):
- return sorted(nodes, key=keyfn)[0]
-
- return picknext
+ return keysorter(keyfn)
def makedatesorter():
"""Sort revisions by date."""
- dates = {}
def getdate(n):
- if n not in dates:
- dates[n] = dateutil.parsedate(self.commitcache[n].date)
- return dates[n]
+ return dateutil.parsedate(self.commitcache[n].date)
- def picknext(nodes):
- return min([(getdate(n), n) for n in nodes])[1]
-
- return picknext
+ return keysorter(getdate)
if sortmode == b'branchsort':
- picknext = makebranchsorter()
+ sorter = branchsorter(parents)
elif sortmode == b'datesort':
- picknext = makedatesorter()
+ sorter = makedatesorter()
elif sortmode == b'sourcesort':
- picknext = makesourcesorter()
+ sorter = makesourcesorter()
elif sortmode == b'closesort':
- picknext = makeclosesorter()
+ sorter = makeclosesorter()
else:
raise error.Abort(_(b'unknown sort mode: %s') % sortmode)
- children, actives = mapchildren(parents)
+ children, roots = mapchildren(parents)
+
+ for node in roots:
+ sorter.insert(node)
s = []
pendings = {}
- while actives:
- n = picknext(actives)
- actives.remove(n)
+ while sorter:
+ n = sorter.picknext()
s.append(n)
# Update dependents list
@@ -455,7 +474,7 @@
)
if not pendings[c]:
# Parents are converted, node is eligible
- actives.insert(0, c)
+ sorter.insert(c)
pendings[c] = None
if len(s) != len(parents):