ancestor: faster algorithm for difference of ancestor sets
authorSiddharth Agarwal <sid0@fb.com>
Mon, 26 Nov 2012 11:46:51 -0800
changeset 17970 0b03454abae7
parent 17969 6c67deb3d373
child 17971 e1b9a78a7aed
ancestor: faster algorithm for difference of ancestor sets One of the major reasons rebase is slow in large repositories is the computation of the detach set: the set of ancestors of the changesets to rebase not in the destination parent. This is currently done via a revset that does two walks all the way to the root of the DAG. Instead of doing that, to find ancestors of a set <revs> not in another set <common> we walk up the tree in reverse revision number order, maintaining sets of nodes visited from <revs>, <common> or both. For the common case where the sets are close both topologically and in revision number (relative to repository size), this has been found to speed up rebase by around 15-20%. When the nodes are farther apart and the DAG is highly branching, it is harder to say which would win. Here's how long computing the detach set takes in a linear repository with over 400000 changesets, rebasing near tip: Rebasing across 4 changesets Revset method: 2.2s New algorithm: 0.00015s Rebasing across 250 changesets Revset method: 2.2s New algorithm: 0.00069s Rebasing across 10000 changesets Revset method: 2.4s New algorithm: 0.019s
mercurial/ancestor.py
--- a/mercurial/ancestor.py	Fri Nov 23 11:59:44 2012 -0500
+++ b/mercurial/ancestor.py	Mon Nov 26 11:46:51 2012 -0800
@@ -6,6 +6,7 @@
 # GNU General Public License version 2 or any later version.
 
 import heapq
+from node import nullrev
 
 def ancestor(a, b, pfunc):
     """
@@ -89,3 +90,166 @@
                 gx = x.next()
     except StopIteration:
         return None
+
+def missingancestors(revs, bases, pfunc):
+    """Return all the ancestors of revs that are not ancestors of bases.
+
+    This may include elements from revs.
+
+    Equivalent to the revset (::revs - ::bases). Revs are returned in
+    revision number order, which is a topological order.
+
+    revs and bases should both be iterables. pfunc must return a list of
+    parent revs for a given revs.
+
+    graph is a dict of child->parent adjacency lists for this graph:
+    o  13
+    |
+    | o  12
+    | |
+    | | o    11
+    | | |\
+    | | | | o  10
+    | | | | |
+    | o---+ |  9
+    | | | | |
+    o | | | |  8
+     / / / /
+    | | o |  7
+    | | | |
+    o---+ |  6
+     / / /
+    | | o  5
+    | |/
+    | o  4
+    | |
+    o |  3
+    | |
+    | o  2
+    |/
+    o  1
+    |
+    o  0
+    >>> graph = {0: [-1], 1: [0], 2: [1], 3: [1], 4: [2], 5: [4], 6: [4],
+    ...          7: [4], 8: [-1], 9: [6, 7], 10: [5], 11: [3, 7], 12: [9],
+    ...          13: [8]}
+    >>> pfunc = graph.get
+
+    Empty revs
+    >>> missingancestors([], [1], pfunc)
+    []
+    >>> missingancestors([], [], pfunc)
+    []
+
+    If bases is empty, it's the same as if it were [nullrev]
+    >>> missingancestors([12], [], pfunc)
+    [0, 1, 2, 4, 6, 7, 9, 12]
+
+    Trivial case: revs == bases
+    >>> missingancestors([0], [0], pfunc)
+    []
+    >>> missingancestors([4, 5, 6], [6, 5, 4], pfunc)
+    []
+
+    With nullrev
+    >>> missingancestors([-1], [12], pfunc)
+    []
+    >>> missingancestors([12], [-1], pfunc)
+    [0, 1, 2, 4, 6, 7, 9, 12]
+
+    9 is a parent of 12. 7 is a parent of 9, so an ancestor of 12. 6 is an
+    ancestor of 12 but not of 7.
+    >>> missingancestors([12], [9], pfunc)
+    [12]
+    >>> missingancestors([9], [12], pfunc)
+    []
+    >>> missingancestors([12, 9], [7], pfunc)
+    [6, 9, 12]
+    >>> missingancestors([7, 6], [12], pfunc)
+    []
+
+    More complex cases
+    >>> missingancestors([10], [11, 12], pfunc)
+    [5, 10]
+    >>> missingancestors([11], [10], pfunc)
+    [3, 7, 11]
+    >>> missingancestors([11], [10, 12], pfunc)
+    [3, 11]
+    >>> missingancestors([12], [10], pfunc)
+    [6, 7, 9, 12]
+    >>> missingancestors([12], [11], pfunc)
+    [6, 9, 12]
+    >>> missingancestors([10, 11, 12], [13], pfunc)
+    [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12]
+    >>> missingancestors([13], [10, 11, 12], pfunc)
+    [8, 13]
+    """
+
+    revsvisit = set(revs)
+    basesvisit = set(bases)
+    if not revsvisit:
+        return []
+    if not basesvisit:
+        basesvisit.add(nullrev)
+    start = max(max(revsvisit), max(basesvisit))
+    bothvisit = revsvisit.intersection(basesvisit)
+    revsvisit.difference_update(bothvisit)
+    basesvisit.difference_update(bothvisit)
+    # At this point, we hold the invariants that:
+    # - revsvisit is the set of nodes we know are an ancestor of at least one
+    #   of the nodes in revs
+    # - basesvisit is the same for bases
+    # - bothvisit is the set of nodes we know are ancestors of at least one of
+    #   the nodes in revs and one of the nodes in bases
+    # - a node may be in none or one, but not more, of revsvisit, basesvisit
+    #   and bothvisit at any given time
+    # Now we walk down in reverse topo order, adding parents of nodes already
+    # visited to the sets while maintaining the invariants. When a node is
+    # found in both revsvisit and basesvisit, it is removed from them and
+    # added to bothvisit instead. When revsvisit becomes empty, there are no
+    # more ancestors of revs that aren't also ancestors of bases, so exit.
+
+    missing = []
+    for curr in xrange(start, nullrev, -1):
+        if not revsvisit:
+            break
+
+        if curr in bothvisit:
+            bothvisit.remove(curr)
+            # curr's parents might have made it into revsvisit or basesvisit
+            # through another path
+            for p in pfunc(curr):
+                revsvisit.discard(p)
+                basesvisit.discard(p)
+                bothvisit.add(p)
+            continue
+
+        # curr will never be in both revsvisit and basesvisit, since if it
+        # were it'd have been pushed to bothvisit
+        if curr in revsvisit:
+            missing.append(curr)
+            thisvisit = revsvisit
+            othervisit = basesvisit
+        elif curr in basesvisit:
+            thisvisit = basesvisit
+            othervisit = revsvisit
+        else:
+            # not an ancestor of a or b: ignore
+            continue
+
+        thisvisit.remove(curr)
+        for p in pfunc(curr):
+            if p == nullrev:
+                pass
+            elif p in othervisit or p in bothvisit:
+                # p is implicitly in thisvisit. This means p is or should be
+                # in bothvisit
+                revsvisit.discard(p)
+                basesvisit.discard(p)
+                bothvisit.add(p)
+            else:
+                # visit later
+                thisvisit.add(p)
+
+    missing.reverse()
+    return missing