Mercurial > hg
changeset 32903:27932a76a88d
dagop: split module hosting DAG-related algorithms from revset
This module hosts the following functions. They are somewhat similar (e.g.
scanning revisions using heap queue or stack) and seem non-trivial in
algorithmic point of view.
- _revancestors()
- _revdescendants()
- reachableroots()
- _toposort()
I was thinking of adding revset._fileancestors() generator for better follow()
implementation, but it would be called from context.py as well. So I decided
to create new module.
Naming is hard. I couldn't come up with any better module name, so it's called
"dag operation" now. I rejected the following candidates:
- ancestor.py - existing, revlog-level DAG algorithm
- ancestorset.py - doesn't always return a set
- dagalgorithm.py - hard to type
- dagutil.py - existing
- revancestor.py - I want to add fileancestors()
% wc -l mercurial/dagop.py mercurial/revset.py
339 mercurial/dagop.py
2020 mercurial/revset.py
2359 total
author | Yuya Nishihara <yuya@tcha.org> |
---|---|
date | Sun, 16 Oct 2016 18:03:24 +0900 |
parents | 642feee29d70 |
children | 582080a4a812 |
files | mercurial/dagop.py mercurial/graphmod.py mercurial/revset.py |
diffstat | 3 files changed, 350 insertions(+), 330 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/dagop.py Sun Oct 16 18:03:24 2016 +0900 @@ -0,0 +1,339 @@ +# dagop.py - graph ancestry and topology algorithm for revset +# +# Copyright 2010 Matt Mackall <mpm@selenic.com> +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. + +from __future__ import absolute_import + +import heapq + +from . import ( + error, + node, + smartset, +) + +baseset = smartset.baseset +generatorset = smartset.generatorset + +def revancestors(repo, revs, followfirst): + """Like revlog.ancestors(), but supports followfirst.""" + if followfirst: + cut = 1 + else: + cut = None + cl = repo.changelog + + def iterate(): + revs.sort(reverse=True) + irevs = iter(revs) + h = [] + + inputrev = next(irevs, None) + if inputrev is not None: + heapq.heappush(h, -inputrev) + + seen = set() + while h: + current = -heapq.heappop(h) + if current == inputrev: + inputrev = next(irevs, None) + if inputrev is not None: + heapq.heappush(h, -inputrev) + if current not in seen: + seen.add(current) + yield current + try: + for parent in cl.parentrevs(current)[:cut]: + if parent != node.nullrev: + heapq.heappush(h, -parent) + except error.WdirUnsupported: + for parent in repo[current].parents()[:cut]: + if parent.rev() != node.nullrev: + heapq.heappush(h, -parent.rev()) + + return generatorset(iterate(), iterasc=False) + +def revdescendants(repo, revs, followfirst): + """Like revlog.descendants() but supports followfirst.""" + if followfirst: + cut = 1 + else: + cut = None + + def iterate(): + cl = repo.changelog + # XXX this should be 'parentset.min()' assuming 'parentset' is a + # smartset (and if it is not, it should.) + first = min(revs) + nullrev = node.nullrev + if first == nullrev: + # Are there nodes with a null first parent and a non-null + # second one? Maybe. Do we care? Probably not. + for i in cl: + yield i + else: + seen = set(revs) + for i in cl.revs(first + 1): + for x in cl.parentrevs(i)[:cut]: + if x != nullrev and x in seen: + seen.add(i) + yield i + break + + return generatorset(iterate(), iterasc=True) + +def _reachablerootspure(repo, minroot, roots, heads, includepath): + """return (heads(::<roots> and ::<heads>)) + + If includepath is True, return (<roots>::<heads>).""" + if not roots: + return [] + parentrevs = repo.changelog.parentrevs + roots = set(roots) + visit = list(heads) + reachable = set() + seen = {} + # prefetch all the things! (because python is slow) + reached = reachable.add + dovisit = visit.append + nextvisit = visit.pop + # open-code the post-order traversal due to the tiny size of + # sys.getrecursionlimit() + while visit: + rev = nextvisit() + if rev in roots: + reached(rev) + if not includepath: + continue + parents = parentrevs(rev) + seen[rev] = parents + for parent in parents: + if parent >= minroot and parent not in seen: + dovisit(parent) + if not reachable: + return baseset() + if not includepath: + return reachable + for rev in sorted(seen): + for parent in seen[rev]: + if parent in reachable: + reached(rev) + return reachable + +def reachableroots(repo, roots, heads, includepath=False): + """return (heads(::<roots> and ::<heads>)) + + If includepath is True, return (<roots>::<heads>).""" + if not roots: + return baseset() + minroot = roots.min() + roots = list(roots) + heads = list(heads) + try: + revs = repo.changelog.reachableroots(minroot, heads, roots, includepath) + except AttributeError: + revs = _reachablerootspure(repo, minroot, roots, heads, includepath) + revs = baseset(revs) + revs.sort() + return revs + +def toposort(revs, parentsfunc, firstbranch=()): + """Yield revisions from heads to roots one (topo) branch at a time. + + This function aims to be used by a graph generator that wishes to minimize + the number of parallel branches and their interleaving. + + Example iteration order (numbers show the "true" order in a changelog): + + o 4 + | + o 1 + | + | o 3 + | | + | o 2 + |/ + o 0 + + Note that the ancestors of merges are understood by the current + algorithm to be on the same branch. This means no reordering will + occur behind a merge. + """ + + ### Quick summary of the algorithm + # + # This function is based around a "retention" principle. We keep revisions + # in memory until we are ready to emit a whole branch that immediately + # "merges" into an existing one. This reduces the number of parallel + # branches with interleaved revisions. + # + # During iteration revs are split into two groups: + # A) revision already emitted + # B) revision in "retention". They are stored as different subgroups. + # + # for each REV, we do the following logic: + # + # 1) if REV is a parent of (A), we will emit it. If there is a + # retention group ((B) above) that is blocked on REV being + # available, we emit all the revisions out of that retention + # group first. + # + # 2) else, we'll search for a subgroup in (B) awaiting for REV to be + # available, if such subgroup exist, we add REV to it and the subgroup is + # now awaiting for REV.parents() to be available. + # + # 3) finally if no such group existed in (B), we create a new subgroup. + # + # + # To bootstrap the algorithm, we emit the tipmost revision (which + # puts it in group (A) from above). + + revs.sort(reverse=True) + + # Set of parents of revision that have been emitted. They can be considered + # unblocked as the graph generator is already aware of them so there is no + # need to delay the revisions that reference them. + # + # If someone wants to prioritize a branch over the others, pre-filling this + # set will force all other branches to wait until this branch is ready to be + # emitted. + unblocked = set(firstbranch) + + # list of groups waiting to be displayed, each group is defined by: + # + # (revs: lists of revs waiting to be displayed, + # blocked: set of that cannot be displayed before those in 'revs') + # + # The second value ('blocked') correspond to parents of any revision in the + # group ('revs') that is not itself contained in the group. The main idea + # of this algorithm is to delay as much as possible the emission of any + # revision. This means waiting for the moment we are about to display + # these parents to display the revs in a group. + # + # This first implementation is smart until it encounters a merge: it will + # emit revs as soon as any parent is about to be emitted and can grow an + # arbitrary number of revs in 'blocked'. In practice this mean we properly + # retains new branches but gives up on any special ordering for ancestors + # of merges. The implementation can be improved to handle this better. + # + # The first subgroup is special. It corresponds to all the revision that + # were already emitted. The 'revs' lists is expected to be empty and the + # 'blocked' set contains the parents revisions of already emitted revision. + # + # You could pre-seed the <parents> set of groups[0] to a specific + # changesets to select what the first emitted branch should be. + groups = [([], unblocked)] + pendingheap = [] + pendingset = set() + + heapq.heapify(pendingheap) + heappop = heapq.heappop + heappush = heapq.heappush + for currentrev in revs: + # Heap works with smallest element, we want highest so we invert + if currentrev not in pendingset: + heappush(pendingheap, -currentrev) + pendingset.add(currentrev) + # iterates on pending rev until after the current rev have been + # processed. + rev = None + while rev != currentrev: + rev = -heappop(pendingheap) + pendingset.remove(rev) + + # Seek for a subgroup blocked, waiting for the current revision. + matching = [i for i, g in enumerate(groups) if rev in g[1]] + + if matching: + # The main idea is to gather together all sets that are blocked + # on the same revision. + # + # Groups are merged when a common blocking ancestor is + # observed. For example, given two groups: + # + # revs [5, 4] waiting for 1 + # revs [3, 2] waiting for 1 + # + # These two groups will be merged when we process + # 1. In theory, we could have merged the groups when + # we added 2 to the group it is now in (we could have + # noticed the groups were both blocked on 1 then), but + # the way it works now makes the algorithm simpler. + # + # We also always keep the oldest subgroup first. We can + # probably improve the behavior by having the longest set + # first. That way, graph algorithms could minimise the length + # of parallel lines their drawing. This is currently not done. + targetidx = matching.pop(0) + trevs, tparents = groups[targetidx] + for i in matching: + gr = groups[i] + trevs.extend(gr[0]) + tparents |= gr[1] + # delete all merged subgroups (except the one we kept) + # (starting from the last subgroup for performance and + # sanity reasons) + for i in reversed(matching): + del groups[i] + else: + # This is a new head. We create a new subgroup for it. + targetidx = len(groups) + groups.append(([], {rev})) + + gr = groups[targetidx] + + # We now add the current nodes to this subgroups. This is done + # after the subgroup merging because all elements from a subgroup + # that relied on this rev must precede it. + # + # we also update the <parents> set to include the parents of the + # new nodes. + if rev == currentrev: # only display stuff in rev + gr[0].append(rev) + gr[1].remove(rev) + parents = [p for p in parentsfunc(rev) if p > node.nullrev] + gr[1].update(parents) + for p in parents: + if p not in pendingset: + pendingset.add(p) + heappush(pendingheap, -p) + + # Look for a subgroup to display + # + # When unblocked is empty (if clause), we were not waiting for any + # revisions during the first iteration (if no priority was given) or + # if we emitted a whole disconnected set of the graph (reached a + # root). In that case we arbitrarily take the oldest known + # subgroup. The heuristic could probably be better. + # + # Otherwise (elif clause) if the subgroup is blocked on + # a revision we just emitted, we can safely emit it as + # well. + if not unblocked: + if len(groups) > 1: # display other subset + targetidx = 1 + gr = groups[1] + elif not gr[1] & unblocked: + gr = None + + if gr is not None: + # update the set of awaited revisions with the one from the + # subgroup + unblocked |= gr[1] + # output all revisions in the subgroup + for r in gr[0]: + yield r + # delete the subgroup that you just output + # unless it is groups[0] in which case you just empty it. + if targetidx: + del groups[targetidx] + else: + gr[0][:] = [] + # Check if we have some subgroup waiting for revisions we are not going to + # iterate over + for g in groups: + for r in g[0]: + yield r
--- a/mercurial/graphmod.py Thu Jun 15 17:14:53 2017 -0700 +++ b/mercurial/graphmod.py Sun Oct 16 18:03:24 2016 +0900 @@ -21,7 +21,7 @@ from .node import nullrev from . import ( - revset, + dagop, smartset, util, ) @@ -70,7 +70,7 @@ # through all revs (issue4782) if not isinstance(revs, smartset.baseset): revs = smartset.baseset(revs) - gp = gpcache[mpar] = sorted(set(revset.reachableroots( + gp = gpcache[mpar] = sorted(set(dagop.reachableroots( repo, revs, [mpar]))) if not gp: parents.append((MISSINGPARENT, mpar))
--- a/mercurial/revset.py Thu Jun 15 17:14:53 2017 -0700 +++ b/mercurial/revset.py Sun Oct 16 18:03:24 2016 +0900 @@ -7,11 +7,11 @@ from __future__ import absolute_import -import heapq import re from .i18n import _ from . import ( + dagop, destutil, encoding, error, @@ -49,128 +49,6 @@ spanset = smartset.spanset fullreposet = smartset.fullreposet -def _revancestors(repo, revs, followfirst): - """Like revlog.ancestors(), but supports followfirst.""" - if followfirst: - cut = 1 - else: - cut = None - cl = repo.changelog - - def iterate(): - revs.sort(reverse=True) - irevs = iter(revs) - h = [] - - inputrev = next(irevs, None) - if inputrev is not None: - heapq.heappush(h, -inputrev) - - seen = set() - while h: - current = -heapq.heappop(h) - if current == inputrev: - inputrev = next(irevs, None) - if inputrev is not None: - heapq.heappush(h, -inputrev) - if current not in seen: - seen.add(current) - yield current - try: - for parent in cl.parentrevs(current)[:cut]: - if parent != node.nullrev: - heapq.heappush(h, -parent) - except error.WdirUnsupported: - for parent in repo[current].parents()[:cut]: - if parent.rev() != node.nullrev: - heapq.heappush(h, -parent.rev()) - - return generatorset(iterate(), iterasc=False) - -def _revdescendants(repo, revs, followfirst): - """Like revlog.descendants() but supports followfirst.""" - if followfirst: - cut = 1 - else: - cut = None - - def iterate(): - cl = repo.changelog - # XXX this should be 'parentset.min()' assuming 'parentset' is a - # smartset (and if it is not, it should.) - first = min(revs) - nullrev = node.nullrev - if first == nullrev: - # Are there nodes with a null first parent and a non-null - # second one? Maybe. Do we care? Probably not. - for i in cl: - yield i - else: - seen = set(revs) - for i in cl.revs(first + 1): - for x in cl.parentrevs(i)[:cut]: - if x != nullrev and x in seen: - seen.add(i) - yield i - break - - return generatorset(iterate(), iterasc=True) - -def _reachablerootspure(repo, minroot, roots, heads, includepath): - """return (heads(::<roots> and ::<heads>)) - - If includepath is True, return (<roots>::<heads>).""" - if not roots: - return [] - parentrevs = repo.changelog.parentrevs - roots = set(roots) - visit = list(heads) - reachable = set() - seen = {} - # prefetch all the things! (because python is slow) - reached = reachable.add - dovisit = visit.append - nextvisit = visit.pop - # open-code the post-order traversal due to the tiny size of - # sys.getrecursionlimit() - while visit: - rev = nextvisit() - if rev in roots: - reached(rev) - if not includepath: - continue - parents = parentrevs(rev) - seen[rev] = parents - for parent in parents: - if parent >= minroot and parent not in seen: - dovisit(parent) - if not reachable: - return baseset() - if not includepath: - return reachable - for rev in sorted(seen): - for parent in seen[rev]: - if parent in reachable: - reached(rev) - return reachable - -def reachableroots(repo, roots, heads, includepath=False): - """return (heads(::<roots> and ::<heads>)) - - If includepath is True, return (<roots>::<heads>).""" - if not roots: - return baseset() - minroot = roots.min() - roots = list(roots) - heads = list(heads) - try: - revs = repo.changelog.reachableroots(minroot, heads, roots, includepath) - except AttributeError: - revs = _reachablerootspure(repo, minroot, roots, heads, includepath) - revs = baseset(revs) - revs.sort() - return revs - # helpers def getset(repo, subset, x): @@ -242,8 +120,8 @@ def dagrange(repo, subset, x, y, order): r = fullreposet(repo) - xs = reachableroots(repo, getset(repo, r, x), getset(repo, r, y), - includepath=True) + xs = dagop.reachableroots(repo, getset(repo, r, x), getset(repo, r, y), + includepath=True) return subset & xs def andset(repo, subset, x, y, order): @@ -364,7 +242,7 @@ heads = getset(repo, fullreposet(repo), x) if not heads: return baseset() - s = _revancestors(repo, heads, followfirst) + s = dagop.revancestors(repo, heads, followfirst) return subset & s @predicate('ancestors(set)', safe=True) @@ -697,7 +575,7 @@ roots = getset(repo, fullreposet(repo), x) if not roots: return baseset() - s = _revdescendants(repo, roots, followfirst) + s = dagop.revdescendants(repo, roots, followfirst) # Both sets need to be ascending in order to lazily return the union # in the correct order. @@ -916,7 +794,7 @@ # include the revision responsible for the most recent version s.add(fctx.introrev()) else: - s = _revancestors(repo, baseset([c.rev()]), followfirst) + s = dagop.revancestors(repo, baseset([c.rev()]), followfirst) return subset & s @@ -1355,7 +1233,7 @@ if not include: return baseset() - descendants = set(_revdescendants(repo, include, False)) + descendants = set(dagop.revdescendants(repo, include, False)) exclude = [rev for rev in cl.headrevs() if not rev in descendants and not rev in include] else: @@ -1857,7 +1735,8 @@ firstbranch = () if 'topo.firstbranch' in opts: firstbranch = getset(repo, subset, opts['topo.firstbranch']) - revs = baseset(_toposort(revs, repo.changelog.parentrevs, firstbranch), + revs = baseset(dagop.toposort(revs, repo.changelog.parentrevs, + firstbranch), istopo=True) if keyflags[0][1]: revs.reverse() @@ -1869,204 +1748,6 @@ ctxs.sort(key=_sortkeyfuncs[k], reverse=reverse) return baseset([c.rev() for c in ctxs]) -def _toposort(revs, parentsfunc, firstbranch=()): - """Yield revisions from heads to roots one (topo) branch at a time. - - This function aims to be used by a graph generator that wishes to minimize - the number of parallel branches and their interleaving. - - Example iteration order (numbers show the "true" order in a changelog): - - o 4 - | - o 1 - | - | o 3 - | | - | o 2 - |/ - o 0 - - Note that the ancestors of merges are understood by the current - algorithm to be on the same branch. This means no reordering will - occur behind a merge. - """ - - ### Quick summary of the algorithm - # - # This function is based around a "retention" principle. We keep revisions - # in memory until we are ready to emit a whole branch that immediately - # "merges" into an existing one. This reduces the number of parallel - # branches with interleaved revisions. - # - # During iteration revs are split into two groups: - # A) revision already emitted - # B) revision in "retention". They are stored as different subgroups. - # - # for each REV, we do the following logic: - # - # 1) if REV is a parent of (A), we will emit it. If there is a - # retention group ((B) above) that is blocked on REV being - # available, we emit all the revisions out of that retention - # group first. - # - # 2) else, we'll search for a subgroup in (B) awaiting for REV to be - # available, if such subgroup exist, we add REV to it and the subgroup is - # now awaiting for REV.parents() to be available. - # - # 3) finally if no such group existed in (B), we create a new subgroup. - # - # - # To bootstrap the algorithm, we emit the tipmost revision (which - # puts it in group (A) from above). - - revs.sort(reverse=True) - - # Set of parents of revision that have been emitted. They can be considered - # unblocked as the graph generator is already aware of them so there is no - # need to delay the revisions that reference them. - # - # If someone wants to prioritize a branch over the others, pre-filling this - # set will force all other branches to wait until this branch is ready to be - # emitted. - unblocked = set(firstbranch) - - # list of groups waiting to be displayed, each group is defined by: - # - # (revs: lists of revs waiting to be displayed, - # blocked: set of that cannot be displayed before those in 'revs') - # - # The second value ('blocked') correspond to parents of any revision in the - # group ('revs') that is not itself contained in the group. The main idea - # of this algorithm is to delay as much as possible the emission of any - # revision. This means waiting for the moment we are about to display - # these parents to display the revs in a group. - # - # This first implementation is smart until it encounters a merge: it will - # emit revs as soon as any parent is about to be emitted and can grow an - # arbitrary number of revs in 'blocked'. In practice this mean we properly - # retains new branches but gives up on any special ordering for ancestors - # of merges. The implementation can be improved to handle this better. - # - # The first subgroup is special. It corresponds to all the revision that - # were already emitted. The 'revs' lists is expected to be empty and the - # 'blocked' set contains the parents revisions of already emitted revision. - # - # You could pre-seed the <parents> set of groups[0] to a specific - # changesets to select what the first emitted branch should be. - groups = [([], unblocked)] - pendingheap = [] - pendingset = set() - - heapq.heapify(pendingheap) - heappop = heapq.heappop - heappush = heapq.heappush - for currentrev in revs: - # Heap works with smallest element, we want highest so we invert - if currentrev not in pendingset: - heappush(pendingheap, -currentrev) - pendingset.add(currentrev) - # iterates on pending rev until after the current rev have been - # processed. - rev = None - while rev != currentrev: - rev = -heappop(pendingheap) - pendingset.remove(rev) - - # Seek for a subgroup blocked, waiting for the current revision. - matching = [i for i, g in enumerate(groups) if rev in g[1]] - - if matching: - # The main idea is to gather together all sets that are blocked - # on the same revision. - # - # Groups are merged when a common blocking ancestor is - # observed. For example, given two groups: - # - # revs [5, 4] waiting for 1 - # revs [3, 2] waiting for 1 - # - # These two groups will be merged when we process - # 1. In theory, we could have merged the groups when - # we added 2 to the group it is now in (we could have - # noticed the groups were both blocked on 1 then), but - # the way it works now makes the algorithm simpler. - # - # We also always keep the oldest subgroup first. We can - # probably improve the behavior by having the longest set - # first. That way, graph algorithms could minimise the length - # of parallel lines their drawing. This is currently not done. - targetidx = matching.pop(0) - trevs, tparents = groups[targetidx] - for i in matching: - gr = groups[i] - trevs.extend(gr[0]) - tparents |= gr[1] - # delete all merged subgroups (except the one we kept) - # (starting from the last subgroup for performance and - # sanity reasons) - for i in reversed(matching): - del groups[i] - else: - # This is a new head. We create a new subgroup for it. - targetidx = len(groups) - groups.append(([], {rev})) - - gr = groups[targetidx] - - # We now add the current nodes to this subgroups. This is done - # after the subgroup merging because all elements from a subgroup - # that relied on this rev must precede it. - # - # we also update the <parents> set to include the parents of the - # new nodes. - if rev == currentrev: # only display stuff in rev - gr[0].append(rev) - gr[1].remove(rev) - parents = [p for p in parentsfunc(rev) if p > node.nullrev] - gr[1].update(parents) - for p in parents: - if p not in pendingset: - pendingset.add(p) - heappush(pendingheap, -p) - - # Look for a subgroup to display - # - # When unblocked is empty (if clause), we were not waiting for any - # revisions during the first iteration (if no priority was given) or - # if we emitted a whole disconnected set of the graph (reached a - # root). In that case we arbitrarily take the oldest known - # subgroup. The heuristic could probably be better. - # - # Otherwise (elif clause) if the subgroup is blocked on - # a revision we just emitted, we can safely emit it as - # well. - if not unblocked: - if len(groups) > 1: # display other subset - targetidx = 1 - gr = groups[1] - elif not gr[1] & unblocked: - gr = None - - if gr is not None: - # update the set of awaited revisions with the one from the - # subgroup - unblocked |= gr[1] - # output all revisions in the subgroup - for r in gr[0]: - yield r - # delete the subgroup that you just output - # unless it is groups[0] in which case you just empty it. - if targetidx: - del groups[targetidx] - else: - gr[0][:] = [] - # Check if we have some subgroup waiting for revisions we are not going to - # iterate over - for g in groups: - for r in g[0]: - yield r - @predicate('subrepo([pattern])') def subrepo(repo, subset, x): """Changesets that add, modify or remove the given subrepo. If no subrepo