revlogdeltas: split candidate groups selection from the filtering logic
The group iteration has two main components:
* walking candidates, the logic that we are about to extend to build
intermediate snapshots,
* Making sure we test diffs against interesting bases. No duplicated tests,
skipping empty revisions, etc.
We split `_candidategroups` to separate the two components. This achieves two
goals:
* It will be simpler to update the walking logic for intermediate snapshots,
* We can gather the filtering logic from `finddeltainfo` into
`_candidategroups` to centralize it.
--- a/mercurial/revlogutils/deltas.py Sat Aug 18 07:32:05 2018 +0200
+++ b/mercurial/revlogutils/deltas.py Wed Aug 29 09:55:11 2018 -0700
@@ -569,9 +569,29 @@
return True
def _candidategroups(revlog, p1, p2, cachedelta):
+ """Provides group of revision to be tested as delta base
+
+ This top level function focus on emitting groups with unique and worthwhile
+ content. See _raw_candidate_groups for details about the group order.
"""
- Provides revisions that present an interest to be diffed against,
- grouped by level of easiness.
+ # should we try to build a delta?
+ if not (len(revlog) and revlog._storedeltachains):
+ return
+
+ tested = set([nullrev])
+ for group in _rawgroups(revlog, p1, p2, cachedelta):
+ group = tuple(r for r in group if r not in tested)
+ tested.update(group)
+ if group:
+ yield group
+
+def _rawgroups(revlog, p1, p2, cachedelta):
+ """Provides group of revision to be tested as delta base
+
+ This lower level function focus on emitting delta theorically interresting
+ without looking it any practical details.
+
+ The group order aims at providing fast or small candidates first.
"""
gdelta = revlog._generaldelta
curr = len(revlog)
@@ -590,29 +610,33 @@
yield (cachedelta[0],)
tested.add(cachedelta[0])
- if gdelta:
- # exclude already lazy tested base if any
- parents = [p for p in (p1, p2)
- if p != nullrev and p not in tested]
+ # This condition is true most of the time when processing
+ # changegroup data into a generaldelta repo. The only time it
+ # isn't true is if this is the first revision in a delta chain
+ # or if ``format.generaldelta=true`` disabled ``lazydeltabase``.
+ if cachedelta and gdelta and revlog._lazydeltabase:
+ # Assume what we received from the server is a good choice
+ # build delta will reuse the cache
+ yield (cachedelta[0],)
+
+ if gdelta:
+ # exclude already lazy tested base if any
+ parents = [p for p in (p1, p2) if p != nullrev]
- if not revlog._deltabothparents and len(parents) == 2:
- parents.sort()
- # To minimize the chance of having to build a fulltext,
- # pick first whichever parent is closest to us (max rev)
- yield (parents[1],)
- # then the other one (min rev) if the first did not fit
- yield (parents[0],)
- tested.update(parents)
- elif len(parents) > 0:
- # Test all parents (1 or 2), and keep the best candidate
- yield parents
- tested.update(parents)
+ if not revlog._deltabothparents and len(parents) == 2:
+ parents.sort()
+ # To minimize the chance of having to build a fulltext,
+ # pick first whichever parent is closest to us (max rev)
+ yield (parents[1],)
+ # then the other one (min rev) if the first did not fit
+ yield (parents[0],)
+ elif len(parents) > 0:
+ # Test all parents (1 or 2), and keep the best candidate
+ yield parents
- if prev not in tested:
- # other approach failed try against prev to hopefully save us a
- # fulltext.
- yield (prev,)
- tested.add(prev)
+ # other approach failed try against prev to hopefully save us a
+ # fulltext.
+ yield (prev,)
class deltacomputer(object):
def __init__(self, revlog):