--- a/mercurial/revlogutils/deltas.py Fri Dec 22 12:58:54 2023 +0100
+++ b/mercurial/revlogutils/deltas.py Mon Nov 20 04:44:40 2023 +0100
@@ -675,169 +675,199 @@
LIMIT_BASE2TEXT = 500
-def _candidategroups(
- revlog,
- textlen,
- p1,
- p2,
- cachedelta,
- excluded_bases=None,
- target_rev=None,
- snapshot_cache=None,
-):
- """Provides group of revision to be tested as delta base
-
- This top level function focus on emitting groups with unique and worthwhile
- content. See _raw_candidate_groups for details about the group order.
- """
- # should we try to build a delta?
- if not (len(revlog) and revlog._storedeltachains):
- yield None
- return
-
- if target_rev is None:
- target_rev = len(revlog)
+class _DeltaSearch:
+ """perform the search of a good delta for a single revlog revision
- if not revlog.delta_config.general_delta:
- # before general delta, there is only one possible delta base
- yield (target_rev - 1,)
- yield None
- return
+ note: some of the deltacomputer.finddeltainfo logic should probably move
+ here.
+ """
- # the DELTA_BASE_REUSE_FORCE case should have been taken care of sooner so
- # we should never end up asking such question. Adding the assert as a
- # safe-guard to detect anything that would be fishy in this regard.
- assert (
- cachedelta is None
- or cachedelta[2] != DELTA_BASE_REUSE_FORCE
- or not revlog.delta_config.general_delta
- )
-
- deltalength = revlog.length
- deltaparent = revlog.deltaparent
- sparse = revlog.delta_config.sparse_revlog
- good = None
-
- deltas_limit = textlen * LIMIT_DELTA2TEXT
- group_chunk_size = revlog.delta_config.candidate_group_chunk_size
-
- tested = {nullrev}
- candidates = _refinedgroups(
+ def __init__(
+ self,
revlog,
+ textlen,
p1,
p2,
cachedelta,
- snapshot_cache=snapshot_cache,
- )
- while True:
- temptative = candidates.send(good)
- if temptative is None:
- break
- group = []
- for rev in temptative:
- # skip over empty delta (no need to include them in a chain)
- while not (rev == nullrev or rev in tested or deltalength(rev)):
- tested.add(rev)
- rev = deltaparent(rev)
- # no need to try a delta against nullrev, this will be done as a
- # last resort.
- if rev == nullrev:
- continue
- # filter out revision we tested already
- if rev in tested:
- continue
+ excluded_bases=None,
+ target_rev=None,
+ snapshot_cache=None,
+ ):
+ self.revlog = revlog
+ self.textlen = textlen
+ self.p1 = p1
+ self.p2 = p2
+ self.cachedelta = cachedelta
+ self.excluded_bases = excluded_bases
+ self.target_rev = target_rev
+ self.snapshot_cache = snapshot_cache
+
+ def candidate_groups(self):
+ """Provides group of revision to be tested as delta base
+
+ This top level function focus on emitting groups with unique and
+ worthwhile content. See _raw_candidate_groups for details about the
+ group order.
+ """
+ # should we try to build a delta?
+ if not (len(self.revlog) and self.revlog._storedeltachains):
+ yield None
+ return
+
+ if self.target_rev is None:
+ self.target_rev = len(self.revlog)
+
+ if not self.revlog.delta_config.general_delta:
+ # before general delta, there is only one possible delta base
+ yield (self.target_rev - 1,)
+ yield None
+ return
- # an higher authority deamed the base unworthy (e.g. censored)
- if excluded_bases is not None and rev in excluded_bases:
- tested.add(rev)
- continue
- # We are in some recomputation cases and that rev is too high in
- # the revlog
- if target_rev is not None and rev >= target_rev:
- tested.add(rev)
- continue
- # filter out delta base that will never produce good delta
- if deltas_limit < revlog.length(rev):
- tested.add(rev)
- continue
- if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
- tested.add(rev)
- continue
- # no delta for rawtext-changing revs (see "candelta" for why)
- if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
- tested.add(rev)
- continue
+        # the DELTA_BASE_REUSE_FORCE case should have been taken care of
+        # sooner, so we should never end up asking such a question. The
+        # assert is a safeguard to detect anything that would be fishy in
+        # this regard.
+ assert (
+ self.cachedelta is None
+ or self.cachedelta[2] != DELTA_BASE_REUSE_FORCE
+ or not self.revlog.delta_config.general_delta
+ )
+
+ deltalength = self.revlog.length
+ deltaparent = self.revlog.deltaparent
+ sparse = self.revlog.delta_config.sparse_revlog
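+        # `good` forwards the caller's feedback, the revision retained as a
+        # good delta base (or None), into the candidate generator below.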
+ good = None
+
+ deltas_limit = self.textlen * LIMIT_DELTA2TEXT
+ group_chunk_size = self.revlog.delta_config.candidate_group_chunk_size
+
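+        # Track revisions already considered so no base is tested twice.
+        # nullrev is pre-seeded since the delta against nullrev (a full
+        # snapshot) is handled separately, as a last resort.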
+ tested = {nullrev}
+ candidates = _refinedgroups(
+ self.revlog,
+ self.p1,
+ self.p2,
+ self.cachedelta,
+ snapshot_cache=self.snapshot_cache,
+ )
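+        # drive the refining generator: each round forwards the best base
+        # found so far (via `send`) so the next group can be built
+        # accordingly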
+ while True:
+ temptative = candidates.send(good)
+ if temptative is None:
+ break
+ group = []
+ for rev in temptative:
+                # skip over empty deltas (no need to include them in a chain)
+ while not (rev == nullrev or rev in tested or deltalength(rev)):
+ tested.add(rev)
+ rev = deltaparent(rev)
+                # no need to try a delta against nullrev; this will be done
+                # as a last resort.
+ if rev == nullrev:
+ continue
+                # filter out revisions we already tested
+ if rev in tested:
+ continue
- # If we reach here, we are about to build and test a delta.
- # The delta building process will compute the chaininfo in all
- # case, since that computation is cached, it is fine to access it
- # here too.
- chainlen, chainsize = revlog._chaininfo(rev)
- # if chain will be too long, skip base
- if (
- revlog.delta_config.max_chain_len
- and chainlen >= revlog.delta_config.max_chain_len
- ):
- tested.add(rev)
- continue
- # if chain already have too much data, skip base
- if deltas_limit < chainsize:
- tested.add(rev)
- continue
- if sparse and revlog.delta_config.upper_bound_comp is not None:
- maxcomp = revlog.delta_config.upper_bound_comp
- basenotsnap = (p1, p2, nullrev)
- if rev not in basenotsnap and revlog.issnapshot(rev):
- snapshotdepth = revlog.snapshotdepth(rev)
- # If text is significantly larger than the base, we can
- # expect the resulting delta to be proportional to the size
- # difference
- revsize = revlog.rawsize(rev)
- rawsizedistance = max(textlen - revsize, 0)
- # use an estimate of the compression upper bound.
- lowestrealisticdeltalen = rawsizedistance // maxcomp
+                # a higher authority deemed the base unworthy (e.g. censored)
+ if (
+ self.excluded_bases is not None
+ and rev in self.excluded_bases
+ ):
+ tested.add(rev)
+ continue
+                # in some recomputation cases the rev is at or above the
+                # target revision, too high in the revlog to be usable
+ if self.target_rev is not None and rev >= self.target_rev:
+ tested.add(rev)
+ continue
+                # filter out delta bases that will never produce a good delta
+ if deltas_limit < self.revlog.length(rev):
+ tested.add(rev)
+ continue
+ if sparse and self.revlog.rawsize(rev) < (
+ self.textlen // LIMIT_BASE2TEXT
+ ):
+ tested.add(rev)
+ continue
+ # no delta for rawtext-changing revs (see "candelta" for why)
+ if self.revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
+ tested.add(rev)
+ continue
- # check the absolute constraint on the delta size
- snapshotlimit = textlen >> snapshotdepth
- if snapshotlimit < lowestrealisticdeltalen:
- # delta lower bound is larger than accepted upper bound
- tested.add(rev)
- continue
-
- # check the relative constraint on the delta size
- revlength = revlog.length(rev)
- if revlength < lowestrealisticdeltalen:
- # delta probable lower bound is larger than target base
- tested.add(rev)
- continue
+ # If we reach here, we are about to build and test a delta.
+                # The delta building process will compute the chaininfo in
+                # all cases; since that computation is cached, it is fine to
+                # access it here too.
+ chainlen, chainsize = self.revlog._chaininfo(rev)
+                # if the chain will be too long, skip this base
+ if (
+ self.revlog.delta_config.max_chain_len
+ and chainlen >= self.revlog.delta_config.max_chain_len
+ ):
+ tested.add(rev)
+ continue
+                # if the chain already has too much data, skip this base
+ if deltas_limit < chainsize:
+ tested.add(rev)
+ continue
+ if (
+ sparse
+ and self.revlog.delta_config.upper_bound_comp is not None
+ ):
+ maxcomp = self.revlog.delta_config.upper_bound_comp
+ basenotsnap = (self.p1, self.p2, nullrev)
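+                    # the size estimate below only applies to intermediate
+                    # snapshots; parents and nullrev are never filtered out
+                    # this way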
+ if rev not in basenotsnap and self.revlog.issnapshot(rev):
+ snapshotdepth = self.revlog.snapshotdepth(rev)
+                        # If the text is significantly larger than the base,
+                        # we can expect the resulting delta to be
+                        # proportional to the size difference
+ revsize = self.revlog.rawsize(rev)
+ rawsizedistance = max(self.textlen - revsize, 0)
+ # use an estimate of the compression upper bound.
+ lowestrealisticdeltalen = rawsizedistance // maxcomp
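+                        # e.g. with an upper bound of 10x compression, a
+                        # 1 MB size difference cannot realistically yield a
+                        # delta smaller than ~100 KB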
- group.append(rev)
- if group:
- # When the size of the candidate group is big, it can result in a
- # quite significant performance impact. To reduce this, we can send
- # them in smaller batches until the new batch does not provide any
- # improvements.
- #
- # This might reduce the overall efficiency of the compression in
- # some corner cases, but that should also prevent very pathological
- # cases from being an issue. (eg. 20 000 candidates).
- #
- # XXX note that the ordering of the group becomes important as it
- # now impacts the final result. The current order is unprocessed
- # and can be improved.
- if group_chunk_size == 0:
- tested.update(group)
- good = yield tuple(group)
- else:
- prev_good = good
- for start in range(0, len(group), group_chunk_size):
- sub_group = group[start : start + group_chunk_size]
- tested.update(sub_group)
- good = yield tuple(sub_group)
- if prev_good == good:
- break
+ # check the absolute constraint on the delta size
+ snapshotlimit = self.textlen >> snapshotdepth
+ if snapshotlimit < lowestrealisticdeltalen:
+ # delta lower bound is larger than accepted upper
+ # bound
+ tested.add(rev)
+ continue
+
+ # check the relative constraint on the delta size
+ revlength = self.revlog.length(rev)
+ if revlength < lowestrealisticdeltalen:
+                            # the delta's probable lower bound is larger
+                            # than the target base
+ tested.add(rev)
+ continue
- yield None
+ group.append(rev)
+ if group:
+            # When the candidate group is large, it can have a quite
+            # significant performance impact. To reduce this, we send
+            # candidates in smaller batches until a new batch no longer
+            # provides any improvement.
+            #
+            # This might reduce the overall efficiency of the compression in
+            # some corner cases, but it should also prevent very pathological
+            # cases (e.g. 20,000 candidates) from being an issue.
+            #
+            # XXX note that the ordering of the group becomes important as it
+            # now impacts the final result. The current order is unprocessed
+            # and could be improved.
+ if group_chunk_size == 0:
+ tested.update(group)
+ good = yield tuple(group)
+ else:
+ prev_good = good
+ for start in range(0, len(group), group_chunk_size):
+ sub_group = group[start : start + group_chunk_size]
+ tested.update(sub_group)
+ good = yield tuple(sub_group)
+ if prev_good == good:
+ break
+
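+        # a final None tells the caller that no candidate is left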
+ yield None
def _refinedgroups(revlog, p1, p2, cachedelta, snapshot_cache=None):
@@ -1410,7 +1440,7 @@
msg %= target_rev
self._write_debug(msg)
- groups = _candidategroups(
+ search = _DeltaSearch(
self.revlog,
revinfo.textlen,
p1r,
@@ -1420,6 +1450,8 @@
target_rev,
snapshot_cache=self._snapshot_cache,
)
+
+ groups = search.candidate_groups()
candidaterevs = next(groups)
while candidaterevs is not None:
dbg_try_rounds += 1