changeset 51320:c82e03b102a6

delta-find: introduce a _DeltaSearch object

That object represents the search for a good delta for one revision. It will replace the interleaved generator currently in use. It will make the logic more explicit and easier to split into different subclasses for the algorithm variants. We will move content gradually before doing deeper rework. For now, we only move the `_candidategroups` function here. More will follow in the same series.
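
For illustration, the caller side ends up shaped roughly like the sketch below, driving the generator returned by `candidate_groups()` (`evaluate_candidates` is a hypothetical stand-in for the delta-building logic that remains in `finddeltainfo`):

    search = _DeltaSearch(
        revlog,
        revinfo.textlen,
        p1r,
        p2r,
        cachedelta,
        excluded_bases,
        target_rev,
        snapshot_cache=snapshot_cache,
    )
    groups = search.candidate_groups()
    candidaterevs = next(groups)
    while candidaterevs is not None:
        # build and test a delta against each candidate base, keep the best
        good = evaluate_candidates(candidaterevs)  # stand-in, not a real helper
        # sending the best base back lets the generator refine its next group
        candidaterevs = groups.send(good)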
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Mon, 20 Nov 2023 04:44:40 +0100
parents da7ecb4deaec
children c83074405276
files mercurial/revlogutils/deltas.py
diffstat 1 files changed, 185 insertions(+), 153 deletions(-)
--- a/mercurial/revlogutils/deltas.py	Fri Dec 22 12:58:54 2023 +0100
+++ b/mercurial/revlogutils/deltas.py	Mon Nov 20 04:44:40 2023 +0100
@@ -675,169 +675,199 @@
 LIMIT_BASE2TEXT = 500
 
 
-def _candidategroups(
-    revlog,
-    textlen,
-    p1,
-    p2,
-    cachedelta,
-    excluded_bases=None,
-    target_rev=None,
-    snapshot_cache=None,
-):
-    """Provides group of revision to be tested as delta base
-
-    This top level function focus on emitting groups with unique and worthwhile
-    content. See _raw_candidate_groups for details about the group order.
-    """
-    # should we try to build a delta?
-    if not (len(revlog) and revlog._storedeltachains):
-        yield None
-        return
-
-    if target_rev is None:
-        target_rev = len(revlog)
+class _DeltaSearch:
+    """perform the search of a good delta for a single revlog revision
 
-    if not revlog.delta_config.general_delta:
-        # before general delta, there is only one possible delta base
-        yield (target_rev - 1,)
-        yield None
-        return
+    note: some of the deltacomputer.finddeltainfo logic should probably move
+    here.
+    """
 
-    # the DELTA_BASE_REUSE_FORCE case should have been taken care of sooner,
-    # so we should never end up asking such a question. The assert acts as a
-    # safeguard to detect anything that would be fishy here.
-    assert (
-        cachedelta is None
-        or cachedelta[2] != DELTA_BASE_REUSE_FORCE
-        or not revlog.delta_config.general_delta
-    )
-
-    deltalength = revlog.length
-    deltaparent = revlog.deltaparent
-    sparse = revlog.delta_config.sparse_revlog
-    good = None
-
-    deltas_limit = textlen * LIMIT_DELTA2TEXT
-    group_chunk_size = revlog.delta_config.candidate_group_chunk_size
-
-    tested = {nullrev}
-    candidates = _refinedgroups(
+    def __init__(
+        self,
         revlog,
+        textlen,
         p1,
         p2,
         cachedelta,
-        snapshot_cache=snapshot_cache,
-    )
-    while True:
-        temptative = candidates.send(good)
-        if temptative is None:
-            break
-        group = []
-        for rev in temptative:
-            # skip over empty deltas (no need to include them in a chain)
-            while not (rev == nullrev or rev in tested or deltalength(rev)):
-                tested.add(rev)
-                rev = deltaparent(rev)
-            # no need to try a delta against nullrev; this will be done as a
-            # last resort.
-            if rev == nullrev:
-                continue
-            # filter out revisions we tested already
-            if rev in tested:
-                continue
+        excluded_bases=None,
+        target_rev=None,
+        snapshot_cache=None,
+    ):
+        self.revlog = revlog
+        self.textlen = textlen
+        self.p1 = p1
+        self.p2 = p2
+        self.cachedelta = cachedelta
+        self.excluded_bases = excluded_bases
+        self.target_rev = target_rev
+        self.snapshot_cache = snapshot_cache
+
+    def candidate_groups(self):
+        """Provides group of revision to be tested as delta base
+
+        This top level function focus on emitting groups with unique and
+        worthwhile content. See _raw_candidate_groups for details about the
+        group order.
+        """
+        # should we try to build a delta?
+        if not (len(self.revlog) and self.revlog._storedeltachains):
+            yield None
+            return
+
+        if self.target_rev is None:
+            self.target_rev = len(self.revlog)
+
+        if not self.revlog.delta_config.general_delta:
+            # before general delta, there is only one possible delta base
+            yield (self.target_rev - 1,)
+            yield None
+            return
 
-            # a higher authority deemed the base unworthy (e.g. censored)
-            if excluded_bases is not None and rev in excluded_bases:
-                tested.add(rev)
-                continue
-            # In some recomputation cases, the rev might be too high in the
-            # revlog
-            if target_rev is not None and rev >= target_rev:
-                tested.add(rev)
-                continue
-            # filter out delta bases that will never produce a good delta
-            if deltas_limit < revlog.length(rev):
-                tested.add(rev)
-                continue
-            if sparse and revlog.rawsize(rev) < (textlen // LIMIT_BASE2TEXT):
-                tested.add(rev)
-                continue
-            # no delta for rawtext-changing revs (see "candelta" for why)
-            if revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
-                tested.add(rev)
-                continue
+        # the DELTA_BASE_REUSE_FORCE case should have been taken care of
+        # sooner, so we should never end up asking such a question. The assert
+        # acts as a safeguard to detect anything that would be fishy here.
+        assert (
+            self.cachedelta is None
+            or self.cachedelta[2] != DELTA_BASE_REUSE_FORCE
+            or not self.revlog.delta_config.general_delta
+        )
+
+        deltalength = self.revlog.length
+        deltaparent = self.revlog.deltaparent
+        sparse = self.revlog.delta_config.sparse_revlog
+        good = None
+
+        deltas_limit = self.textlen * LIMIT_DELTA2TEXT
+        group_chunk_size = self.revlog.delta_config.candidate_group_chunk_size
+
+        tested = {nullrev}
+        candidates = _refinedgroups(
+            self.revlog,
+            self.p1,
+            self.p2,
+            self.cachedelta,
+            snapshot_cache=self.snapshot_cache,
+        )
+        while True:
+            temptative = candidates.send(good)
+            if temptative is None:
+                break
+            group = []
+            for rev in temptative:
+                # skip over empty deltas (no need to include them in a chain)
+                while not (rev == nullrev or rev in tested or deltalength(rev)):
+                    tested.add(rev)
+                    rev = deltaparent(rev)
+                # no need to try a delta against nullrev; this will be done as
+                # a last resort.
+                if rev == nullrev:
+                    continue
+                # filter out revisions we tested already
+                if rev in tested:
+                    continue
 
-            # If we reach here, we are about to build and test a delta.
-            # The delta building process will compute the chaininfo in all
-            # cases; since that computation is cached, it is fine to access it
-            # here too.
-            chainlen, chainsize = revlog._chaininfo(rev)
-            # if the chain will be too long, skip this base
-            if (
-                revlog.delta_config.max_chain_len
-                and chainlen >= revlog.delta_config.max_chain_len
-            ):
-                tested.add(rev)
-                continue
-            # if the chain already has too much data, skip this base
-            if deltas_limit < chainsize:
-                tested.add(rev)
-                continue
-            if sparse and revlog.delta_config.upper_bound_comp is not None:
-                maxcomp = revlog.delta_config.upper_bound_comp
-                basenotsnap = (p1, p2, nullrev)
-                if rev not in basenotsnap and revlog.issnapshot(rev):
-                    snapshotdepth = revlog.snapshotdepth(rev)
-                    # If text is significantly larger than the base, we can
-                    # expect the resulting delta to be proportional to the size
-                    # difference
-                    revsize = revlog.rawsize(rev)
-                    rawsizedistance = max(textlen - revsize, 0)
-                    # use an estimate of the compression upper bound.
-                    lowestrealisticdeltalen = rawsizedistance // maxcomp
+                # a higher authority deemed the base unworthy (e.g. censored)
+                if (
+                    self.excluded_bases is not None
+                    and rev in self.excluded_bases
+                ):
+                    tested.add(rev)
+                    continue
+                # In some recomputation cases, the rev might be too high in
+                # the revlog
+                if self.target_rev is not None and rev >= self.target_rev:
+                    tested.add(rev)
+                    continue
+                # filter out delta bases that will never produce a good delta
+                if deltas_limit < self.revlog.length(rev):
+                    tested.add(rev)
+                    continue
+                if sparse and self.revlog.rawsize(rev) < (
+                    self.textlen // LIMIT_BASE2TEXT
+                ):
+                    tested.add(rev)
+                    continue
+                # no delta for rawtext-changing revs (see "candelta" for why)
+                if self.revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
+                    tested.add(rev)
+                    continue
 
-                    # check the absolute constraint on the delta size
-                    snapshotlimit = textlen >> snapshotdepth
-                    if snapshotlimit < lowestrealisticdeltalen:
-                        # delta lower bound is larger than accepted upper bound
-                        tested.add(rev)
-                        continue
-
-                    # check the relative constraint on the delta size
-                    revlength = revlog.length(rev)
-                    if revlength < lowestrealisticdeltalen:
-                        # delta's probable lower bound is larger than target base
-                        tested.add(rev)
-                        continue
+                # If we reach here, we are about to build and test a delta.
+                # The delta building process will compute the chaininfo in all
+                # cases; since that computation is cached, it is fine to access
+                # it here too.
+                chainlen, chainsize = self.revlog._chaininfo(rev)
+                # if the chain will be too long, skip this base
+                if (
+                    self.revlog.delta_config.max_chain_len
+                    and chainlen >= self.revlog.delta_config.max_chain_len
+                ):
+                    tested.add(rev)
+                    continue
+                # if the chain already has too much data, skip this base
+                if deltas_limit < chainsize:
+                    tested.add(rev)
+                    continue
+                if (
+                    sparse
+                    and self.revlog.delta_config.upper_bound_comp is not None
+                ):
+                    maxcomp = self.revlog.delta_config.upper_bound_comp
+                    basenotsnap = (self.p1, self.p2, nullrev)
+                    if rev not in basenotsnap and self.revlog.issnapshot(rev):
+                        snapshotdepth = self.revlog.snapshotdepth(rev)
+                        # If text is significantly larger than the base, we
+                        # can expect the resulting delta to be proportional to
+                        # the size difference
+                        revsize = self.revlog.rawsize(rev)
+                        rawsizedistance = max(self.textlen - revsize, 0)
+                        # use an estimate of the compression upper bound.
+                        lowestrealisticdeltalen = rawsizedistance // maxcomp
 
-            group.append(rev)
-        if group:
-            # When the candidate group is large, it can have a significant
-            # performance impact. To reduce this, we can send the candidates
-            # in smaller batches until a new batch does not provide any
-            # improvement.
-            #
-            # This might reduce the overall efficiency of the compression in
-            # some corner cases, but that should also prevent very pathological
-            # cases from being an issue (e.g. 20,000 candidates).
-            #
-            # XXX note that the ordering of the group becomes important as it
-            # now impacts the final result. The current order is unprocessed
-            # and can be improved.
-            if group_chunk_size == 0:
-                tested.update(group)
-                good = yield tuple(group)
-            else:
-                prev_good = good
-                for start in range(0, len(group), group_chunk_size):
-                    sub_group = group[start : start + group_chunk_size]
-                    tested.update(sub_group)
-                    good = yield tuple(sub_group)
-                    if prev_good == good:
-                        break
+                        # check the absolute constraint on the delta size
+                        snapshotlimit = self.textlen >> snapshotdepth
+                        if snapshotlimit < lowestrealisticdeltalen:
+                            # delta lower bound is larger than accepted upper
+                            # bound
+                            tested.add(rev)
+                            continue
+
+                        # check the relative constraint on the delta size
+                        revlength = self.revlog.length(rev)
+                        if revlength < lowestrealisticdeltalen:
+                            # delta's probable lower bound is larger than
+                            # target base
+                            tested.add(rev)
+                            continue
 
-    yield None
+                group.append(rev)
+            if group:
+                # When the candidate group is large, it can have a significant
+                # performance impact. To reduce this, we can send the
+                # candidates in smaller batches until a new batch does not
+                # provide any improvement.
+                #
+                # This might reduce the overall efficiency of the compression
+                # in some corner cases, but that should also prevent very
+                # pathological cases from being an issue (e.g. 20,000
+                # candidates).
+                #
+                # XXX note that the ordering of the group becomes important as
+                # it now impacts the final result. The current order is
+                # unprocessed and can be improved.
+                if group_chunk_size == 0:
+                    tested.update(group)
+                    good = yield tuple(group)
+                else:
+                    prev_good = good
+                    for start in range(0, len(group), group_chunk_size):
+                        sub_group = group[start : start + group_chunk_size]
+                        tested.update(sub_group)
+                        good = yield tuple(sub_group)
+                        if prev_good == good:
+                            break
+
+        yield None
 
 
 def _refinedgroups(revlog, p1, p2, cachedelta, snapshot_cache=None):
@@ -1410,7 +1440,7 @@
             msg %= target_rev
             self._write_debug(msg)
 
-        groups = _candidategroups(
+        search = _DeltaSearch(
             self.revlog,
             revinfo.textlen,
             p1r,
@@ -1420,6 +1450,8 @@
             target_rev,
             snapshot_cache=self._snapshot_cache,
         )
+
+        groups = search.candidate_groups()
         candidaterevs = next(groups)
         while candidaterevs is not None:
             dbg_try_rounds += 1