delta-find: move pre-filtering of candidates in its own function
authorPierre-Yves David <pierre-yves.david@octobus.net>
Thu, 23 Nov 2023 04:21:07 +0100
changeset 51335 02cc19f4f091
parent 51334 d0d869fccd20
child 51336 898c212e1b2f
delta-find: move pre-filtering of candidates in its own function This organise the code further and open the way to specialization via sub-classing. Something important for the coming changes.
mercurial/revlogutils/deltas.py
--- a/mercurial/revlogutils/deltas.py	Fri Dec 29 13:35:08 2023 +0100
+++ b/mercurial/revlogutils/deltas.py	Thu Nov 23 04:21:07 2023 +0100
@@ -675,12 +675,8 @@
             yield None
             return
 
-        deltalength = self.revlog.length
-        deltaparent = self.revlog.deltaparent
-        sparse = self.revlog.delta_config.sparse_revlog
         good = None
 
-        deltas_limit = self.textlen * LIMIT_DELTA2TEXT
         group_chunk_size = self.revlog.delta_config.candidate_group_chunk_size
 
         tested = self.tested  # prefetch for speed and code compactness
@@ -689,98 +685,7 @@
             temptative = candidates.send(good)
             if temptative is None:
                 break
-            group = []
-            for rev in temptative:
-                # skip over empty delta (no need to include them in a chain)
-                while not (rev == nullrev or rev in tested or deltalength(rev)):
-                    tested.add(rev)
-                    rev = deltaparent(rev)
-                # no need to try a delta against nullrev, this will be done as
-                # a last resort.
-                if rev == nullrev:
-                    continue
-                # filter out revision we tested already
-                if rev in tested:
-                    continue
-
-                # an higher authority deamed the base unworthy (e.g. censored)
-                if (
-                    self.excluded_bases is not None
-                    and rev in self.excluded_bases
-                ):
-                    tested.add(rev)
-                    continue
-                # We are in some recomputation cases and that rev is too high
-                # in the revlog
-                if self.target_rev is not None and rev >= self.target_rev:
-                    tested.add(rev)
-                    continue
-                # filter out delta base that will never produce good delta
-                #
-                # if the delta of that base is already bigger than the limit
-                # for the delta chain size, doing a delta is hopeless.
-                if deltas_limit < self.revlog.length(rev):
-                    tested.add(rev)
-                    continue
-                if sparse and self.revlog.rawsize(rev) < (
-                    self.textlen // LIMIT_BASE2TEXT
-                ):
-                    tested.add(rev)
-                    continue
-                # no delta for rawtext-changing revs (see "candelta" for why)
-                if self.revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
-                    tested.add(rev)
-                    continue
-
-                # If we reach here, we are about to build and test a delta.
-                # The delta building process will compute the chaininfo in all
-                # case, since that computation is cached, it is fine to access
-                # it here too.
-                chainlen, chainsize = self.revlog._chaininfo(rev)
-                # if chain will be too long, skip base
-                if (
-                    self.revlog.delta_config.max_chain_len
-                    and chainlen >= self.revlog.delta_config.max_chain_len
-                ):
-                    tested.add(rev)
-                    continue
-                # if chain already have too much data, skip base
-                if deltas_limit < chainsize:
-                    tested.add(rev)
-                    continue
-                if (
-                    sparse
-                    and self.revlog.delta_config.upper_bound_comp is not None
-                ):
-                    maxcomp = self.revlog.delta_config.upper_bound_comp
-                    basenotsnap = (self.p1, self.p2, nullrev)
-                    if rev not in basenotsnap and self.revlog.issnapshot(rev):
-                        snapshotdepth = self.revlog.snapshotdepth(rev)
-                        # If text is significantly larger than the base, we can
-                        # expect the resulting delta to be proportional to the size
-                        # difference
-                        revsize = self.revlog.rawsize(rev)
-                        rawsizedistance = max(self.textlen - revsize, 0)
-                        # use an estimate of the compression upper bound.
-                        lowestrealisticdeltalen = rawsizedistance // maxcomp
-
-                        # check the absolute constraint on the delta size
-                        snapshotlimit = self.textlen >> snapshotdepth
-                        if snapshotlimit < lowestrealisticdeltalen:
-                            # delta lower bound is larger than accepted upper
-                            # bound
-                            tested.add(rev)
-                            continue
-
-                        # check the relative constraint on the delta size
-                        revlength = self.revlog.length(rev)
-                        if revlength < lowestrealisticdeltalen:
-                            # delta probable lower bound is larger than target
-                            # base
-                            tested.add(rev)
-                            continue
-
-                group.append(rev)
+            group = self._pre_filter_candidate_revs(temptative)
             if group:
                 # When the size of the candidate group is big, it can result in
                 # a quite significant performance impact. To reduce this, we
@@ -809,6 +714,114 @@
 
         yield None
 
+    def _pre_filter_candidate_revs(self, temptative):
+        """filter possible candidate before computing a delta
+
+        This function use various criteria to pre-filter candidate delta base
+        before we compute a delta and evaluate its quality.
+
+        Such pre-filter limit the number of computed delta, an expensive operation.
+
+        return the updated list of revision to test
+        """
+        deltalength = self.revlog.length
+        deltaparent = self.revlog.deltaparent
+        deltas_limit = self.revinfo.textlen * LIMIT_DELTA2TEXT
+
+        sparse = self.revlog.delta_config.sparse_revlog
+        tested = self.tested
+        group = []
+        for rev in temptative:
+            # skip over empty delta (no need to include them in a chain)
+            while not (rev == nullrev or rev in tested or deltalength(rev)):
+                tested.add(rev)
+                rev = deltaparent(rev)
+            # no need to try a delta against nullrev, this will be done as
+            # a last resort.
+            if rev == nullrev:
+                continue
+            # filter out revision we tested already
+            if rev in tested:
+                continue
+
+            # an higher authority deamed the base unworthy (e.g. censored)
+            if self.excluded_bases is not None and rev in self.excluded_bases:
+                tested.add(rev)
+                continue
+            # We are in some recomputation cases and that rev is too high
+            # in the revlog
+            if self.target_rev is not None and rev >= self.target_rev:
+                tested.add(rev)
+                continue
+            # filter out delta base that will never produce good delta
+            #
+            # if the delta of that base is already bigger than the limit
+            # for the delta chain size, doing a delta is hopeless.
+            if deltas_limit < self.revlog.length(rev):
+                tested.add(rev)
+                continue
+
+            # if the revision we test again is too small, the resulting delta
+            # will be large anyway as that amount of data to be added is big
+            if sparse and self.revlog.rawsize(rev) < (
+                self.textlen // LIMIT_BASE2TEXT
+            ):
+                tested.add(rev)
+                continue
+
+            # no delta for rawtext-changing revs (see "candelta" for why)
+            if self.revlog.flags(rev) & REVIDX_RAWTEXT_CHANGING_FLAGS:
+                tested.add(rev)
+                continue
+
+            # If we reach here, we are about to build and test a delta.
+            # The delta building process will compute the chaininfo in all
+            # case, since that computation is cached, it is fine to access
+            # it here too.
+            chainlen, chainsize = self.revlog._chaininfo(rev)
+            # if chain will be too long, skip base
+            if (
+                self.revlog.delta_config.max_chain_len
+                and chainlen >= self.revlog.delta_config.max_chain_len
+            ):
+                tested.add(rev)
+                continue
+            # if chain already have too much data, skip base
+            if deltas_limit < chainsize:
+                tested.add(rev)
+                continue
+            if sparse and self.revlog.delta_config.upper_bound_comp is not None:
+                maxcomp = self.revlog.delta_config.upper_bound_comp
+                basenotsnap = (self.p1, self.p2, nullrev)
+                if rev not in basenotsnap and self.revlog.issnapshot(rev):
+                    snapshotdepth = self.revlog.snapshotdepth(rev)
+                    # If text is significantly larger than the base, we can
+                    # expect the resulting delta to be proportional to the size
+                    # difference
+                    revsize = self.revlog.rawsize(rev)
+                    rawsizedistance = max(self.textlen - revsize, 0)
+                    # use an estimate of the compression upper bound.
+                    lowestrealisticdeltalen = rawsizedistance // maxcomp
+
+                    # check the absolute constraint on the delta size
+                    snapshotlimit = self.textlen >> snapshotdepth
+                    if snapshotlimit < lowestrealisticdeltalen:
+                        # delta lower bound is larger than accepted upper
+                        # bound
+                        tested.add(rev)
+                        continue
+
+                    # check the relative constraint on the delta size
+                    revlength = self.revlog.length(rev)
+                    if revlength < lowestrealisticdeltalen:
+                        # delta probable lower bound is larger than target
+                        # base
+                        tested.add(rev)
+                        continue
+
+            group.append(rev)
+        return group
+
     def _refined_groups(self):
         good = None
         # First we try to reuse a the delta contained in the bundle.  (or from