changeset 50672:3b56395404a1

stream-clone: avoid opening a revlog in case we do not need it Opening a revlog has a cost, especially if it is inline as we have to scan the file and construct an index. To prevent the associated slowdown, we just do a minimal scan to check that an inline file is still inline, and simply stream the file without creating a revlog when we can. This provides a big boost compared to the previous changeset, even if the full generation is still penalized by the initial gathering of information. All benchmarks are run on linux with Python 3.10.7. # benchmark.name = hg.exchange.stream.generate # benchmark.variants.version = v2 ### Compared to the previous changeset We get a large win all across the board! # mercurial-2018-08-01-zstd-sparse-revlog before: 0.250694 seconds after: 0.105986 seconds (-57.72%) # pypy-2018-08-01-zstd-sparse-revlog before: 3.885657 seconds after: 1.709748 seconds (-56.00%) # netbeans-2018-08-01-zstd-sparse-revlog before: 16.679371 seconds after: 7.687469 seconds (-53.91%) # mozilla-central-2018-08-01-zstd-sparse-revlog before: 38.575482 seconds after: 17.520316 seconds (-54.58%) # mozilla-try-2019-02-18-zstd-sparse-revlog before: 81.160994 seconds after: 37.073753 seconds (-54.32%) ### Compared to 6.4.3 We are still significantly slower than 6.4.3; the extra time is usually about twice the extra time we observe on the locked section, which is quite interesting information. The exception is mozilla-central, which is much faster. That discrepancy is not really explained yet. 
# mercurial-2018-08-01-zstd-sparse-revlog 6.4.3: 0.072560 seconds after: 0.105986 seconds (+46.07%) (+0.03 seconds) # pypy-2018-08-01-zstd-sparse-revlog 6.4.3: 1.211193 seconds after: 1.709748 seconds (+41.16%) (+0.50 seconds) # netbeans-2018-08-01-zstd-sparse-revlog 6.4.3: 4.932843 seconds after: 7.687469 seconds (+55.84%) (+2.75 seconds) # mozilla-central-2018-08-01-zstd-sparse-revlog 6.4.3: 34.012226 seconds after: 17.520316 seconds (-48.49%) (-16.49 seconds) # mozilla-try-2019-02-18-zstd-sparse-revlog 6.4.3: 23.850555 seconds after: 37.073753 seconds (+55.44%) (+13.22 seconds)
author Arseniy Alekseyev <aalekseyev@janestreet.com>
date Wed, 31 May 2023 10:37:55 +0100
parents e06d1a779eb6
children 5d84b1385f7f
files mercurial/revlog.py mercurial/store.py
diffstat 2 files changed, 61 insertions(+), 21 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/revlog.py	Tue May 30 17:43:59 2023 +0100
+++ b/mercurial/revlog.py	Wed May 31 10:37:55 2023 +0100
@@ -290,6 +290,16 @@
 
     _flagserrorclass = error.RevlogError
 
+    @staticmethod
+    def is_inline_index(header_bytes):
+        header = INDEX_HEADER.unpack(header_bytes)[0]
+
+        _format_flags = header & ~0xFFFF
+        _format_version = header & 0xFFFF
+
+        features = FEATURES_BY_VERSION[_format_version]
+        return features[b'inline'](_format_flags)
+
     def __init__(
         self,
         opener,
--- a/mercurial/store.py	Tue May 30 17:43:59 2023 +0100
+++ b/mercurial/store.py	Wed May 31 10:37:55 2023 +0100
@@ -16,6 +16,9 @@
 from .pycompat import getattr
 from .thirdparty import attr
 from .node import hex
+from .revlogutils.constants import (
+    INDEX_HEADER,
+)
 from . import (
     changelog,
     error,
@@ -23,6 +26,7 @@
     manifest,
     policy,
     pycompat,
+    revlog as revlogmod,
     util,
     vfs as vfsmod,
 )
@@ -619,44 +623,70 @@
         copies=None,
         max_changeset=None,
     ):
-        if repo is None or max_changeset is None:
-            return super().get_streams(
-                repo=repo,
-                vfs=vfs,
-                copies=copies,
-                max_changeset=max_changeset,
-            )
-        if any(k.endswith(b'.idx') for k in self._details.keys()):
+        if (
+            repo is None
+            or max_changeset is None
             # This use revlog-v2, ignore for now
+            or any(k.endswith(b'.idx') for k in self._details.keys())
+            # This is not inline, no race expected
+            or b'.d' in self._details
+        ):
             return super().get_streams(
                 repo=repo,
                 vfs=vfs,
                 copies=copies,
                 max_changeset=max_changeset,
             )
-        name_to_ext = {}
-        for ext in self._details.keys():
-            name_to_ext[self._path_prefix + ext] = ext
+
         name_to_size = {}
         for f in self.files():
             name_to_size[f.unencoded_path] = f.file_size(None)
+
         stream = [
             f.get_stream(vfs, copies)
             for f in self.files()
-            if name_to_ext[f.unencoded_path] not in (b'.d', b'.i')
+            if not f.unencoded_path.endswith(b'.i')
         ]
 
-        is_inline = b'.d' not in self._details
+        index_path = self._path_prefix + b'.i'
 
-        rl = self.get_revlog_instance(repo).get_revlog()
-        rl_stream = rl.get_streams(max_changeset, force_inline=is_inline)
+        index_file = None
+        try:
+            index_file = vfs(index_path)
+            header = index_file.read(INDEX_HEADER.size)
+            if revlogmod.revlog.is_inline_index(header):
+                size = name_to_size[index_path]
 
-        for name, s, size in rl_stream:
-            if name_to_size.get(name, 0) != size:
-                msg = _(b"expected %d bytes but %d provided for %s")
-                msg %= name_to_size.get(name, 0), size, name
-                raise error.Abort(msg)
-        stream.extend(rl_stream)
+                # no split underneath, just return the stream
+                def get_stream():
+                    fp = index_file
+                    try:
+                        fp.seek(0)
+                        yield None
+                        if size <= 65536:
+                            yield fp.read(size)
+                        else:
+                            yield from util.filechunkiter(fp, limit=size)
+                    finally:
+                        fp.close()
+
+                s = get_stream()
+                next(s)
+                index_file = None
+                stream.append((index_path, s, size))
+            else:
+                rl = self.get_revlog_instance(repo).get_revlog()
+                rl_stream = rl.get_streams(max_changeset, force_inline=True)
+                for name, s, size in rl_stream:
+                    if name_to_size.get(name, 0) != size:
+                        msg = _(b"expected %d bytes but %d provided for %s")
+                        msg %= name_to_size.get(name, 0), size, name
+                        raise error.Abort(msg)
+                stream.extend(rl_stream)
+        finally:
+            if index_file is not None:
+                index_file.close()
+
         files = self.files()
         assert len(stream) == len(files), (
             stream,