stream-clone: avoid opening a revlog in case we do not need it
Opening a revlog has a cost, especially if it is inline, as we have to scan the
file and construct an index.
To prevent the associated slowdown, we just do a minimal scan to check that an
inline file is still inline, and simply stream the file without creating a
revlog when we can.
This provides a big boost compared to the previous changeset, even if the full
generation is still penalized by the initial gathering of information.
All benchmarks are run on linux with Python 3.10.7.
# benchmark.name = hg.exchange.stream.generate
# benchmark.variants.version = v2
### Compared to the previous changesets
We get a large win all across the board!
# mercurial-2018-08-01-zstd-sparse-revlog
before: 0.250694 seconds
after: 0.105986 seconds (-57.72%)
# pypy-2018-08-01-zstd-sparse-revlog
before: 3.885657 seconds
after: 1.709748 seconds (-56.00%)
# netbeans-2018-08-01-zstd-sparse-revlog
before: 16.679371 seconds
after: 7.687469 seconds (-53.91%)
# mozilla-central-2018-08-01-zstd-sparse-revlog
before: 38.575482 seconds
after: 17.520316 seconds (-54.58%)
# mozilla-try-2019-02-18-zstd-sparse-revlog
before: 81.160994 seconds
after: 37.073753 seconds (-54.32%)
### Compared to 6.4.3
We are still significantly slower than 6.4.3. The extra time is usually twice
as large as the extra time we observe on the locked section, which is quite
interesting information.
The exception is mozilla-central, which is much faster. That discrepancy is not
really explained yet.
# mercurial-2018-08-01-zstd-sparse-revlog
6.4.3: 0.072560 seconds
after: 0.105986 seconds (+46.07%) (+0.03 seconds)
# pypy-2018-08-01-zstd-sparse-revlog
6.4.3: 1.211193 seconds
after: 1.709748 seconds (+41.16%) (+0.50 seconds)
# netbeans-2018-08-01-zstd-sparse-revlog
6.4.3: 4.932843 seconds
after: 7.687469 seconds (+55.84%) (+2.75 seconds)
# mozilla-central-2018-08-01-zstd-sparse-revlog
6.4.3: 34.012226 seconds
after: 17.520316 seconds (-48.49%) (-16.49 seconds)
# mozilla-try-2019-02-18-zstd-sparse-revlog
6.4.3: 23.850555 seconds
after: 37.073753 seconds (+55.44%) (+13.22 seconds)
--- a/mercurial/revlog.py Tue May 30 17:43:59 2023 +0100
+++ b/mercurial/revlog.py Wed May 31 10:37:55 2023 +0100
@@ -290,6 +290,16 @@
_flagserrorclass = error.RevlogError
+ @staticmethod
+ def is_inline_index(header_bytes):
+ header = INDEX_HEADER.unpack(header_bytes)[0]
+
+ _format_flags = header & ~0xFFFF
+ _format_version = header & 0xFFFF
+
+ features = FEATURES_BY_VERSION[_format_version]
+ return features[b'inline'](_format_flags)
+
def __init__(
self,
opener,
--- a/mercurial/store.py Tue May 30 17:43:59 2023 +0100
+++ b/mercurial/store.py Wed May 31 10:37:55 2023 +0100
@@ -16,6 +16,9 @@
from .pycompat import getattr
from .thirdparty import attr
from .node import hex
+from .revlogutils.constants import (
+ INDEX_HEADER,
+)
from . import (
changelog,
error,
@@ -23,6 +26,7 @@
manifest,
policy,
pycompat,
+ revlog as revlogmod,
util,
vfs as vfsmod,
)
@@ -619,44 +623,70 @@
copies=None,
max_changeset=None,
):
- if repo is None or max_changeset is None:
- return super().get_streams(
- repo=repo,
- vfs=vfs,
- copies=copies,
- max_changeset=max_changeset,
- )
- if any(k.endswith(b'.idx') for k in self._details.keys()):
+ if (
+ repo is None
+ or max_changeset is None
# This use revlog-v2, ignore for now
+ or any(k.endswith(b'.idx') for k in self._details.keys())
+ # This is not inline, no race expected
+ or b'.d' in self._details
+ ):
return super().get_streams(
repo=repo,
vfs=vfs,
copies=copies,
max_changeset=max_changeset,
)
- name_to_ext = {}
- for ext in self._details.keys():
- name_to_ext[self._path_prefix + ext] = ext
+
name_to_size = {}
for f in self.files():
name_to_size[f.unencoded_path] = f.file_size(None)
+
stream = [
f.get_stream(vfs, copies)
for f in self.files()
- if name_to_ext[f.unencoded_path] not in (b'.d', b'.i')
+ if not f.unencoded_path.endswith(b'.i')
]
- is_inline = b'.d' not in self._details
+ index_path = self._path_prefix + b'.i'
- rl = self.get_revlog_instance(repo).get_revlog()
- rl_stream = rl.get_streams(max_changeset, force_inline=is_inline)
+ index_file = None
+ try:
+ index_file = vfs(index_path)
+ header = index_file.read(INDEX_HEADER.size)
+ if revlogmod.revlog.is_inline_index(header):
+ size = name_to_size[index_path]
- for name, s, size in rl_stream:
- if name_to_size.get(name, 0) != size:
- msg = _(b"expected %d bytes but %d provided for %s")
- msg %= name_to_size.get(name, 0), size, name
- raise error.Abort(msg)
- stream.extend(rl_stream)
+ # no split underneath, just return the stream
+ def get_stream():
+ fp = index_file
+ try:
+ fp.seek(0)
+ yield None
+ if size <= 65536:
+ yield fp.read(size)
+ else:
+ yield from util.filechunkiter(fp, limit=size)
+ finally:
+ fp.close()
+
+ s = get_stream()
+ next(s)
+ index_file = None
+ stream.append((index_path, s, size))
+ else:
+ rl = self.get_revlog_instance(repo).get_revlog()
+ rl_stream = rl.get_streams(max_changeset, force_inline=True)
+ for name, s, size in rl_stream:
+ if name_to_size.get(name, 0) != size:
+ msg = _(b"expected %d bytes but %d provided for %s")
+ msg %= name_to_size.get(name, 0), size, name
+ raise error.Abort(msg)
+ stream.extend(rl_stream)
+ finally:
+ if index_file is not None:
+ index_file.close()
+
files = self.files()
assert len(stream) == len(files), (
stream,