Mercurial > hg
changeset 50672:3b56395404a1
stream-clone: avoid opening a revlog in case we do not need it
Opening a revlog has a cost, especially if it is inline as we have to scan the
file and construct an index.
To prevent the associated slowdown, we just do a minimal scan to check that an
inline file is still inline, and simply stream the file without creating a
revlog when we can.
This provides a big boost compared to the previous changeset, even if the full
generation is still penalized by the initial gathering of information.
All benchmarks are run on linux with Python 3.10.7.
# benchmark.name = hg.exchange.stream.generate
# benchmark.variants.version = v2
### Compared to the previous changesets
We get a large win all across the board!
# mercurial-2018-08-01-zstd-sparse-revlog
before: 0.250694 seconds
after: 0.105986 seconds (-57.72%)
# pypy-2018-08-01-zstd-sparse-revlog
before: 3.885657 seconds
after: 1.709748 seconds (-56.00%)
# netbeans-2018-08-01-zstd-sparse-revlog
before: 16.679371 seconds
after: 7.687469 seconds (-53.91%)
# mozilla-central-2018-08-01-zstd-sparse-revlog
before: 38.575482 seconds
after: 17.520316 seconds (-54.58%)
# mozilla-try-2019-02-18-zstd-sparse-revlog
before: 81.160994 seconds
after: 37.073753 seconds (-54.32%)
### Compared to 6.4.3
We are still significantly slower than 6.4.3. The extra time is usually about
twice the extra time we observe on the locked section, which is quite an
interesting piece of information.
Except for mozilla-central, which is much faster. That discrepancy is not really
explained yet.
# mercurial-2018-08-01-zstd-sparse-revlog
6.4.3: 0.072560 seconds
after: 0.105986 seconds (+46.07%) (+0.03 seconds)
# pypy-2018-08-01-zstd-sparse-revlog
6.4.3: 1.211193 seconds
after: 1.709748 seconds (+41.16%) (+0.50 seconds)
# netbeans-2018-08-01-zstd-sparse-revlog
6.4.3: 4.932843 seconds
after: 7.687469 seconds (+55.84%) (+2.75 seconds)
# mozilla-central-2018-08-01-zstd-sparse-revlog
6.4.3: 34.012226 seconds
after: 17.520316 seconds (-48.49%) (-16.49 seconds)
# mozilla-try-2019-02-18-zstd-sparse-revlog
6.4.3: 23.850555 seconds
after: 37.073753 seconds (+55.44%) (+13.22 seconds)
author | Arseniy Alekseyev <aalekseyev@janestreet.com> |
---|---|
date | Wed, 31 May 2023 10:37:55 +0100 |
parents | e06d1a779eb6 |
children | 5d84b1385f7f |
files | mercurial/revlog.py mercurial/store.py |
diffstat | 2 files changed, 61 insertions(+), 21 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/revlog.py Tue May 30 17:43:59 2023 +0100 +++ b/mercurial/revlog.py Wed May 31 10:37:55 2023 +0100 @@ -290,6 +290,16 @@ _flagserrorclass = error.RevlogError + @staticmethod + def is_inline_index(header_bytes): + header = INDEX_HEADER.unpack(header_bytes)[0] + + _format_flags = header & ~0xFFFF + _format_version = header & 0xFFFF + + features = FEATURES_BY_VERSION[_format_version] + return features[b'inline'](_format_flags) + def __init__( self, opener,
--- a/mercurial/store.py Tue May 30 17:43:59 2023 +0100 +++ b/mercurial/store.py Wed May 31 10:37:55 2023 +0100 @@ -16,6 +16,9 @@ from .pycompat import getattr from .thirdparty import attr from .node import hex +from .revlogutils.constants import ( + INDEX_HEADER, +) from . import ( changelog, error, @@ -23,6 +26,7 @@ manifest, policy, pycompat, + revlog as revlogmod, util, vfs as vfsmod, ) @@ -619,44 +623,70 @@ copies=None, max_changeset=None, ): - if repo is None or max_changeset is None: - return super().get_streams( - repo=repo, - vfs=vfs, - copies=copies, - max_changeset=max_changeset, - ) - if any(k.endswith(b'.idx') for k in self._details.keys()): + if ( + repo is None + or max_changeset is None # This use revlog-v2, ignore for now + or any(k.endswith(b'.idx') for k in self._details.keys()) + # This is not inline, no race expected + or b'.d' in self._details + ): return super().get_streams( repo=repo, vfs=vfs, copies=copies, max_changeset=max_changeset, ) - name_to_ext = {} - for ext in self._details.keys(): - name_to_ext[self._path_prefix + ext] = ext + name_to_size = {} for f in self.files(): name_to_size[f.unencoded_path] = f.file_size(None) + stream = [ f.get_stream(vfs, copies) for f in self.files() - if name_to_ext[f.unencoded_path] not in (b'.d', b'.i') + if not f.unencoded_path.endswith(b'.i') ] - is_inline = b'.d' not in self._details + index_path = self._path_prefix + b'.i' - rl = self.get_revlog_instance(repo).get_revlog() - rl_stream = rl.get_streams(max_changeset, force_inline=is_inline) + index_file = None + try: + index_file = vfs(index_path) + header = index_file.read(INDEX_HEADER.size) + if revlogmod.revlog.is_inline_index(header): + size = name_to_size[index_path] - for name, s, size in rl_stream: - if name_to_size.get(name, 0) != size: - msg = _(b"expected %d bytes but %d provided for %s") - msg %= name_to_size.get(name, 0), size, name - raise error.Abort(msg) - stream.extend(rl_stream) + # no split underneath, just return the stream + def 
get_stream(): + fp = index_file + try: + fp.seek(0) + yield None + if size <= 65536: + yield fp.read(size) + else: + yield from util.filechunkiter(fp, limit=size) + finally: + fp.close() + + s = get_stream() + next(s) + index_file = None + stream.append((index_path, s, size)) + else: + rl = self.get_revlog_instance(repo).get_revlog() + rl_stream = rl.get_streams(max_changeset, force_inline=True) + for name, s, size in rl_stream: + if name_to_size.get(name, 0) != size: + msg = _(b"expected %d bytes but %d provided for %s") + msg %= name_to_size.get(name, 0), size, name + raise error.Abort(msg) + stream.extend(rl_stream) + finally: + if index_file is not None: + index_file.close() + files = self.files() assert len(stream) == len(files), ( stream,