# HG changeset patch
# User Raphaël Gomès <rgomes@octobus.net>
# Date 1732114399 -3600
# Node ID 51a350a22d0c8f8c85801d1782fd08f953676e1a
# Parent  65d516db7309848a472bc236409460eae8922cdb
# Parent  854e2b9bca57ff6aedb98e38d6992c8bc3118338
branching: merge stable into default

diff -r 854e2b9bca57 -r 51a350a22d0c contrib/perf.py
--- a/contrib/perf.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/contrib/perf.py	Wed Nov 20 15:53:19 2024 +0100
@@ -3780,6 +3780,11 @@
     rl = cmdutil.openrevlog(repo, b'perfrevlogchunks', file_, opts)
 
+    if rl.uses_rust:
+        raise NotImplementedError(
+            "perfrevlogchunks is not implemented for the Rust revlog"
+        )
+
     # - _chunkraw was renamed to _getsegmentforrevs
     # - _getsegmentforrevs was moved on the inner object
     try:
@@ -3960,6 +3965,10 @@
         raise error.CommandError(b'perfrevlogrevision', b'invalid arguments')
 
     r = cmdutil.openrevlog(repo, b'perfrevlogrevision', file_, opts)
+    if r.uses_rust:
+        raise NotImplementedError(
+            "perfrevlogrevision is not implemented for the Rust revlog"
+        )
 
     # _chunkraw was renamed to _getsegmentforrevs.
     try:
diff -r 854e2b9bca57 -r 51a350a22d0c mercurial/changelog.py
--- a/mercurial/changelog.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/mercurial/changelog.py	Wed Nov 20 15:53:19 2024 +0100
@@ -324,6 +324,7 @@
 
         self._format_flags &= ~revlog.FLAG_GENERALDELTA
         self.delta_config.general_delta = False
+        self.data_config.generaldelta = False
 
         # Delta chains for changelogs tend to be very small because entries
         # tend to be small and don't delta well with each. So disable delta
diff -r 854e2b9bca57 -r 51a350a22d0c mercurial/pure/parsers.py
--- a/mercurial/pure/parsers.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/mercurial/pure/parsers.py	Wed Nov 20 15:53:19 2024 +0100
@@ -672,6 +672,9 @@
             r = (offset,) + r[1:]
         return r
 
+    def __delitem__(self, i):
+        raise NotImplementedError()
+
     def _unpack_entry(self, rev, data):
         r = self.index_format.unpack(data)
         r = r + (
diff -r 854e2b9bca57 -r 51a350a22d0c mercurial/revlog.py
--- a/mercurial/revlog.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/mercurial/revlog.py	Wed Nov 20 15:53:19 2024 +0100
@@ -17,7 +17,6 @@
 import binascii
 import collections
 import contextlib
-import functools
 import io
 import os
 import struct
@@ -83,6 +82,7 @@
 if typing.TYPE_CHECKING:
     # noinspection PyPackageRequirements
     import attr
+    from .pure.parsers import BaseIndexObject
 
 from . import (
     ancestor,
@@ -381,7 +381,7 @@
         default_compression_header,
     ):
         self.opener = opener
-        self.index = index
+        self.index: BaseIndexObject = index
 
         self.index_file = index_file
         self.data_file = data_file
@@ -528,7 +528,9 @@
         generaldelta = self.delta_config.general_delta
         # Try C implementation.
         try:
-            return self.index.deltachain(rev, stoprev, generaldelta)
+            return self.index.deltachain(
+                rev, stoprev, generaldelta
+            )  # pytype: disable=attribute-error
         except AttributeError:
             pass
@@ -1246,6 +1248,71 @@
         return self.canonical_index_file
 
 
+if typing.TYPE_CHECKING:
+    # Tell Pytype what kind of object we expect
+    ProxyBase = BaseIndexObject
+else:
+    ProxyBase = object
+
+
+class RustIndexProxy(ProxyBase):
+    """Wrapper around the Rust index to fake having direct access to the index.
+
+    Rust enforces xor mutability (one mutable reference XOR 1..n non-mutable),
+    so we can't expose the index from Rust directly, since the `InnerRevlog`
+    already has ownership of the index. This object redirects all calls to the
+    index through the Rust-backed `InnerRevlog` glue which defines all
+    necessary forwarding methods.
+    """
+
+    def __init__(self, inner):
+        # Do not rename as it's being used to access the index from Rust
+        self.inner = inner
+
+    # TODO possibly write all index methods manually to save on overhead?
+    def __getattr__(self, name):
+        return getattr(self.inner, f"_index_{name}")
+
+    # Magic methods need to be defined explicitly
+    def __len__(self):
+        return self.inner._index___len__()
+
+    def __getitem__(self, key):
+        return self.inner._index___getitem__(key)
+
+    def __contains__(self, key):
+        return self.inner._index___contains__(key)
+
+    def __delitem__(self, key):
+        return self.inner._index___delitem__(key)
+
+
+class RustVFSWrapper:
+    """Used to wrap a Python VFS to pass it to Rust to lower the overhead of
+    calling back multiple times into Python.
+    """
+
+    def __init__(self, inner):
+        self.inner = inner
+
+    def __call__(
+        self,
+        path: bytes,
+        mode: bytes = b"rb",
+        atomictemp=False,
+        checkambig=False,
+    ):
+        fd = self.inner.__call__(
+            path=path, mode=mode, atomictemp=atomictemp, checkambig=checkambig
+        )
+        # Information that Rust needs to get ownership of the file that's
+        # being opened.
+        return (os.dup(fd.fileno()), fd._tempname if atomictemp else None)
+
+    def __getattr__(self, name):
+        return getattr(self.inner, name)
+
+
 class revlog:
     """
     the underlying revision storage object
@@ -1358,6 +1425,7 @@
         self._trypending = trypending
         self._try_split = try_split
         self._may_inline = may_inline
+        self.uses_rust = False
         self.opener = opener
         if persistentnodemap:
             self._nodemap_file = nodemaputil.get_nodemap_file(self)
@@ -1392,7 +1460,7 @@
         # Maps rev to chain base rev.
         self._chainbasecache = util.lrucachedict(100)
 
-        self.index = None
+        self.index: Optional[BaseIndexObject] = None
         self._docket = None
         self._nodemap_docket = None
         # Mapping of partial identifiers to full nodes.
@@ -1406,8 +1474,8 @@
         # prevent nesting of addgroup
         self._adding_group = None
 
-        chunk_cache = self._loadindex()
-        self._load_inner(chunk_cache)
+        index, chunk_cache = self._loadindex()
+        self._load_inner(index, chunk_cache)
         self._concurrencychecker = concurrencychecker
 
     def _init_opts(self):
@@ -1647,6 +1715,7 @@
             self.delta_config.general_delta = features[b'generaldelta'](
                 self._format_flags
             )
+            self.data_config.generaldelta = self.delta_config.general_delta
             self.feature_config.has_side_data = features[b'sidedata']
 
             if not features[b'docket']:
@@ -1677,6 +1746,7 @@
                 self._inline = False
                 # generaldelta implied by version 2 revlogs.
                 self.delta_config.general_delta = True
+                self.data_config.generaldelta = True
                 # the logic for persistent nodemap will be dealt with within
                 # the main docket, so disable it for now.
                 self._nodemap_file = None
@@ -1705,7 +1775,12 @@
         )
 
         use_rust_index = False
-        if rustrevlog is not None and self._nodemap_file is not None:
+        rust_applicable = self._nodemap_file is not None
+        rust_applicable = rust_applicable or self.target[0] == KIND_FILELOG
+        rust_applicable = rust_applicable and getattr(
+            self.opener, "rust_compatible", True
+        )
+        if rustrevlog is not None and rust_applicable:
             # we would like to use the rust_index in all case, especially
             # because it is necessary for AncestorsIterator and LazyAncestors
             # since the 6.7 cycle.
@@ -1715,6 +1790,20 @@
             # repository.
             use_rust_index = True
 
+            if self._format_version != REVLOGV1:
+                use_rust_index = False
+
+            if hasattr(self.opener, "fncache"):
+                vfs = self.opener.vfs
+                if not self.opener.uses_dotencode:
+                    use_rust_index = False
+                if not isinstance(vfs, vfsmod.vfs):
+                    # Be cautious since we don't support other vfs
+                    use_rust_index = False
+            else:
+                # Rust only supports repos with fncache
+                use_rust_index = False
+
         self._parse_index = parse_index_v1
         if self._format_version == REVLOGV0:
             self._parse_index = revlogv0.parse_index_v0
@@ -1724,58 +1813,103 @@
             self._parse_index = parse_index_cl_v2
         elif devel_nodemap:
             self._parse_index = parse_index_v1_nodemap
-        elif use_rust_index:
-            self._parse_index = functools.partial(
-                parse_index_v1_rust, default_header=new_header
-            )
-        try:
-            d = self._parse_index(index_data, self._inline)
-            index, chunkcache = d
-            use_nodemap = (
-                not self._inline
-                and self._nodemap_file is not None
-                and hasattr(index, 'update_nodemap_data')
-            )
-            if use_nodemap:
-                nodemap_data = nodemaputil.persisted_data(self)
-                if nodemap_data is not None:
-                    docket = nodemap_data[0]
-                    if (
-                        len(d[0]) > docket.tip_rev
-                        and d[0][docket.tip_rev][7] == docket.tip_node
-                    ):
-                        # no changelog tampering
-                        self._nodemap_docket = docket
-                        index.update_nodemap_data(*nodemap_data)
-        except (ValueError, IndexError):
-            raise error.RevlogError(
-                _(b"index %s is corrupted") % self.display_id
-            )
-        self.index = index
+
+        if use_rust_index:
+            # Let the Rust code parse its own index
+            index, chunkcache = (index_data, None)
+            self.uses_rust = True
+        else:
+            try:
+                d = self._parse_index(index_data, self._inline)
+                index, chunkcache = d
+                self._register_nodemap_info(index)
+            except (ValueError, IndexError):
+                raise error.RevlogError(
+                    _(b"index %s is corrupted") % self.display_id
+                )
         # revnum -> (chain-length, sum-delta-length)
         self._chaininfocache = util.lrucachedict(500)
 
-        return chunkcache
-
-    def _load_inner(self, chunk_cache):
+        return index, chunkcache
+
+    def _load_inner(self, index, chunk_cache):
         if self._docket is None:
             default_compression_header = None
         else:
             default_compression_header = (
                 self._docket.default_compression_header
             )
 
-        self._inner = _InnerRevlog(
-            opener=self.opener,
-            index=self.index,
-            index_file=self._indexfile,
-            data_file=self._datafile,
-            sidedata_file=self._sidedatafile,
-            inline=self._inline,
-            data_config=self.data_config,
-            delta_config=self.delta_config,
-            feature_config=self.feature_config,
-            chunk_cache=chunk_cache,
-            default_compression_header=default_compression_header,
+        if self.uses_rust:
+            vfs_is_readonly = False
+            fncache = None
+
+            if hasattr(self.opener, "vfs"):
+                vfs = self.opener
+                if isinstance(vfs, vfsmod.readonlyvfs):
+                    vfs_is_readonly = True
+                    vfs = vfs.vfs
+                fncache = vfs.fncache
+                vfs = vfs.vfs
+            else:
+                vfs = self.opener
+
+            vfs_base = vfs.base
+            assert fncache is not None, "Rust only supports repos with fncache"
+
+            self._inner = rustrevlog.InnerRevlog(
+                vfs_base=vfs_base,
+                fncache=fncache,
+                vfs_is_readonly=vfs_is_readonly,
+                index_data=index,
+                index_file=self._indexfile,
+                data_file=self._datafile,
+                sidedata_file=self._sidedatafile,
+                inline=self._inline,
+                data_config=self.data_config,
+                delta_config=self.delta_config,
+                feature_config=self.feature_config,
+                chunk_cache=chunk_cache,
+                default_compression_header=default_compression_header,
+                revlog_type=self.target[0],
+                use_persistent_nodemap=self._nodemap_file is not None,
+            )
+            self.index = RustIndexProxy(self._inner)
+            self._register_nodemap_info(self.index)
+            self.uses_rust = True
+        else:
+            self._inner = _InnerRevlog(
+                opener=self.opener,
+                index=index,
+                index_file=self._indexfile,
+                data_file=self._datafile,
+                sidedata_file=self._sidedatafile,
+                inline=self._inline,
+                data_config=self.data_config,
+                delta_config=self.delta_config,
+                feature_config=self.feature_config,
+                chunk_cache=chunk_cache,
+                default_compression_header=default_compression_header,
+            )
+            self.index = self._inner.index
+
+    def _register_nodemap_info(self, index):
+        use_nodemap = (
+            not self._inline
+            and self._nodemap_file is not None
+            and hasattr(index, 'update_nodemap_data')
         )
+        if use_nodemap:
+            nodemap_data = nodemaputil.persisted_data(self)
+            if nodemap_data is not None:
+                docket = nodemap_data[0]
+                if (
+                    len(index) > docket.tip_rev
+                    and index[docket.tip_rev][7] == docket.tip_node
+                ):
+                    # no changelog tampering
+                    self._nodemap_docket = docket
+                    index.update_nodemap_data(
+                        *nodemap_data
+                    )  # pytype: disable=attribute-error
 
     def get_revlog(self):
         """simple function to mirror API of other not-really-revlog API"""
@@ -1867,7 +2001,9 @@
             nodemap_data = nodemaputil.persisted_data(self)
             if nodemap_data is not None:
                 self._nodemap_docket = nodemap_data[0]
-                self.index.update_nodemap_data(*nodemap_data)
+                self.index.update_nodemap_data(
+                    *nodemap_data
+                )  # pytype: disable=attribute-error
 
     def rev(self, node):
        """return the revision number associated with a <nodeid>"""
@@ -2366,23 +2502,26 @@
     def headrevs(self, revs=None, stop_rev=None):
         if revs is None:
             return self.index.headrevs(None, stop_rev)
-        assert stop_rev is None
         if rustdagop is not None and self.index.rust_ext_compat:
             return rustdagop.headrevs(self.index, revs)
         return dagop.headrevs(revs, self._uncheckedparentrevs)
 
     def headrevsdiff(self, start, stop):
         try:
-            return self.index.headrevsdiff(start, stop)
+            return self.index.headrevsdiff(
+                start, stop
+            )  # pytype: disable=attribute-error
         except AttributeError:
             return dagop.headrevsdiff(self._uncheckedparentrevs, start, stop)
 
     def computephases(self, roots):
-        return self.index.computephasesmapsets(roots)
+        return self.index.computephasesmapsets(
+            roots
+        )  # pytype: disable=attribute-error
 
     def _head_node_ids(self):
         try:
-            return self.index.head_node_ids()
+            return self.index.head_node_ids()  # pytype: disable=attribute-error
         except AttributeError:
             return [self.node(r) for r in self.headrevs()]
@@ -2440,7 +2579,9 @@
     def _commonancestorsheads(self, *revs):
         """calculate all the heads of the common ancestors of revs"""
         try:
-            ancs = self.index.commonancestorsheads(*revs)
+            ancs = self.index.commonancestorsheads(
+                *revs
+            )  # pytype: disable=attribute-error
         except (AttributeError, OverflowError):  # C implementation failed
             ancs = ancestor.commonancestorsheads(self.parentrevs, *revs)
         return ancs
@@ -2474,7 +2615,7 @@
         try:
             return self.index.reachableroots2(
                 minroot, heads, roots, includepath
-            )
+            )  # pytype: disable=attribute-error
         except AttributeError:
             return dagop._reachablerootspure(
                 self.parentrevs, minroot, roots, heads, includepath
@@ -2485,7 +2626,7 @@
         a, b = self.rev(a), self.rev(b)
         try:
-            ancs = self.index.ancestors(a, b)
+            ancs = self.index.ancestors(a, b)  # pytype: disable=attribute-error
         except (AttributeError, OverflowError):
             ancs = ancestor.ancestors(self.parentrevs, a, b)
         if ancs:
@@ -2532,7 +2673,9 @@
         maybewdir = self.nodeconstants.wdirhex.startswith(id)
         ambiguous = False
         try:
-            partial = self.index.partialmatch(id)
+            partial = self.index.partialmatch(
+                id
+            )  # pytype: disable=attribute-error
             if partial and self.hasnode(partial):
                 if maybewdir:
                     # single 'ff...' match in radix tree, ambiguous with wdir
@@ -2634,7 +2777,10 @@
         if not getattr(self, 'filteredrevs', None):
             try:
-                length = max(self.index.shortest(node), minlength)
+                shortest = self.index.shortest(
+                    node
+                )  # pytype: disable=attribute-error
+                length = max(shortest, minlength)
                 return disambiguate(hexnode, length)
             except error.RevlogError:
                 if node != self.nodeconstants.wdirid:
@@ -4087,7 +4233,9 @@
             ifh.seek(startrev * self.index.entry_size)
             for i, e in enumerate(new_entries):
                 rev = startrev + i
-                self.index.replace_sidedata_info(rev, *e)
+                self.index.replace_sidedata_info(
+                    rev, *e
+                )  # pytype: disable=attribute-error
                 packed = self.index.entry_binary(rev)
                 if rev == 0 and self._docket is None:
                     header = self._format_flags | self._format_version
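Review note (outside the patch): `RustIndexProxy` works because plain attribute access goes through `__getattr__`, while Python looks special methods up on the type, skipping `__getattr__`, which is why `__len__` and friends are spelled out. A self-contained sketch of the same forwarding convention; `FakeInner` and `Proxy` are illustrative names, not Mercurial code:

```python
# Hypothetical stand-in for the Rust-backed InnerRevlog: every index
# method is exposed under an `_index_` prefix, as the proxy expects.
class FakeInner:
    def _index_headrevs(self):
        return [0]

    def _index___len__(self):
        return 1


class Proxy:
    def __init__(self, inner):
        self.inner = inner

    # Plain attribute lookups are forwarded lazily, by name.
    def __getattr__(self, name):
        return getattr(self.inner, f"_index_{name}")

    # len() looks __len__ up on the type, bypassing __getattr__,
    # hence the explicit definition, as in RustIndexProxy.
    def __len__(self):
        return self.inner._index___len__()


proxy = Proxy(FakeInner())
assert proxy.headrevs() == [0]  # resolved through __getattr__
assert len(proxy) == 1          # resolved through the explicit __len__
```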
+ """ + + def __init__(self, inner): + # Do not rename as it's being used to access the index from Rust + self.inner = inner + + # TODO possibly write all index methods manually to save on overhead? + def __getattr__(self, name): + return getattr(self.inner, f"_index_{name}") + + # Magic methods need to be defined explicitely + def __len__(self): + return self.inner._index___len__() + + def __getitem__(self, key): + return self.inner._index___getitem__(key) + + def __contains__(self, key): + return self.inner._index___contains__(key) + + def __delitem__(self, key): + return self.inner._index___delitem__(key) + + +class RustVFSWrapper: + """Used to wrap a Python VFS to pass it to Rust to lower the overhead of + calling back multiple times into Python. + """ + + def __init__(self, inner): + self.inner = inner + + def __call__( + self, + path: bytes, + mode: bytes = b"rb", + atomictemp=False, + checkambig=False, + ): + fd = self.inner.__call__( + path=path, mode=mode, atomictemp=atomictemp, checkambig=checkambig + ) + # Information that Rust needs to get ownership of the file that's + # being opened. + return (os.dup(fd.fileno()), fd._tempname if atomictemp else None) + + def __getattr__(self, name): + return getattr(self.inner, name) + + class revlog: """ the underlying revision storage object @@ -1358,6 +1425,7 @@ self._trypending = trypending self._try_split = try_split self._may_inline = may_inline + self.uses_rust = False self.opener = opener if persistentnodemap: self._nodemap_file = nodemaputil.get_nodemap_file(self) @@ -1392,7 +1460,7 @@ # Maps rev to chain base rev. self._chainbasecache = util.lrucachedict(100) - self.index = None + self.index: Optional[BaseIndexObject] = None self._docket = None self._nodemap_docket = None # Mapping of partial identifiers to full nodes. @@ -1406,8 +1474,8 @@ # prevent nesting of addgroup self._adding_group = None - chunk_cache = self._loadindex() - self._load_inner(chunk_cache) + index, chunk_cache = self._loadindex() + self._load_inner(index, chunk_cache) self._concurrencychecker = concurrencychecker def _init_opts(self): @@ -1647,6 +1715,7 @@ self.delta_config.general_delta = features[b'generaldelta']( self._format_flags ) + self.data_config.generaldelta = self.delta_config.general_delta self.feature_config.has_side_data = features[b'sidedata'] if not features[b'docket']: @@ -1677,6 +1746,7 @@ self._inline = False # generaldelta implied by version 2 revlogs. self.delta_config.general_delta = True + self.data_config.generaldelta = True # the logic for persistent nodemap will be dealt with within the # main docket, so disable it for now. self._nodemap_file = None @@ -1705,7 +1775,12 @@ ) use_rust_index = False - if rustrevlog is not None and self._nodemap_file is not None: + rust_applicable = self._nodemap_file is not None + rust_applicable = rust_applicable or self.target[0] == KIND_FILELOG + rust_applicable = rust_applicable and getattr( + self.opener, "rust_compatible", True + ) + if rustrevlog is not None and rust_applicable: # we would like to use the rust_index in all case, especially # because it is necessary for AncestorsIterator and LazyAncestors # since the 6.7 cycle. @@ -1715,6 +1790,20 @@ # repository. 
diff -r 854e2b9bca57 -r 51a350a22d0c mercurial/store.py
--- a/mercurial/store.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/mercurial/store.py	Wed Nov 20 15:53:19 2024 +0100
@@ -998,12 +998,16 @@
         # set of new additions to fncache
         self.addls = set()
 
+    @property
+    def is_loaded(self):
+        return self.entries is not None
+
     def ensureloaded(self, warn=None):
         """read the fncache file if not already read.
 
         If the file on disk is corrupted, raise. If warn is provided,
         warn and keep going instead."""
-        if self.entries is None:
+        if not self.is_loaded:
             self._load(warn)
 
     def _load(self, warn=None):
@@ -1058,7 +1062,7 @@
 
     def write(self, tr):
         if self._dirty:
-            assert self.entries is not None
+            assert self.is_loaded
             self.entries = self.entries | self.addls
             self.addls = set()
             tr.addbackup(b'fncache')
@@ -1083,13 +1087,13 @@
     def add(self, fn):
         if fn in self._ignores:
             return
-        if self.entries is None:
+        if not self.is_loaded:
             self._load()
         if fn not in self.entries:
             self.addls.add(fn)
 
     def remove(self, fn):
-        if self.entries is None:
+        if not self.is_loaded:
             self._load()
         if fn in self.addls:
             self.addls.remove(fn)
@@ -1103,12 +1107,12 @@
     def __contains__(self, fn):
         if fn in self.addls:
             return True
-        if self.entries is None:
+        if not self.is_loaded:
             self._load()
         return fn in self.entries
 
     def __iter__(self):
-        if self.entries is None:
+        if not self.is_loaded:
             self._load()
         return iter(self.entries | self.addls)
 
@@ -1116,8 +1120,9 @@
 class _fncachevfs(vfsmod.proxyvfs):
     def __init__(self, vfs, fnc, encode):
         vfsmod.proxyvfs.__init__(self, vfs)
-        self.fncache = fnc
+        self.fncache: fncache = fnc
         self.encode = encode
+        self.uses_dotencode = encode is _pathencode
 
     def __call__(self, path, mode=b'r', *args, **kw):
         encoded = self.encode(path)
@@ -1128,7 +1133,7 @@
         ):
             # do not trigger a fncache load when adding a file that already is
             # known to exist.
-            notload = self.fncache.entries is None and (
+            notload = not self.fncache.is_loaded and (
                 # if the file has size zero, it should be considered as missing.
                 # Such zero-size files are the result of truncation when a
                 # transaction is aborted.
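Review note (outside the patch): the `is_loaded` property above is a readability refactor over the `entries is None` sentinel; the lazy-load idiom itself is unchanged. A toy version of that idiom, with hypothetical names:

```python
class LazyCache:
    def __init__(self):
        self.entries = None  # None means "not read from disk yet"
        self.loads = 0

    @property
    def is_loaded(self):
        return self.entries is not None

    def _load(self):
        self.loads += 1
        self.entries = {"data/foo.i"}  # pretend this came from disk

    def __contains__(self, fn):
        if not self.is_loaded:
            self._load()
        return fn in self.entries


cache = LazyCache()
assert not cache.is_loaded
assert "data/foo.i" in cache  # first use triggers the single load
assert cache.is_loaded and cache.loads == 1
```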
diff -r 854e2b9bca57 -r 51a350a22d0c mercurial/vfs.py
--- a/mercurial/vfs.py	Wed Nov 20 15:38:57 2024 +0100
+++ b/mercurial/vfs.py	Wed Nov 20 15:53:19 2024 +0100
@@ -82,6 +82,10 @@
     # encoded vfs (see issue6546)
     _dir_sep: bytes = b'/'
 
+    # Used to disable the Rust `InnerRevlog` in case the VFS is not supported
+    # by the Rust code
+    rust_compatible = True
+
     # TODO: type return, which is util.posixfile wrapped by a proxy
     @abc.abstractmethod
     def __call__(self, path: bytes, mode: bytes = b'rb', **kwargs) -> Any:
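Review note (outside the patch): `RustVFSWrapper.__call__` (in the revlog.py hunk earlier) returns `os.dup(fd.fileno())` because a duplicated descriptor has an independent lifetime — Rust can own and close its copy without caring when Python's file object is closed. A self-contained sketch of that property (POSIX semantics assumed):

```python
import os
import tempfile

with tempfile.TemporaryFile() as fd:
    fd.write(b"hello")
    fd.flush()

    # Duplicate the descriptor, as the wrapper does before handing it to Rust.
    owned = os.dup(fd.fileno())

# The Python file object is closed here, but the duplicate stays valid:
os.lseek(owned, 0, os.SEEK_SET)
assert os.read(owned, 5) == b"hello"
os.close(owned)  # the "Rust" side is responsible for closing its copy
```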
diff -r 854e2b9bca57 -r 51a350a22d0c rust/Cargo.lock
--- a/rust/Cargo.lock	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/Cargo.lock	Wed Nov 20 15:53:19 2024 +0100
@@ -15,6 +15,7 @@
 checksum = "bf6ccdb167abbf410dcb915cabd428929d7f6a04980b54a11f26a39f1c7f7107"
 dependencies = [
  "cfg-if",
+ "getrandom 0.2.8",
  "once_cell",
  "version_check",
 ]
@@ -509,14 +510,14 @@
 
 [[package]]
 name = "filetime"
-version = "0.2.25"
+version = "0.2.23"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "35c0522e981e68cbfa8c3f978441a5f34b30b96e146b33cd3359176b50fe8586"
+checksum = "1ee447700ac8aa0b2f2bd7bc4462ad686ba06baa6727ac149a2d6277f0d240fd"
 dependencies = [
  "cfg-if",
  "libc",
- "libredox",
- "windows-sys 0.59.0",
+ "redox_syscall 0.4.1",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -596,9 +597,9 @@
 
 [[package]]
 name = "hashbrown"
-version = "0.13.1"
+version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33ff8ae62cd3a9102e5637afc8452c55acf3844001bd5374e0b0bd7b6616c038"
+checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
 dependencies = [
  "ahash",
  "rayon",
@@ -642,7 +643,7 @@
  "filetime",
  "flate2",
  "format-bytes",
- "hashbrown 0.13.1",
+ "hashbrown 0.13.2",
  "home",
  "im-rc",
  "indicatif",
@@ -660,6 +661,7 @@
  "rayon",
  "regex",
  "same-file",
+ "schnellru",
  "self_cell",
  "serde",
  "sha-1 0.10.0",
@@ -681,6 +683,8 @@
  "hg-core",
  "libc",
  "log",
+ "logging_timer",
+ "python3-sys",
  "stable_deref_trait",
  "vcsgraph",
 ]
@@ -823,7 +827,6 @@
 dependencies = [
  "bitflags 2.6.0",
  "libc",
- "redox_syscall 0.5.3",
 ]
 
 [[package]]
@@ -1220,11 +1223,11 @@
 
 [[package]]
 name = "redox_syscall"
-version = "0.5.3"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4"
+checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa"
 dependencies = [
- "bitflags 2.6.0",
+ "bitflags 1.3.2",
 ]
 
 [[package]]
@@ -1312,6 +1315,17 @@
 ]
 
 [[package]]
+name = "schnellru"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c9a8ef13a93c54d20580de1e5c413e624e53121d42fc7e2c11d10ef7f8b02367"
+dependencies = [
+ "ahash",
+ "cfg-if",
+ "hashbrown 0.13.2",
+]
+
+[[package]]
 name = "scopeguard"
 version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/Cargo.toml	Wed Nov 20 15:53:19 2024 +0100
@@ -43,6 +43,7 @@
 once_cell = "1.16.0"
 bitvec = "1.0.1"
 chrono = "0.4.34"
+schnellru = "0.2.1"
 dyn-clone = "1.0.16"
 filetime = "0.2.23"
 uuid = { version = "1.10", features = ["v4"] }
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/examples/nodemap/index.rs
--- a/rust/hg-core/examples/nodemap/index.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/examples/nodemap/index.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -4,6 +4,7 @@
 // GNU General Public License version 2 or any later version.
 
 //! Minimal `RevlogIndex`, readable from standard Mercurial file format
+use hg::revlog::RevlogIndex;
 use hg::*;
 use memmap2::*;
 use std::fs::File;
"hg-core", "libc", "log", + "logging_timer", + "python3-sys", "stable_deref_trait", "vcsgraph", ] @@ -823,7 +827,6 @@ dependencies = [ "bitflags 2.6.0", "libc", - "redox_syscall 0.5.3", ] [[package]] @@ -1220,11 +1223,11 @@ [[package]] name = "redox_syscall" -version = "0.5.3" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ - "bitflags 2.6.0", + "bitflags 1.3.2", ] [[package]] @@ -1312,6 +1315,17 @@ ] [[package]] +name = "schnellru" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9a8ef13a93c54d20580de1e5c413e624e53121d42fc7e2c11d10ef7f8b02367" +dependencies = [ + "ahash", + "cfg-if", + "hashbrown 0.13.2", +] + +[[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/Cargo.toml --- a/rust/hg-core/Cargo.toml Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/Cargo.toml Wed Nov 20 15:53:19 2024 +0100 @@ -43,6 +43,7 @@ once_cell = "1.16.0" bitvec = "1.0.1" chrono = "0.4.34" +schnellru = "0.2.1" dyn-clone = "1.0.16" filetime = "0.2.23" uuid = { version = "1.10", features = ["v4"] } diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/examples/nodemap/index.rs --- a/rust/hg-core/examples/nodemap/index.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/examples/nodemap/index.rs Wed Nov 20 15:53:19 2024 +0100 @@ -4,6 +4,7 @@ // GNU General Public License version 2 or any later version. //! Minimal `RevlogIndex`, readable from standard Mercurial file format +use hg::revlog::RevlogIndex; use hg::*; use memmap2::*; use std::fs::File; diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/errors.rs --- a/rust/hg-core/src/errors.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/src/errors.rs Wed Nov 20 15:53:19 2024 +0100 @@ -68,6 +68,10 @@ from: std::path::PathBuf, to: std::path::PathBuf, }, + CopyingFile { + from: std::path::PathBuf, + to: std::path::PathBuf, + }, /// `std::fs::canonicalize` CanonicalizingPath(std::path::PathBuf), /// `std::env::current_dir` @@ -150,6 +154,12 @@ from.display(), to.display() ), + IoErrorContext::CopyingFile { from, to } => write!( + f, + "when copying {} to {}", + from.display(), + to.display() + ), IoErrorContext::CanonicalizingPath(path) => { write!(f, "when canonicalizing {}", path.display()) } diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/fncache.rs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/fncache.rs Wed Nov 20 15:53:19 2024 +0100 @@ -0,0 +1,26 @@ +use std::path::Path; + +use dyn_clone::DynClone; + +/// The FnCache stores the list of most files contained in the store and is +/// used for stream/copy clones. +/// +/// It keeps track of the name of "all" indexes and data files for all revlogs. +/// The names are relative to the store roots and are stored before any +/// encoding or path compression. +/// +/// Despite its name, the FnCache is *NOT* a cache, it keep tracks of +/// information that is not easily available elsewhere. It has no mechanism +/// for detecting isn't up to date, and de-synchronization with the actual +/// contents of the repository will lead to a corrupted clone and possibly +/// other corruption during maintenance operations. 
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/lib.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -14,6 +14,7 @@
 pub mod dirstate_tree;
 pub mod discovery;
 pub mod exit_codes;
+pub mod fncache;
 pub mod requirements;
 pub mod testing; // unconditionally built, for use from integration tests
 pub use dirstate::{
@@ -29,7 +30,12 @@
 pub mod matchers;
 pub mod repo;
 pub mod revlog;
-pub use revlog::*;
+// Export very common types to make discovery easier
+pub use revlog::{
+    BaseRevision, Graph, GraphError, Node, NodePrefix, Revision,
+    UncheckedRevision, NULL_NODE, NULL_NODE_ID, NULL_REVISION,
+    WORKING_DIRECTORY_HEX, WORKING_DIRECTORY_REVISION,
+};
 pub mod checkexec;
 pub mod config;
 pub mod lock;
@@ -37,6 +43,7 @@
 pub mod operations;
 pub mod progress;
 pub mod revset;
+pub mod transaction;
 pub mod update;
 pub mod utils;
 pub mod vfs;
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/lock.rs
--- a/rust/hg-core/src/lock.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/lock.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -2,9 +2,11 @@
 
 use crate::errors::HgError;
 use crate::errors::HgResultExt;
+use crate::vfs::Vfs;
 use crate::vfs::VfsImpl;
 use std::io;
 use std::io::ErrorKind;
+use std::path::Path;
 
 #[derive(derive_more::From)]
 pub enum LockError {
@@ -65,7 +67,7 @@
         if !lock_should_be_broken(&lock_data) {
             return Err(LockError::AlreadyHeld);
         }
-        Ok(hg_vfs.remove_file(lock_filename)?)
+        Ok(hg_vfs.unlink(Path::new(lock_filename))?)
     })?
 }
 
@@ -99,7 +101,7 @@
 }
 
 fn unlock(hg_vfs: &VfsImpl, lock_filename: &str) -> Result<(), HgError> {
-    hg_vfs.remove_file(lock_filename)
+    hg_vfs.unlink(Path::new(lock_filename))
 }
 
 /// Return whether the process that is/was holding the lock is known not to be
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/logging.rs
--- a/rust/hg-core/src/logging.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/logging.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -1,6 +1,7 @@
 use crate::errors::{HgError, HgResultExt, IoErrorContext, IoResultExt};
-use crate::vfs::VfsImpl;
+use crate::vfs::{Vfs, VfsImpl};
 use std::io::Write;
+use std::path::Path;
 
 /// An utility to append to a log file with the given name, and optionally
 /// rotate it after it reaches a certain maximum size.
@@ -64,15 +65,20 @@
         for i in (1..self.max_files).rev() {
             self.vfs
                 .rename(
-                    format!("{}.{}", self.name, i),
-                    format!("{}.{}", self.name, i + 1),
+                    Path::new(&format!("{}.{}", self.name, i)),
+                    Path::new(&format!("{}.{}", self.name, i + 1)),
+                    false,
                 )
                 .io_not_found_as_none()?;
         }
         // Then rename `{name}` to `{name}.1`. This is the
         // previously-opened `file`.
         self.vfs
-            .rename(self.name, format!("{}.1", self.name))
+            .rename(
+                Path::new(&self.name),
+                Path::new(&format!("{}.1", self.name)),
+                false,
+            )
             .io_not_found_as_none()?;
         // Finally, create a new `{name}` file and replace our `file`
         // handle.
@@ -87,9 +93,7 @@
 #[test]
 fn test_rotation() {
     let temp = tempfile::tempdir().unwrap();
-    let vfs = VfsImpl {
-        base: temp.path().to_owned(),
-    };
+    let vfs = VfsImpl::new(temp.path().to_owned(), false);
     let logger = LogFile::new(vfs.clone(), "log")
         .max_size(Some(3))
         .max_files(2);
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/operations/cat.rs
--- a/rust/hg-core/src/operations/cat.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/operations/cat.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -12,8 +12,8 @@
 use crate::utils::hg_path::HgPath;
 
 use crate::errors::HgError;
-use crate::manifest::Manifest;
-use crate::manifest::ManifestEntry;
+use crate::revlog::manifest::Manifest;
+use crate::revlog::manifest::ManifestEntry;
 use itertools::put_back;
 use itertools::PutBack;
 use std::cmp::Ordering;
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/operations/debugdata.rs
--- a/rust/hg-core/src/operations/debugdata.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/operations/debugdata.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -6,9 +6,10 @@
 // GNU General Public License version 2 or any later version.
 
 use crate::errors::HgError;
+use crate::exit_codes;
 use crate::repo::Repo;
-use crate::revlog::Revlog;
-use crate::{exit_codes, RevlogError, RevlogType};
+use crate::revlog::options::default_revlog_options;
+use crate::revlog::{Revlog, RevlogError, RevlogType};
 
 /// Dump the contents data of a revision.
 pub fn debug_data(
@@ -31,7 +32,11 @@
         &repo.store_vfs(),
         index_file,
         None,
-        repo.default_revlog_options(RevlogType::Changelog)?,
+        default_revlog_options(
+            repo.config(),
+            repo.requirements(),
+            RevlogType::Changelog,
+        )?,
     )?;
 
     let rev =
         crate::revset::resolve_rev_number_or_hex_prefix(revset, &revlog)?;
dot_hg.join("hgrc-not-shared")]; - let hg_vfs = VfsImpl { - base: dot_hg.to_owned(), - }; + let hg_vfs = VfsImpl::new(dot_hg.to_owned(), false); let mut reqs = requirements::load_if_exists(&hg_vfs)?; let relative = reqs.contains(requirements::RELATIVE_SHARED_REQUIREMENT); @@ -195,9 +187,10 @@ store_path = shared_path.join("store"); - let source_is_share_safe = requirements::load(VfsImpl { - base: shared_path.to_owned(), - })? + let source_is_share_safe = requirements::load(VfsImpl::new( + shared_path.to_owned(), + true, + ))? .contains(requirements::SHARESAFE_REQUIREMENT); if share_safe != source_is_share_safe { @@ -209,9 +202,10 @@ } } if share_safe { - reqs.extend(requirements::load(VfsImpl { - base: store_path.to_owned(), - })?); + reqs.extend(requirements::load(VfsImpl::new( + store_path.to_owned(), + true, + ))?); } let repo_config = if std::env::var_os("HGRCSKIPREPO").is_none() { @@ -252,23 +246,17 @@ /// For accessing repository files (in `.hg`), except for the store /// (`.hg/store`). pub fn hg_vfs(&self) -> VfsImpl { - VfsImpl { - base: self.dot_hg.to_owned(), - } + VfsImpl::new(self.dot_hg.to_owned(), false) } /// For accessing repository store files (in `.hg/store`) pub fn store_vfs(&self) -> VfsImpl { - VfsImpl { - base: self.store.to_owned(), - } + VfsImpl::new(self.store.to_owned(), false) } /// For accessing the working copy pub fn working_directory_vfs(&self) -> VfsImpl { - VfsImpl { - base: self.working_directory.to_owned(), - } + VfsImpl::new(self.working_directory.to_owned(), false) } pub fn try_with_wlock_no_wait( @@ -577,7 +565,11 @@ fn new_changelog(&self) -> Result { Changelog::open( &self.store_vfs(), - self.default_revlog_options(RevlogType::Changelog)?, + default_revlog_options( + self.config(), + self.requirements(), + RevlogType::Changelog, + )?, ) } @@ -592,7 +584,11 @@ fn new_manifestlog(&self) -> Result { Manifestlog::open( &self.store_vfs(), - self.default_revlog_options(RevlogType::Manifestlog)?, + default_revlog_options( + self.config(), + self.requirements(), + RevlogType::Manifestlog, + )?, ) } @@ -642,7 +638,11 @@ Filelog::open( self, path, - self.default_revlog_options(RevlogType::Filelog)?, + default_revlog_options( + self.config(), + self.requirements(), + RevlogType::Filelog, + )?, ) } /// Write to disk any updates that were made through `dirstate_map_mut`. @@ -787,55 +787,11 @@ if let Some(uuid) = old_uuid_to_remove { // Remove the old data file after the new docket pointing to the // new data file was written. - vfs.remove_file(format!("dirstate.{}", uuid))?; + vfs.unlink(Path::new(&format!("dirstate.{}", uuid)))?; } Ok(()) } - pub fn default_revlog_options( - &self, - revlog_type: RevlogType, - ) -> Result { - let requirements = self.requirements(); - let is_changelog = revlog_type == RevlogType::Changelog; - let version = if is_changelog - && requirements.contains(CHANGELOGV2_REQUIREMENT) - { - let compute_rank = self - .config() - .get_bool(b"experimental", b"changelog-v2.compute-rank")?; - RevlogVersionOptions::ChangelogV2 { compute_rank } - } else if requirements.contains(REVLOGV2_REQUIREMENT) { - RevlogVersionOptions::V2 - } else if requirements.contains(REVLOGV1_REQUIREMENT) { - RevlogVersionOptions::V1 { - general_delta: requirements.contains(GENERALDELTA_REQUIREMENT), - inline: !is_changelog, - } - } else { - RevlogVersionOptions::V0 - }; - Ok(RevlogOpenOptions { - version, - // We don't need to dance around the slow path like in the Python - // implementation since we know we have access to the fast code. 
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/changelog.rs
--- a/rust/hg-core/src/revlog/changelog.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/revlog/changelog.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -14,7 +14,9 @@
 use crate::revlog::{Revlog, RevlogEntry, RevlogError};
 use crate::utils::hg_path::HgPath;
 use crate::vfs::VfsImpl;
-use crate::{Graph, GraphError, RevlogOpenOptions, UncheckedRevision};
+use crate::{Graph, GraphError, UncheckedRevision};
+
+use super::options::RevlogOpenOptions;
 
 /// A specialized `Revlog` to work with changelog data format.
 pub struct Changelog {
@@ -84,7 +86,7 @@
     }
 
     pub fn get_index(&self) -> &Index {
-        &self.revlog.index
+        self.revlog.index()
     }
 }
 
@@ -364,10 +366,7 @@
     let timezone = FixedOffset::west_opt(timezone_secs)
         .ok_or_else(|| HgError::corrupted("timezone offset out of bounds"))?;
 
-    Ok(DateTime::from_naive_utc_and_offset(
-        timestamp_utc.naive_utc(),
-        timezone,
-    ))
+    Ok(timestamp_utc.with_timezone(&timezone))
 }
 
 /// Attempt to parse the given string as floating-point timestamp, and
@@ -504,10 +503,7 @@
 mod tests {
     use super::*;
     use crate::vfs::VfsImpl;
-    use crate::{
-        RevlogDataConfig, RevlogDeltaConfig, RevlogFeatureConfig,
-        NULL_REVISION,
-    };
+    use crate::NULL_REVISION;
     use pretty_assertions::assert_eq;
 
     #[test]
@@ -566,23 +562,11 @@
     fn test_data_from_rev_null() -> Result<(), RevlogError> {
         // an empty revlog will be enough for this case
         let temp = tempfile::tempdir().unwrap();
-        let vfs = VfsImpl {
-            base: temp.path().to_owned(),
-        };
+        let vfs = VfsImpl::new(temp.path().to_owned(), false);
         std::fs::write(temp.path().join("foo.i"), b"").unwrap();
-        std::fs::write(temp.path().join("foo.d"), b"").unwrap();
-        let revlog = Revlog::open(
-            &vfs,
-            "foo.i",
-            None,
-            RevlogOpenOptions::new(
-                false,
-                RevlogDataConfig::default(),
-                RevlogDeltaConfig::default(),
-                RevlogFeatureConfig::default(),
-            ),
-        )
-        .unwrap();
+        let revlog =
+            Revlog::open(&vfs, "foo.i", None, RevlogOpenOptions::default())
+                .unwrap();
         let changelog = Changelog { revlog };
         assert_eq!(
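Review note (outside the patch): the `with_timezone` rewrite above is behavior-preserving; `DateTime::from_naive_utc_and_offset(dt.naive_utc(), tz)` and `dt.with_timezone(&tz)` both keep the same instant and only change the offset used for display. A quick, self-contained check using chrono (which hg-core already depends on):

```rust
use chrono::{DateTime, FixedOffset, TimeZone, Utc};

fn main() {
    let timestamp_utc: DateTime<Utc> =
        Utc.timestamp_opt(1_732_114_399, 0).unwrap();
    // hg stores timezones as seconds *west* of UTC; -3600 is UTC+1,
    // matching the "-3600" in this changeset's header.
    let timezone = FixedOffset::west_opt(-3600).unwrap();

    let old = DateTime::<FixedOffset>::from_naive_utc_and_offset(
        timestamp_utc.naive_utc(),
        timezone,
    );
    let new = timestamp_utc.with_timezone(&timezone);

    assert_eq!(old, new); // same instant
    assert_eq!(old.offset(), new.offset()); // same displayed offset
}
```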
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/compression.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/compression.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -0,0 +1,383 @@
+//! Helpers around revlog compression
+
+use std::cell::RefCell;
+use std::collections::HashSet;
+use std::io::Read;
+
+use flate2::bufread::ZlibEncoder;
+use flate2::read::ZlibDecoder;
+
+use crate::config::Config;
+use crate::errors::HgError;
+use crate::exit_codes;
+
+use super::corrupted;
+use super::RevlogError;
+
+/// Header byte used to identify ZSTD-compressed data
+pub const ZSTD_BYTE: u8 = b'\x28';
+/// Header byte used to identify Zlib-compressed data
+pub const ZLIB_BYTE: u8 = b'x';
+
+const ZSTD_DEFAULT_LEVEL: u8 = 3;
+const ZLIB_DEFAULT_LEVEL: u8 = 6;
+/// The length of data below which we don't even try to compress it when using
+/// Zstandard.
+const MINIMUM_LENGTH_ZSTD: usize = 50;
+/// The length of data below which we don't even try to compress it when using
+/// Zlib.
+const MINIMUM_LENGTH_ZLIB: usize = 44;
+
+/// Defines the available compression engines and their options.
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum CompressionConfig {
+    Zlib {
+        /// Between 0 and 9 included
+        level: u8,
+    },
+    Zstd {
+        /// Between 0 and 22 included
+        level: u8,
+        /// Never used in practice for now
+        threads: u8,
+    },
+    /// No compression is performed
+    None,
+}
+
+impl CompressionConfig {
+    pub fn new(
+        config: &Config,
+        requirements: &HashSet<String>,
+    ) -> Result<Self, HgError> {
+        let mut new = Self::default();
+
+        let zlib_level = config.get_u32(b"storage", b"revlog.zlib.level")?;
+        let zstd_level = config.get_u32(b"storage", b"revlog.zstd.level")?;
+
+        for requirement in requirements {
+            if requirement.starts_with("revlog-compression-")
+                || requirement.starts_with("exp-compression-")
+            {
+                let split = &mut requirement.splitn(3, '-');
+                split.next();
+                split.next();
+                new = match split.next().unwrap() {
+                    "zstd" => CompressionConfig::zstd(zstd_level)?,
+                    e => {
+                        return Err(HgError::UnsupportedFeature(format!(
+                            "Unsupported compression engine '{e}'"
+                        )))
+                    }
+                };
+            }
+        }
+        if let Some(level) = zlib_level {
+            if matches!(new, CompressionConfig::Zlib { .. }) {
+                new.set_level(level as usize)?;
+            }
+        }
+        Ok(new)
+    }
+
+    /// Sets the level of the current compression engine
+    pub fn set_level(&mut self, new_level: usize) -> Result<(), HgError> {
+        match self {
+            CompressionConfig::Zlib { level } => {
+                if new_level > 9 {
+                    return Err(HgError::abort(
+                        format!(
+                            "invalid zlib compression level {}, \
+                            expected between 0 and 9 included",
+                            new_level
+                        ),
+                        exit_codes::ABORT,
+                        None,
+                    ));
+                }
+                *level = new_level as u8;
+            }
+            CompressionConfig::Zstd { level, .. } => {
+                if new_level > 22 {
+                    return Err(HgError::abort(
+                        format!(
+                            "invalid zstd compression level {}, \
+                            expected between 0 and 22 included",
+                            new_level
+                        ),
+                        exit_codes::ABORT,
+                        None,
+                    ));
+                }
+                *level = new_level as u8;
+            }
+            CompressionConfig::None => {}
+        }
+        Ok(())
+    }
+
+    /// Return a ZSTD compression config
+    pub fn zstd(
+        zstd_level: Option<u32>,
+    ) -> Result<CompressionConfig, HgError> {
+        let mut engine = CompressionConfig::Zstd {
+            level: ZSTD_DEFAULT_LEVEL,
+            threads: 0,
+        };
+        if let Some(level) = zstd_level {
+            engine.set_level(level as usize)?;
+        }
+        Ok(engine)
+    }
+}
+
+impl Default for CompressionConfig {
+    fn default() -> Self {
+        Self::Zlib {
+            level: ZLIB_DEFAULT_LEVEL,
+        }
+    }
+}
+
+/// A high-level trait to define compressors that should be able to compress
+/// and decompress arbitrary bytes.
+pub trait Compressor {
+    /// Returns a new [`Vec<u8>`] with the compressed data.
+    /// Should return `Ok(None)` if compression does not apply (e.g. too small)
+    fn compress(
+        &mut self,
+        data: &[u8],
+    ) -> Result<Option<Vec<u8>>, RevlogError>;
+    /// Returns a new [`Vec<u8>`] with the decompressed data.
+    fn decompress(&self, data: &[u8]) -> Result<Vec<u8>, RevlogError>;
+}
+
+/// A compressor that does nothing (useful in tests)
+pub struct NoneCompressor;
+
+impl Compressor for NoneCompressor {
+    fn compress(
+        &mut self,
+        _data: &[u8],
+    ) -> Result<Option<Vec<u8>>, RevlogError> {
+        Ok(None)
+    }
+
+    fn decompress(&self, data: &[u8]) -> Result<Vec<u8>, RevlogError> {
+        Ok(data.to_owned())
+    }
+}
+
+/// A compressor for Zstandard
+pub struct ZstdCompressor {
+    /// Level of compression to use
+    level: u8,
+    /// How many threads are used (not implemented yet)
+    threads: u8,
+    /// The underlying zstd compressor
+    compressor: zstd::bulk::Compressor<'static>,
+}
+
+impl ZstdCompressor {
+    pub fn new(level: u8, threads: u8) -> Self {
+        Self {
+            level,
+            threads,
+            compressor: zstd::bulk::Compressor::new(level.into())
+                .expect("invalid zstd arguments"),
+        }
+    }
+}
+
+impl Compressor for ZstdCompressor {
+    fn compress(
+        &mut self,
+        data: &[u8],
+    ) -> Result<Option<Vec<u8>>, RevlogError> {
+        if self.threads != 0 {
+            // TODO use a zstd builder + zstd cargo feature to support this
+            unimplemented!("zstd parallel compression is not implemented");
+        }
+        if data.len() < MINIMUM_LENGTH_ZSTD {
+            return Ok(None);
+        }
+        let level = self.level as i32;
+        if data.len() <= 1000000 {
+            let compressed = self.compressor.compress(data).map_err(|e| {
+                corrupted(format!("revlog compress error: {}", e))
+            })?;
+            Ok(if compressed.len() < data.len() {
+                Some(compressed)
+            } else {
+                None
+            })
+        } else {
+            Ok(Some(zstd::stream::encode_all(data, level).map_err(
+                |e| corrupted(format!("revlog compress error: {}", e)),
+            )?))
+        }
+    }
+
+    fn decompress(&self, data: &[u8]) -> Result<Vec<u8>, RevlogError> {
+        zstd::stream::decode_all(data).map_err(|e| {
+            corrupted(format!("revlog decompress error: {}", e)).into()
+        })
+    }
+}
+
+/// A compressor for Zlib
+pub struct ZlibCompressor {
+    /// Level of compression to use
+    level: flate2::Compression,
+}
+
+impl ZlibCompressor {
+    pub fn new(level: u8) -> Self {
+        Self {
+            level: flate2::Compression::new(level.into()),
+        }
+    }
+}
+
+impl Compressor for ZlibCompressor {
+    fn compress(
+        &mut self,
+        data: &[u8],
+    ) -> Result<Option<Vec<u8>>, RevlogError> {
+        assert!(!data.is_empty());
+        if data.len() < MINIMUM_LENGTH_ZLIB {
+            return Ok(None);
+        }
+        let mut buf = Vec::with_capacity(data.len());
+        ZlibEncoder::new(data, self.level)
+            .read_to_end(&mut buf)
+            .map_err(|e| corrupted(format!("revlog compress error: {}", e)))?;
+
+        Ok(if buf.len() < data.len() {
+            buf.shrink_to_fit();
+            Some(buf)
+        } else {
+            None
+        })
+    }
+
+    fn decompress(&self, data: &[u8]) -> Result<Vec<u8>, RevlogError> {
+        let mut decoder = ZlibDecoder::new(data);
+        // TODO reuse the allocation somehow?
+        let mut buf = vec![];
+        decoder.read_to_end(&mut buf).map_err(|e| {
+            corrupted(format!("revlog decompress error: {}", e))
+        })?;
+        Ok(buf)
+    }
+}
+
+thread_local! {
+    // seems fine to [unwrap] here: this can only fail due to memory allocation
+    // failing, and it's normal for that to cause panic.
+    static ZSTD_DECODER : RefCell<zstd::bulk::Decompressor<'static>> =
+        RefCell::new(zstd::bulk::Decompressor::new().ok().unwrap());
+}
+
+/// Util to wrap the reuse of a zstd decoder while controlling its buffer size.
+fn zstd_decompress_to_buffer(
+    bytes: &[u8],
+    buf: &mut Vec<u8>,
+) -> Result<usize, std::io::Error> {
+    ZSTD_DECODER
+        .with(|decoder| decoder.borrow_mut().decompress_to_buffer(bytes, buf))
+}
+
+/// Specialized revlog decompression to use less memory for deltas while
+/// keeping performance acceptable.
+pub(super) fn uncompressed_zstd_data(
+    bytes: &[u8],
+    is_delta: bool,
+    uncompressed_len: i32,
+) -> Result<Vec<u8>, HgError> {
+    let cap = uncompressed_len.max(0) as usize;
+    if is_delta {
+        // [cap] is usually an over-estimate of the space needed because
+        // it's the length of delta-decoded data, but we're interested
+        // in the size of the delta.
+        // This means we have to [shrink_to_fit] to avoid holding on
+        // to a large chunk of memory, but it also means we must have a
+        // fallback branch, for the case when the delta is longer than
+        // the original data (surprisingly, this does happen in practice)
+        let mut buf = Vec::with_capacity(cap);
+        match zstd_decompress_to_buffer(bytes, &mut buf) {
+            Ok(_) => buf.shrink_to_fit(),
+            Err(_) => {
+                buf.clear();
+                zstd::stream::copy_decode(bytes, &mut buf)
+                    .map_err(|e| corrupted(e.to_string()))?;
+            }
+        };
+        Ok(buf)
+    } else {
+        let mut buf = Vec::with_capacity(cap);
+        let len = zstd_decompress_to_buffer(bytes, &mut buf)
+            .map_err(|e| corrupted(e.to_string()))?;
+        if len != uncompressed_len as usize {
+            Err(corrupted("uncompressed length does not match"))
+        } else {
+            Ok(buf)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    const LARGE_TEXT: &[u8] = b"
+    Patali Dirapata, Cromda Cromda Ripalo, Pata Pata, Ko Ko Ko
+    Bokoro Dipoulito, Rondi Rondi Pepino, Pata Pata, Ko Ko Ko
+    Emana Karassoli, Loucra Loucra Nonponto, Pata Pata, Ko Ko Ko.";
+
+    #[test]
+    fn test_zlib_compressor() {
+        // Can return `Ok(None)`
+        let mut compressor = ZlibCompressor::new(1);
+        assert_eq!(compressor.compress(b"too small").unwrap(), None);
+
+        // Compression returns bytes
+        let compressed_with_1 =
+            compressor.compress(LARGE_TEXT).unwrap().unwrap();
+        assert!(compressed_with_1.len() < LARGE_TEXT.len());
+        // Round trip works
+        assert_eq!(
+            compressor.decompress(&compressed_with_1).unwrap(),
+            LARGE_TEXT
+        );
+
+        // Compression levels mean something
+        let mut compressor = ZlibCompressor::new(9);
+        // Compression returns bytes
+        let compressed = compressor.compress(LARGE_TEXT).unwrap().unwrap();
+        assert!(compressed.len() < compressed_with_1.len());
+    }
+
+    #[test]
+    fn test_zstd_compressor() {
+        // Can return `Ok(None)`
+        let mut compressor = ZstdCompressor::new(1, 0);
+        assert_eq!(compressor.compress(b"too small").unwrap(), None);
+
+        // Compression returns bytes
+        let compressed_with_1 =
+            compressor.compress(LARGE_TEXT).unwrap().unwrap();
+        assert!(compressed_with_1.len() < LARGE_TEXT.len());
+        // Round trip works
+        assert_eq!(
+            compressor.decompress(&compressed_with_1).unwrap(),
+            LARGE_TEXT
+        );
+
+        // Compression levels mean something
+        let mut compressor = ZstdCompressor::new(22, 0);
+        // Compression returns bytes
+        let compressed = compressor.compress(LARGE_TEXT).unwrap().unwrap();
+        assert!(compressed.len() < compressed_with_1.len());
+    }
+}
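Review note (outside the patch): the requirement parsing in `CompressionConfig::new` leans on `splitn(3, '-')`, which is why `revlog-compression-zstd` yields `zstd` as the third chunk and why an engine name could itself contain dashes. A std-only sketch of that parse (`engine_of_requirement` is an illustrative name):

```rust
fn engine_of_requirement(requirement: &str) -> Option<&str> {
    // Mirrors the patch: keep everything after the second '-', so the
    // engine name itself may contain dashes.
    if requirement.starts_with("revlog-compression-")
        || requirement.starts_with("exp-compression-")
    {
        requirement.splitn(3, '-').nth(2)
    } else {
        None
    }
}

fn main() {
    assert_eq!(engine_of_requirement("revlog-compression-zstd"), Some("zstd"));
    assert_eq!(engine_of_requirement("exp-compression-none"), Some("none"));
    assert_eq!(engine_of_requirement("generaldelta"), None);
}
```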
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/file_io.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/file_io.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -0,0 +1,544 @@
+//! Helpers for revlog file reading and writing.
+
+use std::{
+    cell::RefCell,
+    io::{Read, Seek, SeekFrom, Write},
+    path::{Path, PathBuf},
+    sync::{Arc, Mutex},
+};
+
+use crate::{
+    errors::{HgError, IoResultExt},
+    vfs::{Vfs, VfsFile},
+};
+
+/// Wraps accessing arbitrary chunks of data within a file and reusing handles.
+/// This is currently useful for accessing a revlog's data file, only reading
+/// the ranges that are currently relevant, like a sort of basic and manual
+/// file-based mmap.
+///
+/// XXX should this just be replaced with `mmap` + `madvise` ranges?
+/// The upcoming `UncompressedChunkCache` will make up for most of the slowness
+/// of re-reading the same chunks, so this might not be as useful. Aside from
+/// the major benefit of having less code to take care of, using `mmap` will
+/// allow multiple processes to share the same pages, especially for the
+/// changelog and manifest, which would make a difference in server contexts.
+pub struct RandomAccessFile {
+    /// The current store VFS to pass it to [`FileHandle`]
+    vfs: Box<dyn Vfs>,
+    /// Filename of the open file, relative to the vfs root
+    pub filename: PathBuf,
+    /// The current read-only handle on the file, if any
+    pub reading_handle: RefCell<Option<FileHandle>>,
+    /// The current read-write handle on the file, if any
+    pub writing_handle: RefCell<Option<FileHandle>>,
+}
+
+impl RandomAccessFile {
+    /// Wrap a file for random access
+    pub fn new(vfs: Box<dyn Vfs>, filename: PathBuf) -> Self {
+        assert!(filename.is_relative());
+        Self {
+            vfs,
+            filename,
+            reading_handle: RefCell::new(None),
+            writing_handle: RefCell::new(None),
+        }
+    }
+
+    /// Read a chunk of bytes from the file.
+    pub fn read_chunk(
+        &self,
+        offset: usize,
+        length: usize,
+    ) -> Result<Vec<u8>, HgError> {
+        let mut handle = self.get_read_handle()?;
+        handle
+            .seek(SeekFrom::Start(offset as u64))
+            .when_reading_file(&self.filename)?;
+        handle.read_exact(length).when_reading_file(&self.filename)
+    }
+
+    /// `pub` only for hg-cpython
+    #[doc(hidden)]
+    pub fn get_read_handle(&self) -> Result<FileHandle, HgError> {
+        if let Some(handle) = &*self.writing_handle.borrow() {
+            // Use a file handle being actively used for writes, if available.
+            // There is some danger to doing this because reads will seek the
+            // file.
+            // However, [`Revlog::write_entry`] performs a `SeekFrom::End(0)`
+            // before all writes, so we should be safe.
+            return Ok(handle.clone());
+        }
+        if let Some(handle) = &*self.reading_handle.borrow() {
+            return Ok(handle.clone());
+        }
+        // early returns done to work around borrowck being overzealous
+        // See https://github.com/rust-lang/rust/issues/103108
+        let new_handle = FileHandle::new(
+            dyn_clone::clone_box(&*self.vfs),
+            &self.filename,
+            false,
+            false,
+        )?;
+        *self.reading_handle.borrow_mut() = Some(new_handle.clone());
+        Ok(new_handle)
+    }
+
+    /// `pub` only for hg-cpython
+    #[doc(hidden)]
+    pub fn exit_reading_context(&self) {
+        self.reading_handle.take();
+    }
+
+    // Returns whether this file is currently open
+    pub fn is_open(&self) -> bool {
+        self.reading_handle.borrow().is_some()
+            || self.writing_handle.borrow().is_some()
+    }
+}
+
+/// A buffer that holds new changelog index data that needs to be written
+/// after the manifest and filelogs so that the repo is updated atomically to
+/// external processes.
+#[derive(Clone, Debug, Default)]
+pub struct DelayedBuffer {
+    // The actual in-memory bytes storing the delayed writes
+    pub(super) buffer: Vec<u8>,
+    /// The current offset into the virtual file composed of file + buffer
+    offset: u64,
+    /// The size of the file at the time of opening
+    file_size: u64,
+}
+
+impl DelayedBuffer {
+    /// Returns the length of the full data (on-disk + buffer length).
+    pub fn len(&self) -> u64 {
+        self.buffer.len() as u64 + self.file_size
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+/// Holds an open [`VfsFile`] and the related data. This can be used for
+/// reading and writing. Writes can be delayed to a buffer before touching
+/// the disk, if relevant (in the changelog case), but reads are transparent.
+pub struct FileHandle {
+    /// The actual open file
+    pub file: VfsFile,
+    /// The VFS with which the file was opened
+    vfs: Box<dyn Vfs>,
+    /// Filename of the open file, relative to the repo root
+    filename: PathBuf,
+    /// Buffer of delayed entry writes to the changelog index. This points
+    /// back to the buffer inside the revlog this handle refers to.
+    delayed_buffer: Option<Arc<Mutex<DelayedBuffer>>>,
+}
+
+impl std::fmt::Debug for FileHandle {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("FileHandle")
+            .field("filename", &self.filename)
+            .field("delayed_buffer", &self.delayed_buffer)
+            .field("file", &self.file)
+            .finish()
+    }
+}
+
+impl Clone for FileHandle {
+    fn clone(&self) -> Self {
+        Self {
+            vfs: dyn_clone::clone_box(&*self.vfs),
+            filename: self.filename.clone(),
+            delayed_buffer: self.delayed_buffer.clone(),
+            // This can only fail if the OS doesn't have the file handle
+            // anymore, so we're not going to do anything useful anyway.
+            file: self.file.try_clone().expect("couldn't clone file handle"),
+        }
+    }
+}
+
+impl FileHandle {
+    /// Get a (read or write) file handle to `filename`. Only creates the file
+    /// if `create` is `true`.
+    pub fn new(
+        vfs: Box<dyn Vfs>,
+        filename: impl AsRef<Path>,
+        create: bool,
+        write: bool,
+    ) -> Result<Self, HgError> {
+        let file = if create {
+            vfs.create(filename.as_ref(), false)?
+        } else if write {
+            vfs.open(filename.as_ref())?
+        } else {
+            vfs.open_read(filename.as_ref())?
+        };
+        Ok(Self {
+            vfs,
+            filename: filename.as_ref().to_owned(),
+            delayed_buffer: None,
+            file,
+        })
+    }
+
+    /// Get a file handle to `filename`, but writes go to a [`DelayedBuffer`].
+    pub fn new_delayed(
+        vfs: Box<dyn Vfs>,
+        filename: impl AsRef<Path>,
+        create: bool,
+        delayed_buffer: Arc<Mutex<DelayedBuffer>>,
+    ) -> Result<Self, HgError> {
+        let mut file = if create {
+            vfs.create(filename.as_ref(), false)?
+        } else {
+            vfs.open(filename.as_ref())?
+        };
+        let size = vfs.file_size(&file)?;
+        let offset = file
+            .stream_position()
+            .when_reading_file(filename.as_ref())?;
+
+        {
+            let mut buf = delayed_buffer.lock().unwrap();
+            buf.file_size = size;
+            buf.offset = offset;
+        }
+
+        Ok(Self {
+            vfs,
+            filename: filename.as_ref().to_owned(),
+            delayed_buffer: Some(delayed_buffer),
+            file,
+        })
+    }
+
+    /// Wrap an existing [`VfsFile`]
+    pub fn from_file(
+        file: VfsFile,
+        vfs: Box<dyn Vfs>,
+        filename: impl AsRef<Path>,
+    ) -> Self {
+        Self {
+            vfs,
+            filename: filename.as_ref().to_owned(),
+            delayed_buffer: None,
+            file,
+        }
+    }
+
+    /// Wrap an existing [`VfsFile`], but writes go to a [`DelayedBuffer`].
+    pub fn from_file_delayed(
+        mut file: VfsFile,
+        vfs: Box<dyn Vfs>,
+        filename: impl AsRef<Path>,
+        delayed_buffer: Arc<Mutex<DelayedBuffer>>,
+    ) -> Result<Self, HgError> {
+        let size = vfs.file_size(&file)?;
+        let offset = file
+            .stream_position()
+            .when_reading_file(filename.as_ref())?;
+
+        {
+            let mut buf = delayed_buffer.lock().unwrap();
+            buf.file_size = size;
+            buf.offset = offset;
+        }
+
+        Ok(Self {
+            vfs,
+            filename: filename.as_ref().to_owned(),
+            delayed_buffer: Some(delayed_buffer),
+            file,
+        })
+    }
+
+    /// Move the position of the handle to `pos`,
+    /// spanning the [`DelayedBuffer`] if defined. Will return an error if
+    /// an invalid seek position is asked, or for any standard io error.
+    pub fn seek(&mut self, pos: SeekFrom) -> Result<u64, std::io::Error> {
+        if let Some(delay_buf) = &self.delayed_buffer {
+            let mut delay_buf = delay_buf.lock().unwrap();
+            // Virtual file offset spans real file and data
+            match pos {
+                SeekFrom::Start(offset) => delay_buf.offset = offset,
+                SeekFrom::End(offset) => {
+                    delay_buf.offset =
+                        delay_buf.len().saturating_add_signed(offset)
+                }
+                SeekFrom::Current(offset) => {
+                    delay_buf.offset =
+                        delay_buf.offset.saturating_add_signed(offset);
+                }
+            }
+            if delay_buf.offset < delay_buf.file_size {
+                self.file.seek(pos)
+            } else {
+                Ok(delay_buf.offset)
+            }
+        } else {
+            self.file.seek(pos)
+        }
+    }
+
+    /// Read exactly `length` bytes from the current position.
+    /// Errors are the same as [`std::io::Read::read_exact`].
+    pub fn read_exact(
+        &mut self,
+        length: usize,
+    ) -> Result<Vec<u8>, std::io::Error> {
+        if let Some(delay_buf) = self.delayed_buffer.as_mut() {
+            let mut delay_buf = delay_buf.lock().unwrap();
+            let mut buf = vec![0; length];
+            let offset: isize =
+                delay_buf.offset.try_into().expect("buffer too large");
+            let file_size: isize =
+                delay_buf.file_size.try_into().expect("file too large");
+            let span: isize = offset - file_size;
+            let length = length.try_into().expect("too large of a length");
+            let absolute_span: u64 =
+                span.unsigned_abs().try_into().expect("length too large");
+            if span < 0 {
+                if length <= absolute_span {
+                    // We're only in the file
+                    self.file.read_exact(&mut buf)?;
+                } else {
+                    // We're spanning file and buffer
+                    self.file
+                        .read_exact(&mut buf[..absolute_span as usize])?;
+                    delay_buf
+                        .buffer
+                        .take(length - absolute_span)
+                        .read_exact(&mut buf[absolute_span as usize..])?;
+                }
+            } else {
+                // We're only in the buffer
+                delay_buf.buffer[absolute_span as usize..]
+                    .take(length)
+                    .read_exact(&mut buf)?;
+            }
+            delay_buf.offset += length;
+            Ok(buf.to_owned())
+        } else {
+            let mut buf = vec![0; length];
+            self.file.read_exact(&mut buf)?;
+            Ok(buf)
+        }
+    }
+
+    /// Flush the in-memory changes to disk. This does *not* write the
+    /// delayed buffer, only the pending file changes.
+    pub fn flush(&mut self) -> Result<(), HgError> {
+        self.file.flush().when_writing_file(&self.filename)
+    }
+
+    /// Return the current position in the file
+    pub fn position(&mut self) -> Result<u64, HgError> {
+        self.file
+            .stream_position()
+            .when_reading_file(&self.filename)
+    }
+
+    /// Append `data` to the file, or to the [`DelayedBuffer`], if any.
+    pub fn write_all(&mut self, data: &[u8]) -> Result<(), HgError> {
+        if let Some(buf) = &mut self.delayed_buffer {
+            let mut delayed_buffer = buf.lock().expect("propagate the panic");
+            assert_eq!(delayed_buffer.offset, delayed_buffer.len());
+            delayed_buffer.buffer.extend_from_slice(data);
+            delayed_buffer.offset += data.len() as u64;
+            Ok(())
+        } else {
+            self.file
+                .write_all(data)
+                .when_writing_file(&self.filename)?;
+            Ok(())
+        }
+    }
+}
+
+/// Write handles to a given revlog (index + maybe data)
+#[derive(Debug)]
+pub struct WriteHandles {
+    /// Handle to the index file
+    pub index_handle: FileHandle,
+    /// Handle to the data file, if the revlog is non-inline
+    pub data_handle: Option<FileHandle>,
+}
+
+#[cfg(test)]
+mod tests {
+    use std::io::ErrorKind;
+
+    use crate::vfs::VfsImpl;
+
+    use super::*;
+
+    #[test]
+    fn test_random_access_file() {
+        let base = tempfile::tempdir().unwrap().into_path();
+        let filename = Path::new("a");
+        let file_path = base.join(filename);
+        let raf = RandomAccessFile::new(
+            Box::new(VfsImpl::new(base.clone(), true)),
+            filename.to_owned(),
+        );
+
+        assert!(!raf.is_open());
+        assert_eq!(&raf.filename, &filename);
+        // Should fail to read a non-existing file
+        match raf.get_read_handle().unwrap_err() {
+            HgError::IoError { error, .. } => match error.kind() {
+                std::io::ErrorKind::NotFound => {}
+                _ => panic!("should be not found"),
+            },
+            e => panic!("{}", e.to_string()),
+        }
+
+        std::fs::write(file_path, b"1234567890").unwrap();
+
+        // Should be able to open an existing file
+        let mut handle = raf.get_read_handle().unwrap();
+        assert!(raf.is_open());
+        assert_eq!(handle.read_exact(10).unwrap(), b"1234567890".to_vec());
+    }
+ &b"new data90".to_vec(), + ); + // Flushing doesn't do anything unexpected + handle.flush().unwrap(); + + let delayed_buffer = Arc::new(Mutex::new(DelayedBuffer::default())); + let mut handle = FileHandle::new_delayed( + Box::new(VfsImpl::new(base.clone(), false)), + &filename, + false, + delayed_buffer, + ) + .unwrap(); + + assert_eq!( + handle + .delayed_buffer + .as_ref() + .unwrap() + .lock() + .unwrap() + .file_size, + 10 + ); + handle.seek(SeekFrom::End(0)).unwrap(); + handle.write_all(b"should go to buffer").unwrap(); + assert_eq!( + handle + .delayed_buffer + .as_ref() + .unwrap() + .lock() + .unwrap() + .len(), + 29 + ); + read_handle.seek(SeekFrom::Start(0)).unwrap(); + // On-disk file contents should be unchanged + assert_eq!( + read_handle.read_exact(10).unwrap(), + b"new data90".to_vec(), + ); + + assert_eq!( + read_handle.read_exact(1).unwrap_err().kind(), + ErrorKind::UnexpectedEof + ); + + handle.flush().unwrap(); + // On-disk file contents should still be unchanged after a flush + assert_eq!( + read_handle.read_exact(1).unwrap_err().kind(), + ErrorKind::UnexpectedEof + ); + + // Read from the buffer only + handle.seek(SeekFrom::End(-1)).unwrap(); + assert_eq!(handle.read_exact(1).unwrap(), b"r".to_vec()); + + // Read from an overlapping section of file and buffer + handle.seek(SeekFrom::Start(6)).unwrap(); + assert_eq!( + handle.read_exact(20).unwrap(), + b"ta90should go to buf".to_vec() + ); + + // Read from file only + handle.seek(SeekFrom::Start(0)).unwrap(); + assert_eq!(handle.read_exact(8).unwrap(), b"new data".to_vec()); + } +} diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/filelog.rs --- a/rust/hg-core/src/revlog/filelog.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/src/revlog/filelog.rs Wed Nov 20 15:53:19 2024 +0100 @@ -11,10 +11,11 @@ use crate::utils::SliceExt; use crate::Graph; use crate::GraphError; -use crate::RevlogOpenOptions; use crate::UncheckedRevision; use std::path::PathBuf; +use super::options::RevlogOpenOptions; + /// A specialized `Revlog` to work with file data logs. pub struct Filelog { /// The generic `revlog` format. 
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/index.rs
--- a/rust/hg-core/src/revlog/index.rs Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/revlog/index.rs Wed Nov 20 15:53:19 2024 +0100
@@ -7,14 +7,14 @@
 use byteorder::{BigEndian, ByteOrder};
 use bytes_cast::{unaligned, BytesCast};
 
-use super::REVIDX_KNOWN_FLAGS;
+use super::{NodePrefix, RevlogError, RevlogIndex, REVIDX_KNOWN_FLAGS};
 use crate::errors::HgError;
-use crate::node::{NODE_BYTES_LENGTH, NULL_NODE, STORED_NODE_ID_BYTES};
-use crate::revlog::node::Node;
+use crate::revlog::node::{
+    Node, NODE_BYTES_LENGTH, NULL_NODE, STORED_NODE_ID_BYTES,
+};
 use crate::revlog::{Revision, NULL_REVISION};
 use crate::{
-    dagops, BaseRevision, FastHashMap, Graph, GraphError, RevlogError,
-    RevlogIndex, UncheckedRevision,
+    dagops, BaseRevision, FastHashMap, Graph, GraphError, UncheckedRevision,
 };
 
 pub const INDEX_ENTRY_SIZE: usize = 64;
@@ -62,22 +62,19 @@
         BigEndian::read_u16(&self.header_bytes[2..4])
     }
 
-    pub fn parse(index_bytes: &[u8]) -> Result<Option<IndexHeader>, HgError> {
-        if index_bytes.is_empty() {
-            return Ok(None);
-        }
+    pub fn parse(index_bytes: &[u8]) -> Result<IndexHeader, HgError> {
         if index_bytes.len() < 4 {
             return Err(HgError::corrupted(
                 "corrupted revlog: can't read the index format header",
             ));
         }
-        Ok(Some(IndexHeader {
+        Ok(IndexHeader {
             header_bytes: {
                 let bytes: [u8; 4] =
                     index_bytes[0..4].try_into().expect("impossible");
                 bytes
             },
-        }))
+        })
     }
 }
@@ -275,7 +272,7 @@
     /// be accessed via the [`Self::head_revs`] method.
     /// The last filtered revisions in this index, used to make sure
    /// we haven't changed filters when returning the cached `head_revs`.
-    head_revs: RwLock<(Vec<Revision>, HashSet<Revision>)>,
+    pub(super) head_revs: RwLock<(Vec<Revision>, HashSet<Revision>)>,
 }
 
 impl Debug for Index {
@@ -341,8 +338,11 @@
         bytes: Box<dyn Deref<Target = [u8]> + Send + Sync>,
         default_header: IndexHeader,
     ) -> Result<Self, HgError> {
-        let header =
-            IndexHeader::parse(bytes.as_ref())?.unwrap_or(default_header);
+        let header = if bytes.len() < INDEX_ENTRY_SIZE {
+            default_header
+        } else {
+            IndexHeader::parse(bytes.as_ref())?
+        };
 
         if header.format_version() != IndexHeader::REVLOGV1 {
             // A proper new version should have had a repo/store
@@ -417,6 +417,45 @@
         }
     }
 
+    /// Same as `rev_from_node`, without using a persistent nodemap
+    ///
+    /// This is used as fallback when a persistent nodemap is not present.
+    /// This happens when the persistent-nodemap experimental feature is not
+    /// enabled, or for small revlogs.
+    pub fn rev_from_node_no_persistent_nodemap(
+        &self,
+        node: NodePrefix,
+    ) -> Result<Revision, RevlogError> {
+        // Linear scan of the revlog
+        // TODO: consider building a non-persistent nodemap in memory to
+        // optimize these cases.
+        let mut found_by_prefix = None;
+        for rev in (-1..self.len() as BaseRevision).rev() {
+            let rev = Revision(rev as BaseRevision);
+            let candidate_node = if rev == Revision(-1) {
+                NULL_NODE
+            } else {
+                let index_entry = self.get_entry(rev).ok_or_else(|| {
+                    HgError::corrupted(
+                        "revlog references a revision not in the index",
+                    )
+                })?;
+                *index_entry.hash()
+            };
+            if node == candidate_node {
+                return Ok(rev);
+            }
+            if node.is_prefix_of(&candidate_node) {
+                if found_by_prefix.is_some() {
+                    return Err(RevlogError::AmbiguousPrefix);
+                }
+                found_by_prefix = Some(rev)
+            }
+        }
+        found_by_prefix
+            .ok_or_else(|| RevlogError::InvalidRevision(format!("{:x}", node)))
+    }
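The lookup above has three outcomes: an exact node match wins immediately, a unique prefix match wins once the scan finishes, and two distinct prefix matches are an error. A standalone sketch of that resolution rule, using hex strings in place of `Node`/`NodePrefix` (illustrative only, not the hg-core API):

    #[derive(Debug, PartialEq)]
    enum Lookup {
        Found(usize),
        Ambiguous,
        NotFound,
    }

    // Scan newest-first, exactly like the revlog loop above.
    fn rev_from_prefix(prefix: &str, nodes: &[&str]) -> Lookup {
        let mut found = None;
        for (rev, node) in nodes.iter().enumerate().rev() {
            if *node == prefix {
                return Lookup::Found(rev); // exact match short-circuits
            }
            if node.starts_with(prefix) {
                if found.is_some() {
                    return Lookup::Ambiguous; // second prefix hit is fatal
                }
                found = Some(rev);
            }
        }
        found.map_or(Lookup::NotFound, Lookup::Found)
    }

    fn main() {
        let nodes = ["ab12", "abcd", "ff00"];
        assert_eq!(rev_from_prefix("abcd", &nodes), Lookup::Found(1));
        assert_eq!(rev_from_prefix("ff", &nodes), Lookup::Found(2));
        assert_eq!(rev_from_prefix("ab", &nodes), Lookup::Ambiguous);
    }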
+
     pub fn get_offsets(&self) -> RwLockReadGuard<Option<Vec<usize>>> {
         assert!(self.is_inline());
         {
@@ -925,8 +964,8 @@
         let mut gaps = Vec::new();
         let mut previous_end = None;
 
-        for (i, (_rev, entry)) in entries.iter().enumerate() {
-            let start = entry.c_start() as usize;
+        for (i, (rev, entry)) in entries.iter().enumerate() {
+            let start = self.start(*rev, entry);
             let length = entry.compressed_len();
 
             // Skip empty revisions to form larger holes
@@ -1004,15 +1043,13 @@
         if revs.is_empty() {
             return 0;
         }
-        let last_entry = &self.get_entry(revs[revs.len() - 1]).unwrap();
-        let end = last_entry.c_start() + last_entry.compressed_len() as u64;
+        let last_rev = revs[revs.len() - 1];
+        let last_entry = &self.get_entry(last_rev).unwrap();
+        let end = last_entry.offset() + last_entry.compressed_len() as usize;
         let first_rev = revs.iter().find(|r| r.0 != NULL_REVISION.0).unwrap();
-        let start = if first_rev.0 == 0 {
-            0
-        } else {
-            self.get_entry(*first_rev).unwrap().c_start()
-        };
-        (end - start) as usize
+        let first_entry = self.get_entry(*first_rev).unwrap();
+        let start = first_entry.offset();
+        end - start
     }
 
     /// Returns `&revs[startidx..endidx]` without empty trailing revs
@@ -1379,6 +1416,33 @@
             })
             .collect())
     }
+
+    /// Return the offset into the data corresponding to `rev` (in the index
+    /// file if inline, in the data file otherwise). `entry` must be the entry
+    /// for `rev`: the API is done this way to reduce the number of lookups
+    /// since we sometimes already have the entry, and because very few places
+    /// actually use this function.
+    #[inline(always)]
+    pub fn start(&self, rev: Revision, entry: &IndexEntry<'_>) -> usize {
+        #[cfg(debug_assertions)]
+        {
+            assert_eq!(&self.get_entry(rev).unwrap(), entry);
+        }
+        let offset = entry.offset();
+        if self.is_inline() {
+            offset + ((rev.0 as usize + 1) * INDEX_ENTRY_SIZE)
+        } else {
+            offset
+        }
+    }
+
+    pub(crate) fn make_null_entry(&self) -> IndexEntry<'_> {
+        IndexEntry {
+            bytes: b"\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+                \xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\
+                \0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0",
+        }
+    }
 }
 
 /// The kind of functionality needed by find_gca_candidates
@@ -1692,7 +1756,7 @@
     }
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq, Eq)]
 pub struct IndexEntry<'a> {
     bytes: &'a [u8],
 }
@@ -1708,11 +1772,6 @@
         BigEndian::read_u64(&self.bytes[0..8])
     }
 
-    /// Same result (except potentially for rev 0) as C `index_get_start()`
-    fn c_start(&self) -> u64 {
-        self.raw_offset() >> 16
-    }
-
     pub fn flags(&self) -> u16 {
         BigEndian::read_u16(&self.bytes[6..=7])
     }
@@ -1766,7 +1825,7 @@
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::node::NULL_NODE;
+    use crate::NULL_NODE;
 
     #[cfg(test)]
     #[derive(Debug, Copy, Clone)]
@@ -1901,24 +1960,21 @@
 
     pub fn is_inline(index_bytes: &[u8]) -> bool {
         IndexHeader::parse(index_bytes)
-            .expect("too short")
-            .unwrap()
+            .expect("invalid header")
            .format_flags()
            .is_inline()
    }
 
     pub fn uses_generaldelta(index_bytes: &[u8]) -> bool {
         IndexHeader::parse(index_bytes)
-            .expect("too short")
-            .unwrap()
+            .expect("invalid header")
             .format_flags()
             .uses_generaldelta()
     }
 
     pub fn get_version(index_bytes: &[u8]) -> u16 {
         IndexHeader::parse(index_bytes)
-            .expect("too short")
-            .unwrap()
+            .expect("invalid header")
             .format_version()
     }
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/inner_revlog.rs
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/revlog/inner_revlog.rs Wed Nov 20 15:53:19 2024 +0100
@@ -0,0 +1,1354 @@
+//! A layer of lower-level revlog functionality to encapsulate most of the
+//! IO work and expensive operations.
+use std::{
+    borrow::Cow,
+    cell::RefCell,
+    io::{ErrorKind, Seek, SeekFrom, Write},
+    ops::Deref,
+    path::PathBuf,
+    sync::{Arc, Mutex},
+};
+
+use schnellru::{ByMemoryUsage, LruMap};
+use sha1::{Digest, Sha1};
+
+use crate::{
+    errors::{HgError, IoResultExt},
+    exit_codes,
+    transaction::Transaction,
+    vfs::Vfs,
+};
+
+use super::{
+    compression::{
+        uncompressed_zstd_data, CompressionConfig, Compressor, NoneCompressor,
+        ZlibCompressor, ZstdCompressor, ZLIB_BYTE, ZSTD_BYTE,
+    },
+    file_io::{DelayedBuffer, FileHandle, RandomAccessFile, WriteHandles},
+    index::{Index, IndexHeader, INDEX_ENTRY_SIZE},
+    node::{NODE_BYTES_LENGTH, NULL_NODE},
+    options::{RevlogDataConfig, RevlogDeltaConfig, RevlogFeatureConfig},
+    BaseRevision, Node, Revision, RevlogEntry, RevlogError, RevlogIndex,
+    UncheckedRevision, NULL_REVISION, NULL_REVLOG_ENTRY_FLAGS,
+};
+
+/// Matches the `_InnerRevlog` class in the Python code, as an arbitrary
+/// boundary to incrementally rewrite higher-level revlog functionality in
+/// Rust.
+pub struct InnerRevlog {
+    /// When index and data are not interleaved: bytes of the revlog index.
+    /// When index and data are interleaved (inline revlog): bytes of the
+    /// revlog index and data.
+    pub index: Index,
+    /// The store vfs that is used to interact with the filesystem
+    vfs: Box<dyn Vfs>,
+    /// The index file path, relative to the vfs root
+    pub index_file: PathBuf,
+    /// The data file path, relative to the vfs root (same as `index_file`
+    /// if inline)
+    data_file: PathBuf,
+    /// Data config that applies to this revlog
+    data_config: RevlogDataConfig,
+    /// Delta config that applies to this revlog
+    delta_config: RevlogDeltaConfig,
+    /// Feature config that applies to this revlog
+    feature_config: RevlogFeatureConfig,
+    /// A view into this revlog's data file
+    segment_file: RandomAccessFile,
+    /// A cache of uncompressed chunks that have previously been restored.
+    /// Its eviction policy is defined in [`Self::new`].
+    uncompressed_chunk_cache: Option<UncompressedChunkCache>,
+    /// Used to keep track of the actual target during diverted writes
+    /// for the changelog
+    original_index_file: Option<PathBuf>,
+    /// Write handles to the index and data files
+    /// XXX why duplicate from `index` and `segment_file`?
+    writing_handles: Option<WriteHandles>,
+    /// See [`DelayedBuffer`].
+    delayed_buffer: Option<Arc<Mutex<DelayedBuffer>>>,
+    /// Whether this revlog is inline. XXX why duplicate from `index`?
+    pub inline: bool,
+    /// A cache of the last revision, which is usually accessed multiple
+    /// times.
+    pub last_revision_cache: Mutex<Option<SingleRevisionCache>>,
+}
+
+impl InnerRevlog {
+    pub fn new(
+        vfs: Box<dyn Vfs>,
+        index: Index,
+        index_file: PathBuf,
+        data_file: PathBuf,
+        data_config: RevlogDataConfig,
+        delta_config: RevlogDeltaConfig,
+        feature_config: RevlogFeatureConfig,
+    ) -> Self {
+        assert!(index_file.is_relative());
+        assert!(data_file.is_relative());
+        let segment_file = RandomAccessFile::new(
+            dyn_clone::clone_box(&*vfs),
+            if index.is_inline() {
+                index_file.to_owned()
+            } else {
+                data_file.to_owned()
+            },
+        );
+
+        let uncompressed_chunk_cache =
+            data_config.uncompressed_cache_factor.map(
+                // Arbitrary initial value
+                // TODO check if using a hasher specific to integers is useful
+                |_factor| RefCell::new(LruMap::with_memory_budget(65536)),
+            );
+
+        let inline = index.is_inline();
+        Self {
+            index,
+            vfs,
+            index_file,
+            data_file,
+            data_config,
+            delta_config,
+            feature_config,
+            segment_file,
+            uncompressed_chunk_cache,
+            original_index_file: None,
+            writing_handles: None,
+            delayed_buffer: None,
+            inline,
+            last_revision_cache: Mutex::new(None),
+        }
+    }
+
+    /// Return the number of entries in the revlog index
+    pub fn len(&self) -> usize {
+        self.index.len()
+    }
+
+    /// Return `true` if this revlog has no entries
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Return whether this revlog is inline (mixed index and data)
+    pub fn is_inline(&self) -> bool {
+        self.inline
+    }
+
+    /// Clear all caches from this revlog
+    pub fn clear_cache(&mut self) {
+        assert!(!self.is_delaying());
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            // We don't clear the allocation here because it's probably
+            // faster. We could change our minds later if this ends up being
+            // a problem with regards to memory consumption.
+            cache.borrow_mut().clear();
+        }
+    }
+
+    /// Return an entry for the null revision
+    pub fn make_null_entry(&self) -> RevlogEntry {
+        RevlogEntry {
+            revlog: self,
+            rev: NULL_REVISION,
+            uncompressed_len: 0,
+            p1: NULL_REVISION,
+            p2: NULL_REVISION,
+            flags: NULL_REVLOG_ENTRY_FLAGS,
+            hash: NULL_NODE,
+        }
+    }
+
+    /// Return the [`RevlogEntry`] for a [`Revision`] that is known to exist
+    pub fn get_entry_for_checked_rev(
+        &self,
+        rev: Revision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        if rev == NULL_REVISION {
+            return Ok(self.make_null_entry());
+        }
+        let index_entry = self
+            .index
+            .get_entry(rev)
+            .ok_or_else(|| RevlogError::InvalidRevision(rev.to_string()))?;
+        let p1 =
+            self.index.check_revision(index_entry.p1()).ok_or_else(|| {
+                RevlogError::corrupted(format!(
+                    "p1 for rev {} is invalid",
+                    rev
+                ))
+            })?;
+        let p2 =
+            self.index.check_revision(index_entry.p2()).ok_or_else(|| {
+                RevlogError::corrupted(format!(
+                    "p2 for rev {} is invalid",
+                    rev
+                ))
+            })?;
+        let entry = RevlogEntry {
+            revlog: self,
+            rev,
+            uncompressed_len: index_entry.uncompressed_len(),
+            p1,
+            p2,
+            flags: index_entry.flags(),
+            hash: *index_entry.hash(),
+        };
+        Ok(entry)
+    }
+
+    /// Return the [`RevlogEntry`] for `rev`. If `rev` fails to check, this
+    /// returns a [`RevlogError`].
+    /// TODO normalize naming across the index and all revlogs
+    /// (changelog, etc.) so that `get_entry` is always on an unchecked rev
+    /// and `get_entry_for_checked_rev` is for a checked rev
+    pub fn get_entry(
+        &self,
+        rev: UncheckedRevision,
+    ) -> Result<RevlogEntry, RevlogError> {
+        if rev == NULL_REVISION.into() {
+            return Ok(self.make_null_entry());
+        }
+        let rev = self.index.check_revision(rev).ok_or_else(|| {
+            RevlogError::corrupted(format!("rev {} is invalid", rev))
+        })?;
+        self.get_entry_for_checked_rev(rev)
+    }
+
+    /// Is the revlog currently delaying the visibility of written data?
+    ///
+    /// The delaying mechanism can be either in-memory or written on disk in a
+    /// side-file.
+    pub fn is_delaying(&self) -> bool {
+        self.delayed_buffer.is_some() || self.original_index_file.is_some()
+    }
+
+    /// The offset of the data chunk for this revision
+    #[inline(always)]
+    pub fn start(&self, rev: Revision) -> usize {
+        self.index.start(
+            rev,
+            &self
+                .index
+                .get_entry(rev)
+                .unwrap_or_else(|| self.index.make_null_entry()),
+        )
+    }
+
+    /// The length of the data chunk for this revision
+    /// TODO rename this method and others to more explicit names than the
+    /// existing ones that were copied over from Python
+    #[inline(always)]
+    pub fn length(&self, rev: Revision) -> usize {
+        self.index
+            .get_entry(rev)
+            .unwrap_or_else(|| self.index.make_null_entry())
+            .compressed_len() as usize
+    }
+
+    /// The end of the data chunk for this revision
+    #[inline(always)]
+    pub fn end(&self, rev: Revision) -> usize {
+        self.start(rev) + self.length(rev)
+    }
+
+    /// Return the delta parent of the given revision
+    pub fn delta_parent(&self, rev: Revision) -> Revision {
+        let base = self
+            .index
+            .get_entry(rev)
+            .unwrap()
+            .base_revision_or_base_of_delta_chain();
+        if base.0 == rev.0 {
+            NULL_REVISION
+        } else if self.delta_config.general_delta {
+            Revision(base.0)
+        } else {
+            Revision(rev.0 - 1)
+        }
+    }
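`delta_parent` above has exactly three cases: a revision whose recorded base is itself is a full snapshot (null delta parent), a general-delta revlog stores the base explicitly, and a legacy revlog always deltas against the previous revision. A sketch of that rule on plain integers (illustrative, not the hg-core API):

    const NULL: i64 = -1;

    fn delta_parent(rev: i64, stored_base: i64, general_delta: bool) -> i64 {
        if stored_base == rev {
            NULL // full snapshot: no delta base
        } else if general_delta {
            stored_base // arbitrary base, read from the index entry
        } else {
            rev - 1 // legacy layout: always the previous revision
        }
    }

    fn main() {
        assert_eq!(delta_parent(5, 5, true), NULL);
        assert_eq!(delta_parent(5, 2, true), 2);
        assert_eq!(delta_parent(5, 2, false), 4);
    }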
+
+    /// Return whether `rev` points to a snapshot revision (i.e. does not
+    /// have a delta base).
+    pub fn is_snapshot(&self, rev: Revision) -> Result<bool, RevlogError> {
+        if !self.delta_config.sparse_revlog {
+            return Ok(self.delta_parent(rev) == NULL_REVISION);
+        }
+        self.index.is_snapshot_unchecked(rev)
+    }
+
+    /// Return the delta chain for `rev` according to this revlog's config.
+    /// See [`Index::delta_chain`] for more information.
+    pub fn delta_chain(
+        &self,
+        rev: Revision,
+        stop_rev: Option<Revision>,
+    ) -> Result<(Vec<Revision>, bool), HgError> {
+        self.index.delta_chain(
+            rev,
+            stop_rev,
+            self.delta_config.general_delta.into(),
+        )
+    }
+
+    fn compressor(&self) -> Result<Box<dyn Compressor>, HgError> {
+        // TODO cache the compressor?
+        Ok(match self.feature_config.compression_engine {
+            CompressionConfig::Zlib { level } => {
+                Box::new(ZlibCompressor::new(level))
+            }
+            CompressionConfig::Zstd { level, threads } => {
+                Box::new(ZstdCompressor::new(level, threads))
+            }
+            CompressionConfig::None => Box::new(NoneCompressor),
+        })
+    }
+
+    /// Generate a possibly-compressed representation of data.
+    /// Returns `None` if the data was not compressed.
+    pub fn compress<'data>(
+        &self,
+        data: &'data [u8],
+    ) -> Result<Option<Cow<'data, [u8]>>, RevlogError> {
+        if data.is_empty() {
+            return Ok(Some(data.into()));
+        }
+        let res = self.compressor()?.compress(data)?;
+        if let Some(compressed) = res {
+            // The revlog compressor added the header in the returned data.
+            return Ok(Some(compressed.into()));
+        }
+
+        if data[0] == b'\0' {
+            return Ok(Some(data.into()));
+        }
+        Ok(None)
+    }
+
+    /// Decompress a revlog chunk.
+    ///
+    /// The chunk is expected to begin with a header identifying the
+    /// format type so it can be routed to an appropriate decompressor.
+    pub fn decompress<'a>(
+        &'a self,
+        data: &'a [u8],
+    ) -> Result<Cow<'a, [u8]>, RevlogError> {
+        if data.is_empty() {
+            return Ok(data.into());
+        }
+
+        // Revlogs are read much more frequently than they are written and
+        // many chunks only take microseconds to decompress, so performance
+        // is important here.
+
+        let header = data[0];
+        match header {
+            // Settings don't matter as they only affect compression
+            ZLIB_BYTE => Ok(ZlibCompressor::new(0).decompress(data)?.into()),
+            // Settings don't matter as they only affect compression
+            ZSTD_BYTE => {
+                Ok(ZstdCompressor::new(0, 0).decompress(data)?.into())
+            }
+            b'\0' => Ok(data.into()),
+            b'u' => Ok((&data[1..]).into()),
+            other => Err(HgError::UnsupportedFeature(format!(
+                "unknown compression header '{}'",
+                other
+            ))
+            .into()),
+        }
+    }
+
+    /// Obtain a segment of raw data corresponding to a range of revisions.
+    ///
+    /// Requests for data may be satisfied by a cache.
+    ///
+    /// Returns a 2-tuple of (offset, data) for the requested range of
+    /// revisions. Offset is the integer offset from the beginning of the
+    /// revlog and data is a slice of the raw byte data.
+    ///
+    /// Callers will need to call `self.start(rev)` and `self.length(rev)`
+    /// to determine where each revision's data begins and ends.
+    pub fn get_segment_for_revs(
+        &self,
+        start_rev: Revision,
+        end_rev: Revision,
+    ) -> Result<(usize, Vec<u8>), HgError> {
+        let start = if start_rev == NULL_REVISION {
+            0
+        } else {
+            let start_entry = self
+                .index
+                .get_entry(start_rev)
+                .expect("null revision segment");
+            self.index.start(start_rev, &start_entry)
+        };
+        let end_entry = self
+            .index
+            .get_entry(end_rev)
+            .expect("null revision segment");
+        let end = self.index.start(end_rev, &end_entry) + self.length(end_rev);
+
+        let length = end - start;
+
+        // XXX should we use mmap instead of doing this for platforms that
+        // support madvise/populate?
+        Ok((start, self.segment_file.read_chunk(start, length)?))
+    }
+
+    /// Return the uncompressed raw data for `rev`
+    pub fn chunk_for_rev(&self, rev: Revision) -> Result<Arc<[u8]>, HgError> {
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            if let Some(chunk) = cache.borrow_mut().get(&rev) {
+                return Ok(chunk.clone());
+            }
+        }
+        // TODO revlogv2 should check the compression mode
+        let data = self.get_segment_for_revs(rev, rev)?.1;
+        let uncompressed = self.decompress(&data).map_err(|e| {
+            HgError::abort(
+                format!("revlog decompression error: {}", e),
+                exit_codes::ABORT,
+                None,
+            )
+        })?;
+        let uncompressed: Arc<[u8]> = Arc::from(uncompressed.into_owned());
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            cache.borrow_mut().insert(rev, uncompressed.clone());
+        }
+        Ok(uncompressed)
+    }
+
+    /// Execute `func` within a read context for the data file, meaning that
+    /// the read handle will be taken and discarded after the operation.
+    pub fn with_read<R>(
+        &self,
+        func: impl FnOnce() -> Result<R, RevlogError>,
+    ) -> Result<R, RevlogError> {
+        self.enter_reading_context()?;
+        let res = func();
+        self.exit_reading_context();
+        res.map_err(Into::into)
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn enter_reading_context(&self) -> Result<(), HgError> {
+        if self.is_empty() {
+            // Nothing to be read
+            return Ok(());
+        }
+        if self.delayed_buffer.is_some() && self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        self.segment_file.get_read_handle()?;
+        Ok(())
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn exit_reading_context(&self) {
+        self.segment_file.exit_reading_context()
+    }
+
+    /// Fill the buffer returned by `get_buffer` with the possibly
+    /// un-validated raw text for a revision. It can be already validated if
+    /// it comes from the cache.
+    pub fn raw_text<G, T>(
+        &self,
+        rev: Revision,
+        get_buffer: G,
+    ) -> Result<(), RevlogError>
+    where
+        G: FnOnce(
+            usize,
+            &mut dyn FnMut(
+                &mut dyn RevisionBuffer<Target = T>,
+            ) -> Result<(), RevlogError>,
+        ) -> Result<(), RevlogError>,
+    {
+        let entry = &self.get_entry_for_checked_rev(rev)?;
+        let raw_size = entry.uncompressed_len();
+        let mut mutex_guard = self
+            .last_revision_cache
+            .lock()
+            .expect("lock should not be held");
+        let cached_rev = if let Some((_node, rev, data)) = &*mutex_guard {
+            Some((*rev, data.deref().as_ref()))
+        } else {
+            None
+        };
+        if let Some(cache) = &self.uncompressed_chunk_cache {
+            let cache = &mut cache.borrow_mut();
+            if let Some(size) = raw_size {
+                // Dynamically update the uncompressed_chunk_cache size to the
+                // largest revision we've seen in this revlog.
+                // Do it *before* restoration in case the current revision
+                // is the largest.
+                let factor = self
+                    .data_config
+                    .uncompressed_cache_factor
+                    .expect("cache should not exist without factor");
+                let candidate_size = (size as f64 * factor) as usize;
+                let limiter_mut = cache.limiter_mut();
+                if candidate_size > limiter_mut.max_memory_usage() {
+                    std::mem::swap(
+                        limiter_mut,
+                        &mut ByMemoryUsage::new(candidate_size),
+                    );
+                }
+            }
+        }
+        entry.rawdata(cached_rev, get_buffer)?;
+        // Drop the cache to save memory: the caller is expected to update
+        // the revision cache after validating the text
+        mutex_guard.take();
+        Ok(())
+    }
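The header-byte dispatch in `decompress` above is easiest to read as a routing table: the first byte of a stored chunk selects the decoder. A standalone sketch of just the routing (the concrete byte values for zlib and zstd are assumptions based on the usual revlog encoding, and the real decoding is delegated to the compression module):

    fn route(chunk: &[u8]) -> &'static str {
        match chunk.first().copied() {
            None => "empty: return as-is",
            Some(b'x') => "zlib-compressed",  // assumed value of ZLIB_BYTE
            Some(0x28) => "zstd-compressed",  // assumed value of ZSTD_BYTE
            Some(b'\0') => "stored uncompressed, no header to strip",
            Some(b'u') => "stored uncompressed, strip the leading 'u'",
            Some(_) => "unknown header: unsupported-feature error",
        }
    }

    fn main() {
        assert_eq!(
            route(b"u raw text"),
            "stored uncompressed, strip the leading 'u'"
        );
        assert_eq!(
            route(b"\0binary-safe"),
            "stored uncompressed, no header to strip"
        );
    }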
+
+    /// Only `pub` for `hg-cpython`.
+    /// Obtain decompressed raw data for the specified revisions that are
+    /// assumed to be in ascending order.
+    ///
+    /// Returns a list with decompressed data for each requested revision.
+    #[doc(hidden)]
+    pub fn chunks(
+        &self,
+        revs: Vec<Revision>,
+        target_size: Option<u64>,
+    ) -> Result<Vec<Arc<[u8]>>, RevlogError> {
+        if revs.is_empty() {
+            return Ok(vec![]);
+        }
+        let mut fetched_revs = vec![];
+        let mut chunks = Vec::with_capacity(revs.len());
+
+        match self.uncompressed_chunk_cache.as_ref() {
+            Some(cache) => {
+                let mut cache = cache.borrow_mut();
+                for rev in revs.iter() {
+                    match cache.get(rev) {
+                        Some(hit) => chunks.push((*rev, hit.to_owned())),
+                        None => fetched_revs.push(*rev),
+                    }
+                }
+            }
+            None => fetched_revs = revs,
+        }
+
+        let already_cached = chunks.len();
+
+        let sliced_chunks = if fetched_revs.is_empty() {
+            vec![]
+        } else if !self.data_config.with_sparse_read || self.is_inline() {
+            vec![fetched_revs]
+        } else {
+            self.slice_chunk(&fetched_revs, target_size)?
+        };
+
+        self.with_read(|| {
+            for revs_chunk in sliced_chunks {
+                let first_rev = revs_chunk[0];
+                // Skip trailing revisions with empty diff
+                let last_rev_idx = revs_chunk
+                    .iter()
+                    .rposition(|r| self.length(*r) != 0)
+                    .unwrap_or(revs_chunk.len() - 1);
+
+                let last_rev = revs_chunk[last_rev_idx];
+
+                let (offset, data) =
+                    self.get_segment_for_revs(first_rev, last_rev)?;
+
+                let revs_chunk = &revs_chunk[..=last_rev_idx];
+
+                for rev in revs_chunk {
+                    let chunk_start = self.start(*rev);
+                    let chunk_length = self.length(*rev);
+                    // TODO revlogv2 should check the compression mode
+                    let bytes = &data[chunk_start - offset..][..chunk_length];
+                    let chunk = if !bytes.is_empty() && bytes[0] == ZSTD_BYTE {
+                        // If we're using `zstd`, we want to try a more
+                        // specialized decompression
+                        let entry = self.index.get_entry(*rev).unwrap();
+                        let is_delta = entry
+                            .base_revision_or_base_of_delta_chain()
+                            != (*rev).into();
+                        let uncompressed = uncompressed_zstd_data(
+                            bytes,
+                            is_delta,
+                            entry.uncompressed_len(),
+                        )?;
+                        Cow::Owned(uncompressed)
+                    } else {
+                        // Otherwise just fall back to generic decompression.
+                        self.decompress(bytes)?
+                    };
+
+                    chunks.push((*rev, chunk.into()));
+                }
+            }
+            Ok(())
+        })?;
+
+        if let Some(cache) = self.uncompressed_chunk_cache.as_ref() {
+            let mut cache = cache.borrow_mut();
+            for (rev, chunk) in chunks.iter().skip(already_cached) {
+                cache.insert(*rev, chunk.clone());
+            }
+        }
+        // Use a stable sort here since it's *mostly* sorted
+        chunks.sort_by(|a, b| a.0.cmp(&b.0));
+        Ok(chunks.into_iter().map(|(_r, chunk)| chunk).collect())
+    }
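The inner loop of `chunks` reads one contiguous segment per sliced group and then carves each revision's bytes out of it: `offset` is where the segment starts in the file, so a revision's chunk lives at `start(rev) - offset`. A minimal model of that carving step (illustrative, not the hg-core API):

    fn carve<'a>(
        segment: &'a [u8],
        segment_offset: usize,
        rev_ranges: &[(usize, usize)], // (absolute start, length) per rev
    ) -> Vec<&'a [u8]> {
        rev_ranges
            .iter()
            .map(|&(start, len)| &segment[start - segment_offset..][..len])
            .collect()
    }

    fn main() {
        // One 10-byte segment holding two revisions at absolute offsets
        // 100 and 106.
        let segment = b"aaaaaabbbb";
        let chunks = carve(segment, 100, &[(100, 6), (106, 4)]);
        assert_eq!(chunks, vec![&b"aaaaaa"[..], &b"bbbb"[..]]);
    }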
+
+    /// Slice revs to reduce the amount of unrelated data to be read from
+    /// disk.
+    ///
+    /// ``revs`` is sliced into groups that should be read in one time.
+    /// Assume that revs are sorted.
+    ///
+    /// The initial chunk is sliced until the overall density
+    /// (payload/chunks-span ratio) is above
+    /// `revlog.data_config.sr_density_threshold`.
+    /// No gap smaller than `revlog.data_config.sr_min_gap_size` is skipped.
+    ///
+    /// If `target_size` is set, no chunk larger than `target_size`
+    /// will be returned.
+    /// For consistency with other slicing choices, this limit won't go lower
+    /// than `revlog.data_config.sr_min_gap_size`.
+    ///
+    /// If individual revision chunks are larger than this limit, they will
+    /// still be raised individually.
+    pub fn slice_chunk(
+        &self,
+        revs: &[Revision],
+        target_size: Option<u64>,
+    ) -> Result<Vec<Vec<Revision>>, RevlogError> {
+        let target_size =
+            target_size.map(|size| size.max(self.data_config.sr_min_gap_size));
+
+        let target_density = self.data_config.sr_density_threshold;
+        let min_gap_size = self.data_config.sr_min_gap_size as usize;
+        let to_density = self.index.slice_chunk_to_density(
+            revs,
+            target_density,
+            min_gap_size,
+        );
+
+        let mut sliced = vec![];
+
+        for chunk in to_density {
+            sliced.extend(
+                self.slice_chunk_to_size(&chunk, target_size)?
+                    .into_iter()
+                    .map(ToOwned::to_owned),
+            );
+        }
+
+        Ok(sliced)
+    }
+
+    /// Slice revs to match the target size
+    ///
+    /// This is intended to be used on chunks that density slicing selected,
+    /// but that are still too large compared to the read guarantee of
+    /// revlogs. This might happen when the "minimal gap size" interrupted
+    /// the slicing or when chains are built in a way that creates large
+    /// blocks next to each other.
+    fn slice_chunk_to_size<'a>(
+        &self,
+        revs: &'a [Revision],
+        target_size: Option<u64>,
+    ) -> Result<Vec<&'a [Revision]>, RevlogError> {
+        let mut start_data = self.start(revs[0]);
+        let end_data = self.end(revs[revs.len() - 1]);
+        let full_span = end_data - start_data;
+
+        let nothing_to_do = target_size
+            .map(|size| full_span <= size as usize)
+            .unwrap_or(true);
+
+        if nothing_to_do {
+            return Ok(vec![revs]);
+        }
+        let target_size = target_size.expect("target_size is set") as usize;
+
+        let mut start_rev_idx = 0;
+        let mut end_rev_idx = 1;
+        let mut chunks = vec![];
+
+        for (idx, rev) in revs.iter().enumerate().skip(1) {
+            let span = self.end(*rev) - start_data;
+            let is_snapshot = self.is_snapshot(*rev)?;
+            if span <= target_size && is_snapshot {
+                end_rev_idx = idx + 1;
+            } else {
+                let chunk =
+                    self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
+                if !chunk.is_empty() {
+                    chunks.push(chunk);
+                }
+                start_rev_idx = idx;
+                start_data = self.start(*rev);
+                end_rev_idx = idx + 1;
+            }
+            if !is_snapshot {
+                break;
+            }
+        }
+
+        // For the others, we use binary slicing to quickly converge towards
+        // valid chunks (otherwise, we might end up looking for the start/end
+        // of many revisions). This logic is not looking for the perfect
+        // slicing point, it quickly converges towards valid chunks.
+        let number_of_items = revs.len();
+
+        while (end_data - start_data) > target_size {
+            end_rev_idx = number_of_items;
+            if number_of_items - start_rev_idx <= 1 {
+                // Protect against individual chunks larger than the limit
+                break;
+            }
+            let mut local_end_data = self.end(revs[end_rev_idx - 1]);
+            let mut span = local_end_data - start_data;
+            while span > target_size {
+                if end_rev_idx - start_rev_idx <= 1 {
+                    // Protect against individual chunks larger than the limit
+                    break;
+                }
+                end_rev_idx -= (end_rev_idx - start_rev_idx) / 2;
+                local_end_data = self.end(revs[end_rev_idx - 1]);
+                span = local_end_data - start_data;
+            }
+            let chunk =
+                self.trim_chunk(revs, start_rev_idx, Some(end_rev_idx));
+            if !chunk.is_empty() {
+                chunks.push(chunk);
+            }
+            start_rev_idx = end_rev_idx;
+            start_data = self.start(revs[start_rev_idx]);
+        }
+
+        let chunk = self.trim_chunk(revs, start_rev_idx, None);
+        if !chunk.is_empty() {
+            chunks.push(chunk);
+        }
+
+        Ok(chunks)
+    }
+
+    /// Returns `revs[startidx..endidx]` without empty trailing revs
+    fn trim_chunk<'a>(
+        &self,
+        revs: &'a [Revision],
+        start_rev_idx: usize,
+        end_rev_idx: Option<usize>,
+    ) -> &'a [Revision] {
+        let mut end_rev_idx = end_rev_idx.unwrap_or(revs.len());
+
+        // If we have a non-empty delta candidate, there is nothing to trim
+        if revs[end_rev_idx - 1].0 < self.len() as BaseRevision {
+            // Trim empty revs at the end, except the very first rev of a
+            // chain
+            while end_rev_idx > 1
+                && end_rev_idx > start_rev_idx
+                && self.length(revs[end_rev_idx - 1]) == 0
+            {
+                end_rev_idx -= 1
+            }
+        }
+
+        &revs[start_rev_idx..end_rev_idx]
+    }
+
+    /// Check the hash of some given data against the recorded hash.
+    pub fn check_hash(
+        &self,
+        p1: Revision,
+        p2: Revision,
+        expected: &[u8],
+        data: &[u8],
+    ) -> bool {
+        let e1 = self.index.get_entry(p1);
+        let h1 = match e1 {
+            Some(ref entry) => entry.hash(),
+            None => &NULL_NODE,
+        };
+        let e2 = self.index.get_entry(p2);
+        let h2 = match e2 {
+            Some(ref entry) => entry.hash(),
+            None => &NULL_NODE,
+        };
+
+        hash(data, h1.as_bytes(), h2.as_bytes()) == expected
+    }
+
+    /// Returns whether we are currently in a [`Self::with_write`] context
+    pub fn is_writing(&self) -> bool {
+        self.writing_handles.is_some()
+    }
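The halving step in `slice_chunk_to_size` is the non-obvious part: instead of walking revisions one by one, it repeatedly cuts the candidate index range in half until the byte span fits the target. A sketch of just that convergence, on a list of cumulative end offsets (illustrative model, not the hg-core API):

    // Returns the exclusive end index of the first slice that fits.
    fn shrink(ends: &[usize], start_data: usize, target: usize) -> usize {
        let start_idx = 0;
        let mut end_idx = ends.len();
        let mut span = ends[end_idx - 1] - start_data;
        // Halve the index range until the byte span fits, but never below a
        // single revision (individual chunks may exceed the limit).
        while span > target && end_idx - start_idx > 1 {
            end_idx -= (end_idx - start_idx) / 2;
            span = ends[end_idx - 1] - start_data;
        }
        end_idx
    }

    fn main() {
        // Cumulative end offsets of 8 revisions, 100 bytes each.
        let ends: Vec<usize> = (1..=8).map(|i| i * 100).collect();
        // With a 250-byte budget we converge on the first two revisions.
        assert_eq!(shrink(&ends, 0, 250), 2);
    }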
+
+    /// Open the revlog files for writing
+    ///
+    /// Adding content to a revlog should be done within this context.
+    /// TODO try using `BufRead` and `BufWrite` and see if performance
+    /// improves
+    pub fn with_write<R>(
+        &mut self,
+        transaction: &mut impl Transaction,
+        data_end: Option<usize>,
+        func: impl FnOnce() -> R,
+    ) -> Result<R, HgError> {
+        if self.is_writing() {
+            return Ok(func());
+        }
+        self.enter_writing_context(data_end, transaction)
+            .map_err(|e| {
+                self.exit_writing_context();
+                e
+            })?;
+        let res = func();
+        self.exit_writing_context();
+        Ok(res)
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn exit_writing_context(&mut self) {
+        self.writing_handles.take();
+        self.segment_file.writing_handle.take();
+        self.segment_file.reading_handle.take();
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn python_writing_handles(&self) -> Option<&WriteHandles> {
+        self.writing_handles.as_ref()
+    }
+
+    /// `pub` only for use in hg-cpython
+    #[doc(hidden)]
+    pub fn enter_writing_context(
+        &mut self,
+        data_end: Option<usize>,
+        transaction: &mut impl Transaction,
+    ) -> Result<(), HgError> {
+        let data_size = if self.is_empty() {
+            0
+        } else {
+            self.end(Revision((self.len() - 1) as BaseRevision))
+        };
+        let data_handle = if !self.is_inline() {
+            let data_handle = match self.vfs.open(&self.data_file) {
+                Ok(mut f) => {
+                    if let Some(end) = data_end {
+                        f.seek(SeekFrom::Start(end as u64))
+                            .when_reading_file(&self.data_file)?;
+                    } else {
+                        f.seek(SeekFrom::End(0))
+                            .when_reading_file(&self.data_file)?;
+                    }
+                    f
+                }
+                Err(e) => match e {
+                    HgError::IoError { error, context } => {
+                        if error.kind() != ErrorKind::NotFound {
+                            return Err(HgError::IoError { error, context });
+                        }
+                        self.vfs.create(&self.data_file, true)?
+                    }
+                    e => return Err(e),
+                },
+            };
+            transaction.add(&self.data_file, data_size);
+            Some(FileHandle::from_file(
+                data_handle,
+                dyn_clone::clone_box(&*self.vfs),
+                &self.data_file,
+            ))
+        } else {
+            None
+        };
+        let index_size = self.len() * INDEX_ENTRY_SIZE;
+        let index_handle = self.index_write_handle()?;
+        if self.is_inline() {
+            transaction.add(&self.index_file, data_size);
+        } else {
+            transaction.add(&self.index_file, index_size);
+        }
+        self.writing_handles = Some(WriteHandles {
+            index_handle: index_handle.clone(),
+            data_handle: data_handle.clone(),
+        });
+        *self.segment_file.reading_handle.borrow_mut() = if self.is_inline() {
+            Some(index_handle)
+        } else {
+            data_handle
+        };
+        Ok(())
+    }
+
+    /// Get a write handle to the index, positioned at the end of its data.
+    fn index_write_handle(&self) -> Result<FileHandle, HgError> {
+        let res = if self.delayed_buffer.is_none() {
+            if self.data_config.check_ambig {
+                self.vfs.open_check_ambig(&self.index_file)
+            } else {
+                self.vfs.open(&self.index_file)
+            }
+        } else {
+            self.vfs.open(&self.index_file)
+        };
+        match res {
+            Ok(mut handle) => {
+                handle
+                    .seek(SeekFrom::End(0))
+                    .when_reading_file(&self.index_file)?;
+                Ok(
+                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
+                    {
+                        FileHandle::from_file_delayed(
+                            handle,
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            delayed_buffer.clone(),
+                        )?
+                    } else {
+                        FileHandle::from_file(
+                            handle,
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                        )
+                    },
+                )
+            }
+            Err(e) => match e {
+                HgError::IoError { error, context } => {
+                    if error.kind() != ErrorKind::NotFound {
+                        return Err(HgError::IoError { error, context });
+                    };
+                    if let Some(delayed_buffer) = self.delayed_buffer.as_ref()
+                    {
+                        FileHandle::new_delayed(
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            true,
+                            delayed_buffer.clone(),
+                        )
+                    } else {
+                        FileHandle::new(
+                            dyn_clone::clone_box(&*self.vfs),
+                            &self.index_file,
+                            true,
+                            true,
+                        )
+                    }
+                }
+                e => Err(e),
+            },
+        }
+    }
+
+    /// Split the data of an inline revlog into an index and a data file
+    pub fn split_inline(
+        &mut self,
+        header: IndexHeader,
+        new_index_file_path: Option<PathBuf>,
+    ) -> Result<PathBuf, RevlogError> {
+        assert!(self.delayed_buffer.is_none());
+        let existing_handles = self.writing_handles.is_some();
+        if let Some(handles) = &mut self.writing_handles {
+            handles.index_handle.flush()?;
+            self.writing_handles.take();
+            self.segment_file.writing_handle.take();
+        }
+        let mut new_data_file_handle =
+            self.vfs.create(&self.data_file, true)?;
+        // Drop any potential data, possibly redundant with the VFS impl.
+        new_data_file_handle
+            .set_len(0)
+            .when_writing_file(&self.data_file)?;
+
+        self.with_read(|| -> Result<(), RevlogError> {
+            for r in 0..self.index.len() {
+                let rev = Revision(r as BaseRevision);
+                let rev_segment = self.get_segment_for_revs(rev, rev)?.1;
+                new_data_file_handle
+                    .write_all(&rev_segment)
+                    .when_writing_file(&self.data_file)?;
+            }
+            new_data_file_handle
+                .flush()
+                .when_writing_file(&self.data_file)?;
+            Ok(())
+        })?;
+
+        if let Some(index_path) = new_index_file_path {
+            self.index_file = index_path
+        }
+
+        let mut new_index_handle = self.vfs.create(&self.index_file, true)?;
+        let mut new_data = Vec::with_capacity(self.len() * INDEX_ENTRY_SIZE);
+        for r in 0..self.len() {
+            let rev = Revision(r as BaseRevision);
+            let entry = self.index.entry_binary(rev).unwrap_or_else(|| {
+                panic!(
+                    "entry {} should exist in {}",
+                    r,
+                    self.index_file.display()
+                )
+            });
+            if r == 0 {
+                new_data.extend(header.header_bytes);
+            }
+            new_data.extend(entry);
+        }
+        new_index_handle
+            .write_all(&new_data)
+            .when_writing_file(&self.index_file)?;
+        // Replace the index with a new one because the buffer contains inline
+        // data
+        self.index = Index::new(Box::new(new_data), header)?;
+        self.inline = false;
+
+        self.segment_file = RandomAccessFile::new(
+            dyn_clone::clone_box(&*self.vfs),
+            self.data_file.to_owned(),
+        );
+        if existing_handles {
+            // Switched from inline to conventional, reopen the index
+            let new_data_handle = Some(FileHandle::from_file(
+                new_data_file_handle,
+                dyn_clone::clone_box(&*self.vfs),
+                &self.data_file,
+            ));
+            self.writing_handles = Some(WriteHandles {
+                index_handle: self.index_write_handle()?,
+                data_handle: new_data_handle.clone(),
+            });
+            *self.segment_file.writing_handle.borrow_mut() = new_data_handle;
+        }
+
+        Ok(self.index_file.to_owned())
+    }
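`split_inline` is a format migration: an inline revlog interleaves `[entry][data][entry][data]...` in the `.i` file, and splitting copies every data chunk out to the `.d` file while rewriting the index as fixed-size entries only. A plain-vector model of that essence, with a 4-byte stand-in "entry" (illustrative, not the hg-core API):

    const ENTRY: usize = 4;

    fn split(inline: &[(Vec<u8>, Vec<u8>)]) -> (Vec<u8>, Vec<u8>) {
        let mut index = Vec::new();
        let mut data = Vec::new();
        for (entry, chunk) in inline {
            assert_eq!(entry.len(), ENTRY);
            index.extend_from_slice(entry); // .i keeps fixed-size entries
            data.extend_from_slice(chunk);  // data chunks move to the .d file
        }
        (index, data)
    }

    fn main() {
        let inline = vec![
            (b"e0e0".to_vec(), b"rev0".to_vec()),
            (b"e1e1".to_vec(), b"revision-1".to_vec()),
        ];
        let (index, data) = split(&inline);
        assert_eq!(index.len(), 2 * ENTRY);
        assert_eq!(data, b"rev0revision-1".to_vec());
    }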
+
+    /// Write a new entry to this revlog.
+    /// - `entry` is the index bytes
+    /// - `header_and_data` is the compression header and the revision data
+    /// - `offset` is the position in the data file to write to
+    /// - `index_end` is the overwritten position in the index in revlog-v2,
+    ///   since the format may allow a rewrite of garbage data at the end.
+    /// - `data_end` is the overwritten position in the data-file in
+    ///   revlog-v2, since the format may allow a rewrite of garbage data at
+    ///   the end.
+    ///
+    /// XXX Why do we have `data_end` *and* `offset`? Same question in Python
+    pub fn write_entry(
+        &mut self,
+        mut transaction: impl Transaction,
+        entry: &[u8],
+        header_and_data: (&[u8], &[u8]),
+        mut offset: usize,
+        index_end: Option<u64>,
+        data_end: Option<u64>,
+    ) -> Result<(u64, Option<u64>), HgError> {
+        let current_revision = self.len() - 1;
+        let canonical_index_file = self.canonical_index_file();
+
+        let is_inline = self.is_inline();
+        let handles = match &mut self.writing_handles {
+            None => {
+                return Err(HgError::abort(
+                    "adding revision outside of the `with_write` context",
+                    exit_codes::ABORT,
+                    None,
+                ));
+            }
+            Some(handles) => handles,
+        };
+        let index_handle = &mut handles.index_handle;
+        let data_handle = &mut handles.data_handle;
+        if let Some(end) = index_end {
+            index_handle
+                .seek(SeekFrom::Start(end))
+                .when_reading_file(&self.index_file)?;
+        } else {
+            index_handle
+                .seek(SeekFrom::End(0))
+                .when_reading_file(&self.index_file)?;
+        }
+        if let Some(data_handle) = data_handle {
+            if let Some(end) = data_end {
+                data_handle
+                    .seek(SeekFrom::Start(end))
+                    .when_reading_file(&self.data_file)?;
+            } else {
+                data_handle
+                    .seek(SeekFrom::End(0))
+                    .when_reading_file(&self.data_file)?;
+            }
+        }
+        let (header, data) = header_and_data;
+
+        if !is_inline {
+            transaction.add(&self.data_file, offset);
+            transaction
+                .add(&canonical_index_file, current_revision * entry.len());
+            let data_handle = data_handle
+                .as_mut()
+                .expect("data handle should exist when not inline");
+            if !header.is_empty() {
+                data_handle.write_all(header)?;
+            }
+            data_handle.write_all(data)?;
+            match &mut self.delayed_buffer {
+                Some(buf) => {
+                    buf.lock()
+                        .expect("propagate the panic")
+                        .buffer
+                        .write_all(entry)
+                        .expect("write to delay buffer should succeed");
+                }
+                None => index_handle.write_all(entry)?,
+            }
+        } else if self.delayed_buffer.is_some() {
+            return Err(HgError::abort(
+                "invalid delayed write on inline revlog",
+                exit_codes::ABORT,
+                None,
+            ));
+        } else {
+            offset += current_revision * entry.len();
+            transaction.add(&canonical_index_file, offset);
+            index_handle.write_all(entry)?;
+            index_handle.write_all(header)?;
+            index_handle.write_all(data)?;
+        }
+        let data_position = match data_handle {
+            Some(h) => Some(h.position()?),
+            None => None,
+        };
+        Ok((index_handle.position()?, data_position))
+    }
+
+    /// Return the real target index file and not the temporary when diverting
+    pub fn canonical_index_file(&self) -> PathBuf {
+        self.original_index_file
+            .as_ref()
+            .map(ToOwned::to_owned)
+            .unwrap_or_else(|| self.index_file.to_owned())
+    }
+
+    /// Return the path to the diverted index
+    fn diverted_index(&self) -> PathBuf {
+        self.index_file.with_extension("i.a")
+    }
+
+    /// True if we're in a [`Self::with_write`] or [`Self::with_read`]
+    /// context
+    pub fn is_open(&self) -> bool {
+        self.segment_file.is_open()
+    }
+
+    /// Set this revlog to delay its writes to a buffer
+    pub fn delay(&mut self) -> Result<Option<PathBuf>, HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        if self.delayed_buffer.is_some() || self.original_index_file.is_some()
+        {
+            // Delay or divert already happening
+            return Ok(None);
+        }
+        if self.is_empty() {
+            self.original_index_file = Some(self.index_file.to_owned());
+            self.index_file = self.diverted_index();
+            if self.vfs.exists(&self.index_file) {
+                self.vfs.unlink(&self.index_file)?;
+            }
+            Ok(Some(self.index_file.to_owned()))
+        } else {
+            self.delayed_buffer =
+                Some(Arc::new(Mutex::new(DelayedBuffer::default())));
+            Ok(None)
+        }
+    }
+
+    /// Write the pending data (in memory) if any to the diverted index file
+    /// (on disk temporary file)
+    pub fn write_pending(
+        &mut self,
+    ) -> Result<(Option<PathBuf>, bool), HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        if self.original_index_file.is_some() {
+            return Ok((None, true));
+        }
+        let mut any_pending = false;
+        let pending_index_file = self.diverted_index();
+        if self.vfs.exists(&pending_index_file) {
+            self.vfs.unlink(&pending_index_file)?;
+        }
+        self.vfs.copy(&self.index_file, &pending_index_file)?;
+        if let Some(delayed_buffer) = self.delayed_buffer.take() {
+            let mut index_file_handle = self.vfs.open(&pending_index_file)?;
+            index_file_handle
+                .seek(SeekFrom::End(0))
+                .when_writing_file(&pending_index_file)?;
+            let delayed_data =
+                &delayed_buffer.lock().expect("propagate the panic").buffer;
+            index_file_handle
+                .write_all(delayed_data)
+                .when_writing_file(&pending_index_file)?;
+            any_pending = true;
+        }
+        self.original_index_file = Some(self.index_file.to_owned());
+        self.index_file = pending_index_file;
+        Ok((Some(self.index_file.to_owned()), any_pending))
+    }
+
+    /// Overwrite the canonical file with the diverted file, or write out the
+    /// delayed buffer.
+    /// Returns an error if the revlog is neither diverted nor delayed.
+    pub fn finalize_pending(&mut self) -> Result<PathBuf, HgError> {
+        assert!(!self.is_open());
+        if self.is_inline() {
+            return Err(HgError::abort(
+                "revlog with delayed write should not be inline",
+                exit_codes::ABORT,
+                None,
+            ));
+        }
+        match (
+            self.delayed_buffer.as_ref(),
+            self.original_index_file.as_ref(),
+        ) {
+            (None, None) => {
+                return Err(HgError::abort(
+                    "neither delay nor divert found on this revlog",
+                    exit_codes::ABORT,
+                    None,
+                ));
+            }
+            (Some(delay), None) => {
+                let mut index_file_handle = self.vfs.open(&self.index_file)?;
+                index_file_handle
+                    .seek(SeekFrom::End(0))
+                    .when_writing_file(&self.index_file)?;
+                index_file_handle
+                    .write_all(
+                        &delay.lock().expect("propagate the panic").buffer,
+                    )
+                    .when_writing_file(&self.index_file)?;
+                self.delayed_buffer = None;
+            }
+            (None, Some(divert)) => {
+                if self.vfs.exists(&self.index_file) {
+                    self.vfs.rename(&self.index_file, divert, true)?;
+                }
+                divert.clone_into(&mut self.index_file);
+                self.original_index_file = None;
+            }
+            (Some(_), Some(_)) => unreachable!(
+                "{} is in an inconsistent state of both delay and divert",
+                self.canonical_index_file().display(),
+            ),
+        }
+        Ok(self.canonical_index_file())
+    }
+
+    /// `pub` only for `hg-cpython`. This is made a different method than
+    /// [`Revlog::index`] in case there is a different invariant that pops up
+    /// later.
+    #[doc(hidden)]
+    pub fn shared_index(&self) -> &Index {
+        &self.index
+    }
+}
+
+/// The use of a [`RefCell`] assumes that a given revlog will only
+/// be accessed (read or write) by a single thread.
+type UncompressedChunkCache =
+    RefCell<LruMap<Revision, Arc<[u8]>, ByMemoryUsage>>;
+
+/// The node, revision and data for the last revision we've seen. Speeds up
+/// a lot of sequential operations of the revlog.
+///
+/// The data is not just bytes since it can come from Python and we want to
+/// avoid copies if possible.
+type SingleRevisionCache =
+    (Node, Revision, Box<dyn Deref<Target = [u8]> + Send>);
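`delay`, `write_pending` and `finalize_pending` above move the revlog between three states: no pending writes, writes diverted to a temporary "<name>.i.a" file (empty revlog), and writes buffered in memory (non-empty revlog). A tiny state-machine sketch of those transitions (illustrative, not the hg-core API):

    #[derive(Debug, PartialEq)]
    enum Pending {
        None,
        Diverted, // original_index_file is set; writes go to the .i.a file
        Buffered, // delayed_buffer is set; writes accumulate in memory
    }

    fn delay(is_empty: bool) -> Pending {
        if is_empty {
            Pending::Diverted
        } else {
            Pending::Buffered
        }
    }

    fn finalize(state: Pending) -> Result<&'static str, &'static str> {
        match state {
            Pending::None => Err("neither delay nor divert on this revlog"),
            Pending::Diverted => Ok("rename the .i.a file over the index"),
            Pending::Buffered => Ok("append the memory buffer to the index"),
        }
    }

    fn main() {
        assert_eq!(delay(true), Pending::Diverted);
        assert_eq!(delay(false), Pending::Buffered);
        assert!(finalize(Pending::None).is_err());
    }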
+
+/// A way of progressively filling a buffer with revision data, then
+/// returning that buffer. Used to abstract away Python-allocated code to
+/// reduce copying, for performance reasons.
+pub trait RevisionBuffer {
+    /// The owned buffer type to return
+    type Target;
+    /// Copies the slice into the buffer
+    fn extend_from_slice(&mut self, slice: &[u8]);
+    /// Returns the now finished owned buffer
+    fn finish(self) -> Self::Target;
+}
+
+/// A simple vec-based buffer. This is uselessly complicated for the pure
+/// Rust case, but it's the price to pay for Python compatibility.
+#[derive(Debug)]
+pub(super) struct CoreRevisionBuffer {
+    buf: Vec<u8>,
+}
+
+impl CoreRevisionBuffer {
+    pub fn new() -> Self {
+        Self { buf: vec![] }
+    }
+
+    #[inline]
+    pub fn resize(&mut self, size: usize) {
+        self.buf.reserve_exact(size - self.buf.capacity());
+    }
+}
+
+impl RevisionBuffer for CoreRevisionBuffer {
+    type Target = Vec<u8>;
+
+    #[inline]
+    fn extend_from_slice(&mut self, slice: &[u8]) {
+        self.buf.extend_from_slice(slice);
+    }
+
+    #[inline]
+    fn finish(self) -> Self::Target {
+        self.buf
+    }
+}
+
+/// Calculate the hash of a revision given its data and its parents.
+pub fn hash(
+    data: &[u8],
+    p1_hash: &[u8],
+    p2_hash: &[u8],
+) -> [u8; NODE_BYTES_LENGTH] {
+    let mut hasher = Sha1::new();
+    let (a, b) = (p1_hash, p2_hash);
+    if a > b {
+        hasher.update(b);
+        hasher.update(a);
+    } else {
+        hasher.update(a);
+        hasher.update(b);
+    }
+    hasher.update(data);
+    *hasher.finalize().as_ref()
+}
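The point of the `RevisionBuffer` trait is that any growable byte sink can receive revision data, whether it is Rust- or Python-allocated. A second, purely illustrative implementation makes that concrete (the trait is restated locally so the sketch is self-contained; it mirrors, but is not, the hg-core definition):

    pub trait RevisionBuffer {
        type Target;
        fn extend_from_slice(&mut self, slice: &[u8]);
        fn finish(self) -> Self::Target;
    }

    // A sink that also counts how many slices were appended.
    struct CountingBuffer {
        buf: Vec<u8>,
        writes: usize,
    }

    impl RevisionBuffer for CountingBuffer {
        type Target = (Vec<u8>, usize);

        fn extend_from_slice(&mut self, slice: &[u8]) {
            self.writes += 1;
            self.buf.extend_from_slice(slice);
        }

        fn finish(self) -> Self::Target {
            (self.buf, self.writes)
        }
    }

    fn main() {
        let mut b = CountingBuffer { buf: vec![], writes: 0 };
        b.extend_from_slice(b"delta 1");
        b.extend_from_slice(b"delta 2");
        let (bytes, writes) = b.finish();
        assert_eq!((bytes.len(), writes), (14, 2));
    }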
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/manifest.rs
--- a/rust/hg-core/src/revlog/manifest.rs Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/revlog/manifest.rs Wed Nov 20 15:53:19 2024 +0100
@@ -6,9 +6,9 @@
 use crate::utils::hg_path::HgPath;
 use crate::utils::SliceExt;
 use crate::vfs::VfsImpl;
-use crate::{
-    Graph, GraphError, Revision, RevlogOpenOptions, UncheckedRevision,
-};
+use crate::{Graph, GraphError, Revision, UncheckedRevision};
+
+use super::options::RevlogOpenOptions;
 
 /// A specialized `Revlog` to work with `manifest` data format.
 pub struct Manifestlog {
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/mod.rs
--- a/rust/hg-core/src/revlog/mod.rs Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/hg-core/src/revlog/mod.rs Wed Nov 20 15:53:19 2024 +0100
@@ -9,35 +9,35 @@
 pub mod nodemap;
 mod nodemap_docket;
 pub mod path_encode;
-pub use node::{FromHexError, Node, NodePrefix};
+use inner_revlog::CoreRevisionBuffer;
+use inner_revlog::InnerRevlog;
+use inner_revlog::RevisionBuffer;
+use memmap2::MmapOptions;
+pub use node::{FromHexError, Node, NodePrefix, NULL_NODE, NULL_NODE_ID};
+use options::RevlogOpenOptions;
 pub mod changelog;
+pub mod compression;
+pub mod file_io;
 pub mod filelog;
 pub mod index;
+pub mod inner_revlog;
 pub mod manifest;
+pub mod options;
 pub mod patch;
 
 use std::borrow::Cow;
-use std::collections::HashSet;
+use std::io::ErrorKind;
 use std::io::Read;
 use std::ops::Deref;
 use std::path::Path;
 
-use flate2::read::ZlibDecoder;
-use sha1::{Digest, Sha1};
-use std::cell::RefCell;
-use zstd;
-
-use self::node::{NODE_BYTES_LENGTH, NULL_NODE};
 use self::nodemap_docket::NodeMapDocket;
-use super::index::Index;
-use super::index::INDEX_ENTRY_SIZE;
-use super::nodemap::{NodeMap, NodeMapError};
-use crate::config::{Config, ResourceProfileValue};
 use crate::errors::HgError;
+use crate::errors::IoResultExt;
 use crate::exit_codes;
-use crate::requirements::{
-    GENERALDELTA_REQUIREMENT, NARROW_REQUIREMENT, SPARSEREVLOG_REQUIREMENT,
-};
+use crate::revlog::index::Index;
+use crate::revlog::nodemap::{NodeMap, NodeMapError};
+use crate::vfs::Vfs;
 use crate::vfs::VfsImpl;
 
 /// As noted in revlog.c, revision numbers are actually encoded in
@@ -258,461 +258,17 @@
     }
 }
 
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub enum CompressionEngine {
-    Zlib {
-        /// Between 0 and 9 included
-        level: u32,
-    },
-    Zstd {
-        /// Between 0 and 22 included
-        level: u32,
-        /// Never used in practice for now
-        threads: u32,
-    },
-    /// No compression is performed
-    None,
-}
-impl CompressionEngine {
-    pub fn set_level(&mut self, new_level: usize) -> Result<(), HgError> {
-        match self {
-            CompressionEngine::Zlib { level } => {
-                if new_level > 9 {
-                    return Err(HgError::abort(
-                        format!(
-                            "invalid compression zlib compression level {}",
-                            new_level
-                        ),
-                        exit_codes::ABORT,
-                        None,
-                    ));
-                }
-                *level = new_level as u32;
-            }
-            CompressionEngine::Zstd { level, .. } => {
-                if new_level > 22 {
-                    return Err(HgError::abort(
-                        format!(
-                            "invalid compression zstd compression level {}",
-                            new_level
-                        ),
-                        exit_codes::ABORT,
-                        None,
-                    ));
-                }
-                *level = new_level as u32;
-            }
-            CompressionEngine::None => {}
-        }
-        Ok(())
-    }
-
-    pub fn zstd(
-        zstd_level: Option<u32>,
-    ) -> Result<Self, HgError> {
-        let mut engine = CompressionEngine::Zstd {
-            level: 3,
-            threads: 0,
-        };
-        if let Some(level) = zstd_level {
-            engine.set_level(level as usize)?;
-        }
-        Ok(engine)
-    }
-}
-
-impl Default for CompressionEngine {
-    fn default() -> Self {
-        Self::Zlib { level: 6 }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq)]
-/// Holds configuration values about how the revlog data is read
-pub struct RevlogDataConfig {
-    /// Should we try to open the "pending" version of the revlog
-    pub try_pending: bool,
-    /// Should we try to open the "split" version of the revlog
-    pub try_split: bool,
-    /// When True, `indexfile` should be opened with `checkambig=True` at
-    /// writing time, to avoid file stat ambiguity
-    pub check_ambig: bool,
-    /// If true, use mmap instead of reading to deal with large indexes
-    pub mmap_large_index: bool,
-    /// How much data is considered large
-    pub mmap_index_threshold: Option<u64>,
-    /// How much data to read and cache into the raw revlog data cache
-    pub chunk_cache_size: u64,
-    /// The size of the uncompressed cache compared to the largest revision
-    /// seen
-    pub uncompressed_cache_factor: Option<f64>,
-    /// The number of chunks cached
-    pub uncompressed_cache_count: Option<u64>,
-    /// Allow sparse reading of the revlog data
-    pub with_sparse_read: bool,
-    /// Minimal density of a sparse read chunk
-    pub sr_density_threshold: f64,
-    /// Minimal size of the data we skip when performing sparse reads
-    pub sr_min_gap_size: u64,
-    /// Whether deltas are encoded against arbitrary bases
-    pub general_delta: bool,
-}
-
-impl RevlogDataConfig {
-    pub fn new(
-        config: &Config,
-        requirements: &HashSet<String>,
-    ) -> Result<Self, HgError> {
-        let mut data_config = Self::default();
-        if let Some(chunk_cache_size) =
-            config.get_byte_size(b"format", b"chunkcachesize")?
-        {
-            data_config.chunk_cache_size = chunk_cache_size;
-        }
-
-        let memory_profile = config.get_resource_profile(Some("memory"));
-        if memory_profile.value >= ResourceProfileValue::Medium {
-            data_config.uncompressed_cache_count = Some(10_000);
-            data_config.uncompressed_cache_factor = Some(4.0);
-            if memory_profile.value >= ResourceProfileValue::High {
-                data_config.uncompressed_cache_factor = Some(10.0)
-            }
-        }
-
-        if let Some(mmap_index_threshold) = config
-            .get_byte_size(b"storage", b"revlog.mmap.index:size-threshold")?
-        {
-            data_config.mmap_index_threshold = Some(mmap_index_threshold);
-        }
-
-        let with_sparse_read =
-            config.get_bool(b"experimental", b"sparse-read")?;
-        if let Some(sr_density_threshold) = config
-            .get_f64(b"experimental", b"sparse-read.density-threshold")?
-        {
-            data_config.sr_density_threshold = sr_density_threshold;
-        }
-        data_config.with_sparse_read = with_sparse_read;
-        if let Some(sr_min_gap_size) = config
-            .get_byte_size(b"experimental", b"sparse-read.min-gap-size")?
-        {
-            data_config.sr_min_gap_size = sr_min_gap_size;
-        }
-
-        data_config.with_sparse_read =
-            requirements.contains(SPARSEREVLOG_REQUIREMENT);
-
-        Ok(data_config)
-    }
-}
-
-impl Default for RevlogDataConfig {
-    fn default() -> Self {
-        Self {
-            chunk_cache_size: 65536,
-            sr_density_threshold: 0.50,
-            sr_min_gap_size: 262144,
-            try_pending: Default::default(),
-            try_split: Default::default(),
-            check_ambig: Default::default(),
-            mmap_large_index: Default::default(),
-            mmap_index_threshold: Default::default(),
-            uncompressed_cache_factor: Default::default(),
-            uncompressed_cache_count: Default::default(),
-            with_sparse_read: Default::default(),
-            general_delta: Default::default(),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq)]
-/// Holds configuration values about how new deltas are computed.
-///
-/// Some attributes are duplicated from [`RevlogDataConfig`] to help having
-/// each object self contained.
-pub struct RevlogDeltaConfig {
-    /// Whether deltas can be encoded against arbitrary bases
-    pub general_delta: bool,
-    /// Allow sparse writing of the revlog data
-    pub sparse_revlog: bool,
-    /// Maximum length of a delta chain
-    pub max_chain_len: Option<u64>,
-    /// Maximum distance between a delta chain's start and end
-    pub max_deltachain_span: Option<u64>,
-    /// If `upper_bound_comp` is not None, this is the expected maximal
-    /// gain from compression for the data content
-    pub upper_bound_comp: Option<f64>,
-    /// Should we try a delta against both parents
-    pub delta_both_parents: bool,
-    /// Test delta base candidate groups by chunks of this maximal size
-    pub candidate_group_chunk_size: u64,
-    /// Should we display debug information about delta computation
-    pub debug_delta: bool,
-    /// Trust incoming deltas by default
-    pub lazy_delta: bool,
-    /// Trust the base of incoming deltas by default
-    pub lazy_delta_base: bool,
-}
-impl RevlogDeltaConfig {
-    pub fn new(
-        config: &Config,
-        requirements: &HashSet<String>,
-        revlog_type: RevlogType,
-    ) -> Result<Self, HgError> {
-        let mut delta_config = Self {
-            delta_both_parents: config
-                .get_option_no_default(
-                    b"storage",
-                    b"revlog.optimize-delta-parent-choice",
-                )?
-                .unwrap_or(true),
-            candidate_group_chunk_size: config
-                .get_u64(
-                    b"storage",
-                    b"revlog.delta-parent-search.candidate-group-chunk-size",
-                )?
-                .unwrap_or_default(),
-            ..Default::default()
-        };
-
-        delta_config.debug_delta =
-            config.get_bool(b"debug", b"revlog.debug-delta")?;
-
-        delta_config.general_delta =
-            requirements.contains(GENERALDELTA_REQUIREMENT);
-
-        let lazy_delta =
-            config.get_bool(b"storage", b"revlog.reuse-external-delta")?;
-
-        if revlog_type == RevlogType::Manifestlog {
-            // upper bound of what we expect from compression
-            // (real life value seems to be 3)
-            delta_config.upper_bound_comp = Some(3.0)
-        }
-
-        let mut lazy_delta_base = false;
-        if lazy_delta {
-            lazy_delta_base = match config.get_option_no_default(
-                b"storage",
-                b"revlog.reuse-external-delta-parent",
-            )? {
-                Some(base) => base,
-                None => config.get_bool(b"format", b"generaldelta")?,
-            };
-        }
-        delta_config.lazy_delta = lazy_delta;
-        delta_config.lazy_delta_base = lazy_delta_base;
-
-        delta_config.max_deltachain_span =
-            match config.get_i64(b"experimental", b"maxdeltachainspan")? {
-                Some(span) => {
-                    if span < 0 {
-                        None
-                    } else {
-                        Some(span as u64)
-                    }
-                }
-                None => None,
-            };
-
-        delta_config.sparse_revlog =
-            requirements.contains(SPARSEREVLOG_REQUIREMENT);
-
-        delta_config.max_chain_len =
-            config.get_byte_size_no_default(b"format", b"maxchainlen")?;
-
-        Ok(delta_config)
-    }
-}
-
-impl Default for RevlogDeltaConfig {
-    fn default() -> Self {
-        Self {
-            delta_both_parents: true,
-            lazy_delta: true,
-            general_delta: Default::default(),
-            sparse_revlog: Default::default(),
-            max_chain_len: Default::default(),
-            max_deltachain_span: Default::default(),
-            upper_bound_comp: Default::default(),
-            candidate_group_chunk_size: Default::default(),
-            debug_delta: Default::default(),
-            lazy_delta_base: Default::default(),
-        }
-    }
-}
-
-#[derive(Debug, Default, Clone, Copy, PartialEq)]
-/// Holds configuration values about the available revlog features
-pub struct RevlogFeatureConfig {
-    /// The compression engine and its options
-    pub compression_engine: CompressionEngine,
-    /// Can we use censor on this revlog
-    pub censorable: bool,
-    /// Does this revlog use the "side data" feature
-    pub has_side_data: bool,
-    /// Might remove this configuration once the rank computation has no
-    /// impact
-    pub compute_rank: bool,
-    /// Parent order is supposed to be semantically irrelevant, so we
-    /// normally re-sort parents to ensure that the first parent is non-null,
-    /// if there is a non-null parent at all.
-    /// filelog abuses the parent order as a flag to mark some instances of
-    /// meta-encoded files, so allow it to disable this behavior.
-    pub canonical_parent_order: bool,
-    /// Can ellipsis commit be used
-    pub enable_ellipsis: bool,
-}
-impl RevlogFeatureConfig {
-    pub fn new(
-        config: &Config,
-        requirements: &HashSet<String>,
-    ) -> Result<Self, HgError> {
-        let mut feature_config = Self::default();
-
-        let zlib_level = config.get_u32(b"storage", b"revlog.zlib.level")?;
-        let zstd_level = config.get_u32(b"storage", b"revlog.zstd.level")?;
-
-        feature_config.compression_engine = CompressionEngine::default();
-
-        for requirement in requirements {
-            if requirement.starts_with("revlog-compression-")
-                || requirement.starts_with("exp-compression-")
-            {
-                let split = &mut requirement.splitn(3, '-');
-                split.next();
-                split.next();
-                feature_config.compression_engine = match split.next().unwrap()
-                {
-                    "zstd" => CompressionEngine::zstd(zstd_level)?,
-                    e => {
-                        return Err(HgError::UnsupportedFeature(format!(
-                            "Unsupported compression engine '{e}'"
-                        )))
-                    }
-                };
-            }
-        }
-        if let Some(level) = zlib_level {
-            if matches!(
-                feature_config.compression_engine,
-                CompressionEngine::Zlib { .. }
-            ) {
-                feature_config
-                    .compression_engine
-                    .set_level(level as usize)?;
-            }
-        }
-
-        feature_config.enable_ellipsis =
-            requirements.contains(NARROW_REQUIREMENT);
-
-        Ok(feature_config)
-    }
-}
-
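The removed `CompressionEngine::set_level` (its replacement now lives in `options.rs`) enforces the classic level bounds: zlib accepts 0 through 9, zstd 0 through 22, and `None` ignores the level. A standalone model of that validation rule (illustrative, not the hg-core API):

    enum Engine {
        Zlib { level: u32 },
        Zstd { level: u32 },
        None,
    }

    fn set_level(engine: &mut Engine, new_level: u32) -> Result<(), String> {
        match engine {
            Engine::Zlib { level } if new_level <= 9 => *level = new_level,
            Engine::Zstd { level } if new_level <= 22 => *level = new_level,
            Engine::None => {} // no compression: level is meaningless
            _ => return Err(format!("invalid compression level {}", new_level)),
        }
        Ok(())
    }

    fn main() {
        let mut e = Engine::Zstd { level: 3 };
        assert!(set_level(&mut e, 22).is_ok());
        assert!(set_level(&mut e, 23).is_err());
    }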
- index: Index, - /// When index and data are not interleaved: bytes of the revlog data - data_bytes: Option + Send>>, + inner: InnerRevlog, /// When present on disk: the persistent nodemap for this revlog nodemap: Option, } impl Graph for Revlog { fn parents(&self, rev: Revision) -> Result<[Revision; 2], GraphError> { - self.index.parents(rev) - } -} - -#[derive(Debug, Copy, Clone, PartialEq)] -pub enum RevlogVersionOptions { - V0, - V1 { general_delta: bool, inline: bool }, - V2, - ChangelogV2 { compute_rank: bool }, -} - -/// Options to govern how a revlog should be opened, usually from the -/// repository configuration or requirements. -#[derive(Debug, Copy, Clone)] -pub struct RevlogOpenOptions { - /// The revlog version, along with any option specific to this version - pub version: RevlogVersionOptions, - /// Whether the revlog uses a persistent nodemap. - pub use_nodemap: bool, - pub delta_config: RevlogDeltaConfig, - pub data_config: RevlogDataConfig, - pub feature_config: RevlogFeatureConfig, -} - -#[cfg(test)] -impl Default for RevlogOpenOptions { - fn default() -> Self { - Self { - version: RevlogVersionOptions::V1 { - general_delta: true, - inline: false, - }, - use_nodemap: true, - data_config: Default::default(), - delta_config: Default::default(), - feature_config: Default::default(), - } + self.index().parents(rev) } } - -impl RevlogOpenOptions { - pub fn new( - inline: bool, - data_config: RevlogDataConfig, - delta_config: RevlogDeltaConfig, - feature_config: RevlogFeatureConfig, - ) -> Self { - Self { - version: RevlogVersionOptions::V1 { - general_delta: data_config.general_delta, - inline, - }, - use_nodemap: false, - data_config, - delta_config, - feature_config, - } - } - - pub fn index_header(&self) -> index::IndexHeader { - index::IndexHeader { - header_bytes: match self.version { - RevlogVersionOptions::V0 => [0, 0, 0, 0], - RevlogVersionOptions::V1 { - general_delta, - inline, - } => [ - 0, - if general_delta && inline { - 3 - } else if general_delta { - 2 - } else { - u8::from(inline) - }, - 0, - 1, - ], - RevlogVersionOptions::V2 => 0xDEADu32.to_be_bytes(), - RevlogVersionOptions::ChangelogV2 { compute_rank: _ } => { - 0xD34Du32.to_be_bytes() - } - }, - } - } -} - impl Revlog { /// Open a revlog index file. /// @@ -728,6 +284,10 @@ Self::open_gen(store_vfs, index_path, data_path, options, None) } + fn index(&self) -> &Index { + &self.inner.index + } + fn open_gen( // Todo use the `Vfs` trait here once we create a function for mmap store_vfs: &VfsImpl, @@ -737,37 +297,10 @@ nodemap_for_test: Option, ) -> Result { let index_path = index_path.as_ref(); - let index = { - match store_vfs.mmap_open_opt(index_path)? { - None => Index::new( - Box::>::default(), - options.index_header(), - ), - Some(index_mmap) => { - let index = Index::new( - Box::new(index_mmap), - options.index_header(), - )?; - Ok(index) - } - } - }?; + let index = open_index(store_vfs, index_path, options)?; let default_data_path = index_path.with_extension("d"); - - // type annotation required - // won't recognize Mmap as Deref - let data_bytes: Option + Send>> = - if index.is_inline() { - None - } else if index.is_empty() { - // No need to even try to open the data file then. 
- Some(Box::new(&[][..])) - } else { - let data_path = data_path.unwrap_or(&default_data_path); - let data_mmap = store_vfs.mmap_open(data_path)?; - Some(Box::new(data_mmap)) - }; + let data_path = data_path.unwrap_or(&default_data_path); let nodemap = if index.is_inline() || !options.use_nodemap { None @@ -785,20 +318,27 @@ let nodemap = nodemap_for_test.or(nodemap); Ok(Revlog { - index, - data_bytes, + inner: InnerRevlog::new( + Box::new(store_vfs.clone()), + index, + index_path.to_path_buf(), + data_path.to_path_buf(), + options.data_config, + options.delta_config, + options.feature_config, + ), nodemap, }) } /// Return number of entries of the `Revlog`. pub fn len(&self) -> usize { - self.index.len() + self.index().len() } /// Returns `true` if the `Revlog` has zero `entries`. pub fn is_empty(&self) -> bool { - self.index.is_empty() + self.index().is_empty() } /// Returns the node ID for the given revision number, if it exists in this @@ -807,8 +347,8 @@ if rev == NULL_REVISION.into() { return Some(&NULL_NODE); } - let rev = self.index.check_revision(rev)?; - Some(self.index.get_entry(rev)?.hash()) + let rev = self.index().check_revision(rev)?; + Some(self.index().get_entry(rev)?.hash()) } /// Return the revision number for the given node ID, if it exists in this @@ -819,56 +359,30 @@ ) -> Result { if let Some(nodemap) = &self.nodemap { nodemap - .find_bin(&self.index, node)? + .find_bin(self.index(), node)? .ok_or(RevlogError::InvalidRevision(format!("{:x}", node))) } else { - self.rev_from_node_no_persistent_nodemap(node) + self.index().rev_from_node_no_persistent_nodemap(node) } } - /// Same as `rev_from_node`, without using a persistent nodemap - /// - /// This is used as fallback when a persistent nodemap is not present. - /// This happens when the persistent-nodemap experimental feature is not - /// enabled, or for small revlogs. - fn rev_from_node_no_persistent_nodemap( - &self, - node: NodePrefix, - ) -> Result { - // Linear scan of the revlog - // TODO: consider building a non-persistent nodemap in memory to - // optimize these cases. - let mut found_by_prefix = None; - for rev in (-1..self.len() as BaseRevision).rev() { - let rev = Revision(rev as BaseRevision); - let candidate_node = if rev == Revision(-1) { - NULL_NODE - } else { - let index_entry = - self.index.get_entry(rev).ok_or_else(|| { - HgError::corrupted( - "revlog references a revision not in the index", - ) - })?; - *index_entry.hash() - }; - if node == candidate_node { - return Ok(rev); - } - if node.is_prefix_of(&candidate_node) { - if found_by_prefix.is_some() { - return Err(RevlogError::AmbiguousPrefix); - } - found_by_prefix = Some(rev) - } - } - found_by_prefix - .ok_or(RevlogError::InvalidRevision(format!("{:x}", node))) - } - /// Returns whether the given revision exists in this revlog. pub fn has_rev(&self, rev: UncheckedRevision) -> bool { - self.index.check_revision(rev).is_some() + self.index().check_revision(rev).is_some() + } + + pub fn get_entry_for_checked_rev( + &self, + rev: Revision, + ) -> Result { + self.inner.get_entry_for_checked_rev(rev) + } + + pub fn get_entry( + &self, + rev: UncheckedRevision, + ) -> Result { + self.inner.get_entry(rev) } /// Return the full data associated to a revision. 
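(A minimal sketch of how a pure-Rust caller would use the reshaped read API above. The store path, the exact import paths, and the `Revlog::open` signature are illustrative assumptions based on the surrounding hunks, not code this patch adds.)

```rust
use std::path::Path;

use hg::revlog::options::RevlogOpenOptions;
use hg::revlog::{NodePrefix, Revlog, RevlogError};
use hg::vfs::VfsImpl;

/// Hypothetical helper: resolve a (possibly abbreviated) node and return
/// the revision's fully restored content.
fn read_node(
    store_vfs: &VfsImpl,
    node: NodePrefix,
    options: RevlogOpenOptions, // from `default_revlog_options` in practice
) -> Result<Vec<u8>, RevlogError> {
    let revlog = Revlog::open(
        store_vfs,
        Path::new("00changelog.i"),
        None, // let the data path default to "00changelog.d"
        options,
    )?;
    // Uses the persistent nodemap when present, a linear index scan
    // otherwise (the fallback now lives on the index itself).
    let rev = revlog.rev_from_node(node)?;
    // `data()` resolves the whole delta chain through the inner revlog.
    Ok(revlog.get_entry_for_checked_rev(rev)?.data()?.into_owned())
}
```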
@@ -905,174 +419,154 @@ expected: &[u8], data: &[u8], ) -> bool { - let e1 = self.index.get_entry(p1); - let h1 = match e1 { - Some(ref entry) => entry.hash(), - None => &NULL_NODE, - }; - let e2 = self.index.get_entry(p2); - let h2 = match e2 { - Some(ref entry) => entry.hash(), - None => &NULL_NODE, - }; - - hash(data, h1.as_bytes(), h2.as_bytes()) == expected + self.inner.check_hash(p1, p2, expected, data) } /// Build the full data of a revision out its snapshot /// and its deltas. - fn build_data_from_deltas( - snapshot: RevlogEntry, - deltas: &[RevlogEntry], - ) -> Result, HgError> { - let snapshot = snapshot.data_chunk()?; - let deltas = deltas - .iter() - .rev() - .map(RevlogEntry::data_chunk) - .collect::, _>>()?; - let patches: Vec<_> = - deltas.iter().map(|d| patch::PatchList::new(d)).collect(); - let patch = patch::fold_patch_lists(&patches); - Ok(patch.apply(&snapshot)) - } - - /// Return the revlog data. - fn data(&self) -> &[u8] { - match &self.data_bytes { - Some(data_bytes) => data_bytes, - None => panic!( - "forgot to load the data or trying to access inline data" - ), - } - } - - pub fn make_null_entry(&self) -> RevlogEntry { - RevlogEntry { - revlog: self, - rev: NULL_REVISION, - bytes: b"", - compressed_len: 0, - uncompressed_len: 0, - base_rev_or_base_of_delta_chain: None, - p1: NULL_REVISION, - p2: NULL_REVISION, - flags: NULL_REVLOG_ENTRY_FLAGS, - hash: NULL_NODE, - } - } - - fn get_entry_for_checked_rev( - &self, - rev: Revision, - ) -> Result { - if rev == NULL_REVISION { - return Ok(self.make_null_entry()); + fn build_data_from_deltas( + buffer: &mut dyn RevisionBuffer, + snapshot: &[u8], + deltas: &[impl AsRef<[u8]>], + ) -> Result<(), RevlogError> { + if deltas.is_empty() { + buffer.extend_from_slice(snapshot); + return Ok(()); } - let index_entry = self - .index - .get_entry(rev) - .ok_or(RevlogError::InvalidRevision(rev.to_string()))?; - let offset = index_entry.offset(); - let start = if self.index.is_inline() { - offset + ((rev.0 as usize + 1) * INDEX_ENTRY_SIZE) - } else { - offset - }; - let end = start + index_entry.compressed_len() as usize; - let data = if self.index.is_inline() { - self.index.data(start, end) - } else { - &self.data()[start..end] - }; - let base_rev = self - .index - .check_revision(index_entry.base_revision_or_base_of_delta_chain()) - .ok_or_else(|| { - RevlogError::corrupted(format!( - "base revision for rev {} is invalid", - rev - )) - })?; - let p1 = - self.index.check_revision(index_entry.p1()).ok_or_else(|| { - RevlogError::corrupted(format!( - "p1 for rev {} is invalid", - rev - )) - })?; - let p2 = - self.index.check_revision(index_entry.p2()).ok_or_else(|| { - RevlogError::corrupted(format!( - "p2 for rev {} is invalid", - rev - )) - })?; - let entry = RevlogEntry { - revlog: self, - rev, - bytes: data, - compressed_len: index_entry.compressed_len(), - uncompressed_len: index_entry.uncompressed_len(), - base_rev_or_base_of_delta_chain: if base_rev == rev { + let patches: Result, _> = deltas + .iter() + .map(|d| patch::PatchList::new(d.as_ref())) + .collect(); + let patch = patch::fold_patch_lists(&patches?); + patch.apply(buffer, snapshot); + Ok(()) + } +} + +type IndexData = Box + Send + Sync>; + +/// TODO We should check for version 5.14+ at runtime, but we either should +/// add the `nix` dependency to get it efficiently, or vendor the code to read +/// both of which are overkill for such a feature. If we need this dependency +/// for more things later, we'll use it here too. 
+#[cfg(target_os = "linux")] +fn can_advise_populate_read() -> bool { + true +} + +#[cfg(not(target_os = "linux"))] +fn can_advise_populate_read() -> bool { + false +} + +/// Call `madvise` on the mmap with `MADV_POPULATE_READ` in a separate thread +/// to populate the mmap in the background for a small perf improvement. +#[cfg(target_os = "linux")] +fn advise_populate_read_mmap(mmap: &memmap2::Mmap) { + const MADV_POPULATE_READ: i32 = 22; + + // This is fine because the mmap is still referenced for at least + // the duration of this function, and the kernel will reject any wrong + // address. + let ptr = mmap.as_ptr() as u64; + let len = mmap.len(); + + // Fire and forget. The `JoinHandle` returned by `spawn` is dropped right + // after the call, the thread is thus detached. We don't care about success + // or failure here. + std::thread::spawn(move || unsafe { + // mmap's pointer is always page-aligned on Linux. In the case of + // file-based mmap (which is our use-case), the length should be + // correct. If not, it's not a safety concern as the kernel will just + // ignore unmapped pages and return ENOMEM, which we will promptly + // ignore, because we don't care about any errors. + libc::madvise(ptr as *mut libc::c_void, len, MADV_POPULATE_READ); + }); +} + +#[cfg(not(target_os = "linux"))] +fn advise_populate_read_mmap(mmap: &memmap2::Mmap) {} + +/// Open the revlog [`Index`] at `index_path`, through the `store_vfs` and the +/// given `options`. This controls whether (and how) we `mmap` the index file, +/// and returns an empty buffer if the index does not exist on disk. +/// This is only used when doing pure-Rust work, in Python contexts this is +/// unused at the time of writing. +pub fn open_index( + store_vfs: &impl Vfs, + index_path: &Path, + options: RevlogOpenOptions, +) -> Result { + let buf: IndexData = match store_vfs.open_read(index_path) { + Ok(mut file) => { + let mut buf = if let Some(threshold) = + options.data_config.mmap_index_threshold + { + let size = store_vfs.file_size(&file)?; + if size >= threshold { + // TODO madvise populate read in a background thread + let mut mmap_options = MmapOptions::new(); + if !can_advise_populate_read() { + // Fall back to populating in the main thread if + // post-creation advice is not supported. + // Does nothing on platforms where it's not defined. + mmap_options.populate(); + } + // Safety is "enforced" by locks and assuming other + // processes are well-behaved. If any misbehaving or + // malicious process does touch the index, it could lead + // to corruption. This is somewhat inherent to file-based + // `mmap`, though some platforms have some ways of + // mitigating. + // TODO linux: set the immutable flag with `chattr(1)`? + let mmap = unsafe { mmap_options.map(&file) } + .when_reading_file(index_path)?; + + if can_advise_populate_read() { + advise_populate_read_mmap(&mmap); + } + + Some(Box::new(mmap) as IndexData) + } else { + None + } + } else { None - } else { - Some(base_rev) - }, - p1, - p2, - flags: index_entry.flags(), - hash: *index_entry.hash(), - }; - Ok(entry) - } + }; - /// Get an entry of the revlog. 
- pub fn get_entry( - &self, - rev: UncheckedRevision, - ) -> Result { - if rev == NULL_REVISION.into() { - return Ok(self.make_null_entry()); + if buf.is_none() { + let mut data = vec![]; + file.read_to_end(&mut data).when_reading_file(index_path)?; + buf = Some(Box::new(data) as IndexData); + } + buf.unwrap() } - let rev = self.index.check_revision(rev).ok_or_else(|| { - RevlogError::corrupted(format!("rev {} is invalid", rev)) - })?; - self.get_entry_for_checked_rev(rev) - } + Err(err) => match err { + HgError::IoError { error, context } => match error.kind() { + ErrorKind::NotFound => Box::>::default(), + _ => return Err(HgError::IoError { error, context }), + }, + e => return Err(e), + }, + }; + + let index = Index::new(buf, options.index_header())?; + Ok(index) } /// The revlog entry's bytes and the necessary informations to extract /// the entry's data. #[derive(Clone)] pub struct RevlogEntry<'revlog> { - revlog: &'revlog Revlog, + revlog: &'revlog InnerRevlog, rev: Revision, - bytes: &'revlog [u8], - compressed_len: u32, uncompressed_len: i32, - base_rev_or_base_of_delta_chain: Option, p1: Revision, p2: Revision, flags: u16, hash: Node, } -thread_local! { - // seems fine to [unwrap] here: this can only fail due to memory allocation - // failing, and it's normal for that to cause panic. - static ZSTD_DECODER : RefCell> = - RefCell::new(zstd::bulk::Decompressor::new().ok().unwrap()); -} - -fn zstd_decompress_to_buffer( - bytes: &[u8], - buf: &mut Vec, -) -> Result { - ZSTD_DECODER - .with(|decoder| decoder.borrow_mut().decompress_to_buffer(bytes, buf)) -} - impl<'revlog> RevlogEntry<'revlog> { pub fn revision(&self) -> Revision { self.rev @@ -1137,33 +631,47 @@ } /// The data for this entry, after resolving deltas if any. - pub fn rawdata(&self) -> Result, RevlogError> { - let mut entry = self.clone(); - let mut delta_chain = vec![]; + /// Non-Python callers should probably call [`Self::data`] instead. + fn rawdata( + &self, + stop_rev: Option<(Revision, &[u8])>, + with_buffer: G, + ) -> Result<(), RevlogError> + where + G: FnOnce( + usize, + &mut dyn FnMut( + &mut dyn RevisionBuffer, + ) -> Result<(), RevlogError>, + ) -> Result<(), RevlogError>, + { + let (delta_chain, stopped) = self + .revlog + .delta_chain(self.revision(), stop_rev.map(|(r, _)| r))?; + let target_size = + self.uncompressed_len().map(|raw_size| 4 * raw_size as u64); - // The meaning of `base_rev_or_base_of_delta_chain` depends on - // generaldelta. See the doc on `ENTRY_DELTA_BASE` in - // `mercurial/revlogutils/constants.py` and the code in - // [_chaininfo] and in [index_deltachain]. - let uses_generaldelta = self.revlog.index.uses_generaldelta(); - while let Some(base_rev) = entry.base_rev_or_base_of_delta_chain { - entry = if uses_generaldelta { - delta_chain.push(entry); - self.revlog.get_entry_for_checked_rev(base_rev)? - } else { - let base_rev = UncheckedRevision(entry.rev.0 - 1); - delta_chain.push(entry); - self.revlog.get_entry(base_rev)? - }; - } + let deltas = self.revlog.chunks(delta_chain, target_size)?; - let data = if delta_chain.is_empty() { - entry.data_chunk()? 
+ let (base_text, deltas) = if stopped { + ( + stop_rev.as_ref().expect("last revision should be cached").1, + &deltas[..], + ) } else { - Revlog::build_data_from_deltas(entry, &delta_chain)?.into() + let (buf, deltas) = deltas.split_at(1); + (buf[0].as_ref(), deltas) }; - Ok(data) + let size = self + .uncompressed_len() + .map(|l| l as usize) + .unwrap_or(base_text.len()); + with_buffer(size, &mut |buf| { + Revlog::build_data_from_deltas(buf, base_text, deltas)?; + Ok(()) + })?; + Ok(()) } fn check_data( @@ -1193,128 +701,35 @@ } pub fn data(&self) -> Result, RevlogError> { - let data = self.rawdata()?; + // TODO figure out if there is ever a need for `Cow` here anymore. + let mut data = CoreRevisionBuffer::new(); if self.rev == NULL_REVISION { - return Ok(data); + return Ok(data.finish().into()); } + self.rawdata(None, |size, f| { + // Pre-allocate the expected size (received from the index) + data.resize(size); + // Actually fill the buffer + f(&mut data)?; + Ok(()) + })?; if self.is_censored() { return Err(HgError::CensoredNodeError.into()); } - self.check_data(data) - } - - /// Extract the data contained in the entry. - /// This may be a delta. (See `is_delta`.) - fn data_chunk(&self) -> Result, HgError> { - if self.bytes.is_empty() { - return Ok(Cow::Borrowed(&[])); - } - match self.bytes[0] { - // Revision data is the entirety of the entry, including this - // header. - b'\0' => Ok(Cow::Borrowed(self.bytes)), - // Raw revision data follows. - b'u' => Ok(Cow::Borrowed(&self.bytes[1..])), - // zlib (RFC 1950) data. - b'x' => Ok(Cow::Owned(self.uncompressed_zlib_data()?)), - // zstd data. - b'\x28' => Ok(Cow::Owned(self.uncompressed_zstd_data()?)), - // A proper new format should have had a repo/store requirement. - format_type => Err(corrupted(format!( - "unknown compression header '{}'", - format_type - ))), - } - } - - fn uncompressed_zlib_data(&self) -> Result, HgError> { - let mut decoder = ZlibDecoder::new(self.bytes); - if self.is_delta() { - let mut buf = Vec::with_capacity(self.compressed_len as usize); - decoder - .read_to_end(&mut buf) - .map_err(|e| corrupted(e.to_string()))?; - Ok(buf) - } else { - let cap = self.uncompressed_len.max(0) as usize; - let mut buf = vec![0; cap]; - decoder - .read_exact(&mut buf) - .map_err(|e| corrupted(e.to_string()))?; - Ok(buf) - } + self.check_data(data.finish().into()) } - - fn uncompressed_zstd_data(&self) -> Result, HgError> { - let cap = self.uncompressed_len.max(0) as usize; - if self.is_delta() { - // [cap] is usually an over-estimate of the space needed because - // it's the length of delta-decoded data, but we're interested - // in the size of the delta. 
- // This means we have to [shrink_to_fit] to avoid holding on - // to a large chunk of memory, but it also means we must have a - // fallback branch, for the case when the delta is longer than - // the original data (surprisingly, this does happen in practice) - let mut buf = Vec::with_capacity(cap); - match zstd_decompress_to_buffer(self.bytes, &mut buf) { - Ok(_) => buf.shrink_to_fit(), - Err(_) => { - buf.clear(); - zstd::stream::copy_decode(self.bytes, &mut buf) - .map_err(|e| corrupted(e.to_string()))?; - } - }; - Ok(buf) - } else { - let mut buf = Vec::with_capacity(cap); - let len = zstd_decompress_to_buffer(self.bytes, &mut buf) - .map_err(|e| corrupted(e.to_string()))?; - if len != self.uncompressed_len as usize { - Err(corrupted("uncompressed length does not match")) - } else { - Ok(buf) - } - } - } - - /// Tell if the entry is a snapshot or a delta - /// (influences on decompression). - fn is_delta(&self) -> bool { - self.base_rev_or_base_of_delta_chain.is_some() - } -} - -/// Calculate the hash of a revision given its data and its parents. -fn hash( - data: &[u8], - p1_hash: &[u8], - p2_hash: &[u8], -) -> [u8; NODE_BYTES_LENGTH] { - let mut hasher = Sha1::new(); - let (a, b) = (p1_hash, p2_hash); - if a > b { - hasher.update(b); - hasher.update(a); - } else { - hasher.update(a); - hasher.update(b); - } - hasher.update(data); - *hasher.finalize().as_ref() } #[cfg(test)] mod tests { use super::*; - use crate::index::IndexEntryBuilder; + use crate::revlog::index::IndexEntryBuilder; use itertools::Itertools; #[test] fn test_empty() { let temp = tempfile::tempdir().unwrap(); - let vfs = VfsImpl { - base: temp.path().to_owned(), - }; + let vfs = VfsImpl::new(temp.path().to_owned(), false); std::fs::write(temp.path().join("foo.i"), b"").unwrap(); std::fs::write(temp.path().join("foo.d"), b"").unwrap(); let revlog = @@ -1336,9 +751,7 @@ #[test] fn test_inline() { let temp = tempfile::tempdir().unwrap(); - let vfs = VfsImpl { - base: temp.path().to_owned(), - }; + let vfs = VfsImpl::new(temp.path().to_owned(), false); let node0 = Node::from_hex("2ed2a3912a0b24502043eae84ee4b279c18b90dd") .unwrap(); let node1 = Node::from_hex("b004912a8510032a0350a74daa2803dadfb00e12") @@ -1405,9 +818,7 @@ #[test] fn test_nodemap() { let temp = tempfile::tempdir().unwrap(); - let vfs = VfsImpl { - base: temp.path().to_owned(), - }; + let vfs = VfsImpl::new(temp.path().to_owned(), false); // building a revlog with a forced Node starting with zeros // This is a corruption, but it does not preclude using the nodemap diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/options.rs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/revlog/options.rs Wed Nov 20 15:53:19 2024 +0100 @@ -0,0 +1,421 @@ +//! 
Helpers for the revlog config and opening options
+
+use std::collections::HashSet;
+
+use crate::{
+    config::{Config, ResourceProfileValue},
+    errors::HgError,
+    requirements::{
+        CHANGELOGV2_REQUIREMENT, GENERALDELTA_REQUIREMENT, NARROW_REQUIREMENT,
+        NODEMAP_REQUIREMENT, REVLOGV1_REQUIREMENT, REVLOGV2_REQUIREMENT,
+        SPARSEREVLOG_REQUIREMENT,
+    },
+};
+
+use super::{compression::CompressionConfig, RevlogType};
+
+const DEFAULT_CHUNK_CACHE_SIZE: u64 = 65536;
+const DEFAULT_SPARSE_READ_DENSITY_THRESHOLD: f64 = 0.50;
+const DEFAULT_SPARSE_READ_MIN_GAP_SIZE: u64 = 262144;
+
+/// The known revlog versions and their options
+#[derive(Debug, Copy, Clone, PartialEq)]
+pub enum RevlogVersionOptions {
+    V0,
+    V1 { general_delta: bool, inline: bool },
+    V2,
+    ChangelogV2 { compute_rank: bool },
+}
+
+/// Options to govern how a revlog should be opened, usually from the
+/// repository configuration or requirements.
+#[derive(Debug, Copy, Clone)]
+pub struct RevlogOpenOptions {
+    /// The revlog version, along with any option specific to this version
+    pub version: RevlogVersionOptions,
+    /// Whether the revlog uses a persistent nodemap.
+    pub use_nodemap: bool,
+    pub delta_config: RevlogDeltaConfig,
+    pub data_config: RevlogDataConfig,
+    pub feature_config: RevlogFeatureConfig,
+}
+
+#[cfg(test)]
+impl Default for RevlogOpenOptions {
+    fn default() -> Self {
+        Self {
+            version: RevlogVersionOptions::V1 {
+                general_delta: true,
+                inline: false,
+            },
+            use_nodemap: true,
+            data_config: Default::default(),
+            delta_config: Default::default(),
+            feature_config: Default::default(),
+        }
+    }
+}
+
+impl RevlogOpenOptions {
+    pub fn new(
+        inline: bool,
+        data_config: RevlogDataConfig,
+        delta_config: RevlogDeltaConfig,
+        feature_config: RevlogFeatureConfig,
+    ) -> Self {
+        Self {
+            version: RevlogVersionOptions::V1 {
+                general_delta: data_config.general_delta,
+                inline,
+            },
+            use_nodemap: false,
+            data_config,
+            delta_config,
+            feature_config,
+        }
+    }
+
+    pub fn index_header(&self) -> super::index::IndexHeader {
+        super::index::IndexHeader {
+            header_bytes: match self.version {
+                RevlogVersionOptions::V0 => [0, 0, 0, 0],
+                RevlogVersionOptions::V1 {
+                    general_delta,
+                    inline,
+                } => [
+                    0,
+                    if general_delta && inline {
+                        3
+                    } else if general_delta {
+                        2
+                    } else {
+                        u8::from(inline)
+                    },
+                    0,
+                    1,
+                ],
+                RevlogVersionOptions::V2 => 0xDEADu32.to_be_bytes(),
+                RevlogVersionOptions::ChangelogV2 { compute_rank: _ } => {
+                    0xD34Du32.to_be_bytes()
+                }
+            },
+        }
+    }
+}
+
+/// Technically only Linux 2.5.46+ has `MAP_POPULATE` and only `2.6.23` on
+/// private mappings, but if you're using such ancient Linux, you have other
+/// problems.
+#[cfg(target_os = "linux")]
+const fn can_populate_mmap() -> bool {
+    true
+}
+
+/// There is a way of populating mmaps for Windows, but it would need testing.
+#[cfg(not(target_os = "linux"))]
+const fn can_populate_mmap() -> bool {
+    false
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+/// Holds configuration values about how the revlog data is read
+pub struct RevlogDataConfig {
+    /// Should we try to open the "pending" version of the revlog
+    pub try_pending: bool,
+    /// Should we try to open the "split" version of the revlog
+    pub try_split: bool,
+    /// When True, `indexfile` should be opened with `checkambig=True` at
+    /// writing time, to avoid file stat ambiguity
+    pub check_ambig: bool,
+    /// If true, use mmap instead of reading to deal with large indexes
+    pub mmap_large_index: bool,
+    /// How much data is considered large
+    pub mmap_index_threshold: Option<u64>,
+    /// How much data to read and cache into the raw revlog data cache
+    pub chunk_cache_size: u64,
+    /// The size of the uncompressed cache compared to the largest revision
+    /// seen
+    pub uncompressed_cache_factor: Option<f64>,
+    /// The number of chunks cached
+    pub uncompressed_cache_count: Option<u64>,
+    /// Allow sparse reading of the revlog data
+    pub with_sparse_read: bool,
+    /// Minimal density of a sparse read chunk
+    pub sr_density_threshold: f64,
+    /// Minimal size of the data we skip when performing sparse reads
+    pub sr_min_gap_size: u64,
+    /// Whether deltas are encoded against arbitrary bases
+    pub general_delta: bool,
+}
+
+impl RevlogDataConfig {
+    pub fn new(
+        config: &Config,
+        requirements: &HashSet<String>,
+    ) -> Result<Self, HgError> {
+        let mut data_config = Self::default();
+        if let Some(chunk_cache_size) =
+            config.get_byte_size(b"format", b"chunkcachesize")?
+        {
+            data_config.chunk_cache_size = chunk_cache_size;
+        }
+
+        let memory_profile = config.get_resource_profile(Some("memory"));
+        if memory_profile.value >= ResourceProfileValue::Medium {
+            data_config.uncompressed_cache_count = Some(10_000);
+            data_config.uncompressed_cache_factor = Some(4.0);
+            if memory_profile.value >= ResourceProfileValue::High {
+                data_config.uncompressed_cache_factor = Some(10.0)
+            }
+        }
+
+        // Use mmap if requested, or by default if we can fully populate it
+        let mmap_index = config
+            .get_option_no_default(b"storage", b"revlog.mmap.index")?
+            .unwrap_or(can_populate_mmap());
+        if mmap_index {
+            if let Some(mmap_index_threshold) = config.get_byte_size(
+                b"storage",
+                b"revlog.mmap.index:size-threshold",
+            )? {
+                // Only mmap if above the requested size threshold
+                data_config.mmap_index_threshold = Some(mmap_index_threshold);
+            }
+        }
+
+        if let Some(mmap_index_threshold) = config
+            .get_byte_size(b"storage", b"revlog.mmap.index:size-threshold")?
+        {
+            data_config.mmap_index_threshold = Some(mmap_index_threshold);
+        }
+
+        let with_sparse_read =
+            config.get_bool(b"experimental", b"sparse-read")?;
+        if let Some(sr_density_threshold) = config
+            .get_f64(b"experimental", b"sparse-read.density-threshold")?
+        {
+            data_config.sr_density_threshold = sr_density_threshold;
+        }
+        data_config.with_sparse_read = with_sparse_read;
+        if let Some(sr_min_gap_size) = config
+            .get_byte_size(b"experimental", b"sparse-read.min-gap-size")?
+        {
+            data_config.sr_min_gap_size = sr_min_gap_size;
+        }
+
+        data_config.with_sparse_read =
+            requirements.contains(SPARSEREVLOG_REQUIREMENT);
+
+        Ok(data_config)
+    }
+}
+
+impl Default for RevlogDataConfig {
+    fn default() -> Self {
+        Self {
+            chunk_cache_size: DEFAULT_CHUNK_CACHE_SIZE,
+            sr_density_threshold: DEFAULT_SPARSE_READ_DENSITY_THRESHOLD,
+            sr_min_gap_size: DEFAULT_SPARSE_READ_MIN_GAP_SIZE,
+            try_pending: Default::default(),
+            try_split: Default::default(),
+            check_ambig: Default::default(),
+            mmap_large_index: Default::default(),
+            mmap_index_threshold: Default::default(),
+            uncompressed_cache_factor: Default::default(),
+            uncompressed_cache_count: Default::default(),
+            with_sparse_read: Default::default(),
+            general_delta: Default::default(),
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+/// Holds configuration values about how new deltas are computed.
+///
+/// Some attributes are duplicated from [`RevlogDataConfig`] to help having
+/// each object self contained.
+pub struct RevlogDeltaConfig {
+    /// Whether deltas can be encoded against arbitrary bases
+    pub general_delta: bool,
+    /// Allow sparse writing of the revlog data
+    pub sparse_revlog: bool,
+    /// Maximum length of a delta chain
+    pub max_chain_len: Option<u64>,
+    /// Maximum distance between a delta chain's start and end
+    pub max_deltachain_span: Option<u64>,
+    /// If `upper_bound_comp` is not None, this is the expected maximal
+    /// gain from compression for the data content
+    pub upper_bound_comp: Option<f64>,
+    /// Should we try a delta against both parents
+    pub delta_both_parents: bool,
+    /// Test delta base candidate groups by chunks of this maximal size
+    pub candidate_group_chunk_size: u64,
+    /// Should we display debug information about delta computation
+    pub debug_delta: bool,
+    /// Trust incoming deltas by default
+    pub lazy_delta: bool,
+    /// Trust the base of incoming deltas by default
+    pub lazy_delta_base: bool,
+}
+
+impl RevlogDeltaConfig {
+    pub fn new(
+        config: &Config,
+        requirements: &HashSet<String>,
+        revlog_type: RevlogType,
+    ) -> Result<Self, HgError> {
+        let mut delta_config = Self {
+            delta_both_parents: config
+                .get_option_no_default(
+                    b"storage",
+                    b"revlog.optimize-delta-parent-choice",
+                )?
+                .unwrap_or(true),
+            candidate_group_chunk_size: config
+                .get_u64(
+                    b"storage",
+                    b"revlog.delta-parent-search.candidate-group-chunk-size",
+                )?
+                .unwrap_or_default(),
+            ..Default::default()
+        };
+
+        delta_config.debug_delta =
+            config.get_bool(b"debug", b"revlog.debug-delta")?;
+
+        delta_config.general_delta =
+            requirements.contains(GENERALDELTA_REQUIREMENT);
+
+        let lazy_delta =
+            config.get_bool(b"storage", b"revlog.reuse-external-delta")?;
+
+        if revlog_type == RevlogType::Manifestlog {
+            // upper bound of what we expect from compression
+            // (real life value seems to be 3)
+            delta_config.upper_bound_comp = Some(3.0)
+        }
+
+        let mut lazy_delta_base = false;
+        if lazy_delta {
+            lazy_delta_base = match config.get_option_no_default(
+                b"storage",
+                b"revlog.reuse-external-delta-parent",
+            )? {
+                Some(base) => base,
+                None => config.get_bool(b"format", b"generaldelta")?,
+            };
+        }
+        delta_config.lazy_delta = lazy_delta;
+        delta_config.lazy_delta_base = lazy_delta_base;
+
+        delta_config.max_deltachain_span =
+            match config.get_i64(b"experimental", b"maxdeltachainspan")? {
+                Some(span) => {
+                    if span < 0 {
+                        None
+                    } else {
+                        Some(span as u64)
+                    }
+                }
+                None => None,
+            };
+
+        delta_config.sparse_revlog =
+            requirements.contains(SPARSEREVLOG_REQUIREMENT);
+
+        delta_config.max_chain_len =
+            config.get_byte_size_no_default(b"format", b"maxchainlen")?;
+
+        Ok(delta_config)
+    }
+}
+
+impl Default for RevlogDeltaConfig {
+    fn default() -> Self {
+        Self {
+            delta_both_parents: true,
+            lazy_delta: true,
+            general_delta: Default::default(),
+            sparse_revlog: Default::default(),
+            max_chain_len: Default::default(),
+            max_deltachain_span: Default::default(),
+            upper_bound_comp: Default::default(),
+            candidate_group_chunk_size: Default::default(),
+            debug_delta: Default::default(),
+            lazy_delta_base: Default::default(),
+        }
+    }
+}
+
+#[derive(Debug, Default, Clone, Copy, PartialEq)]
+/// Holds configuration values about the available revlog features
+pub struct RevlogFeatureConfig {
+    /// The compression engine and its options
+    pub compression_engine: CompressionConfig,
+    /// Can we use censor on this revlog
+    pub censorable: bool,
+    /// Does this revlog use the "side data" feature
+    pub has_side_data: bool,
+    /// Might remove this configuration once the rank computation has no
+    /// impact
+    pub compute_rank: bool,
+    /// Parent order is supposed to be semantically irrelevant, so we
+    /// normally re-sort parents to ensure that the first parent is non-null,
+    /// if there is a non-null parent at all.
+    /// filelog abuses the parent order as a flag to mark some instances of
+    /// meta-encoded files, so allow it to disable this behavior.
+    pub canonical_parent_order: bool,
+    /// Can ellipsis commit be used
+    pub enable_ellipsis: bool,
+}
+
+impl RevlogFeatureConfig {
+    pub fn new(
+        config: &Config,
+        requirements: &HashSet<String>,
+    ) -> Result<Self, HgError> {
+        Ok(Self {
+            compression_engine: CompressionConfig::new(config, requirements)?,
+            enable_ellipsis: requirements.contains(NARROW_REQUIREMENT),
+            ..Default::default()
+        })
+    }
+}
+
+/// Return the default options for a revlog of `revlog_type` according to the
+/// current config and requirements.
+pub fn default_revlog_options(
+    config: &Config,
+    requirements: &HashSet<String>,
+    revlog_type: RevlogType,
+) -> Result<RevlogOpenOptions, HgError> {
+    let is_changelog = revlog_type == RevlogType::Changelog;
+    let version =
+        if is_changelog && requirements.contains(CHANGELOGV2_REQUIREMENT) {
+            let compute_rank = config
+                .get_bool(b"experimental", b"changelog-v2.compute-rank")?;
+            RevlogVersionOptions::ChangelogV2 { compute_rank }
+        } else if requirements.contains(REVLOGV2_REQUIREMENT) {
+            RevlogVersionOptions::V2
+        } else if requirements.contains(REVLOGV1_REQUIREMENT) {
+            RevlogVersionOptions::V1 {
+                general_delta: requirements.contains(GENERALDELTA_REQUIREMENT),
+                inline: !is_changelog,
+            }
+        } else {
+            RevlogVersionOptions::V0
+        };
+    Ok(RevlogOpenOptions {
+        version,
+        // We don't need to dance around the slow path like in the Python
+        // implementation since we know we have access to the fast code.
+ use_nodemap: requirements.contains(NODEMAP_REQUIREMENT), + delta_config: RevlogDeltaConfig::new( + config, + requirements, + revlog_type, + )?, + data_config: RevlogDataConfig::new(config, requirements)?, + feature_config: RevlogFeatureConfig::new(config, requirements)?, + }) +} diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/revlog/patch.rs --- a/rust/hg-core/src/revlog/patch.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/src/revlog/patch.rs Wed Nov 20 15:53:19 2024 +0100 @@ -1,5 +1,9 @@ use byteorder::{BigEndian, ByteOrder}; +use crate::revlog::RevlogError; + +use super::inner_revlog::RevisionBuffer; + /// A chunk of data to insert, delete or replace in a patch /// /// A chunk is: @@ -61,14 +65,16 @@ impl<'a> PatchList<'a> { /// Create a `PatchList` from bytes. - pub fn new(data: &'a [u8]) -> Self { + pub fn new(data: &'a [u8]) -> Result { let mut chunks = vec![]; let mut data = data; while !data.is_empty() { let start = BigEndian::read_u32(&data[0..]); let end = BigEndian::read_u32(&data[4..]); let len = BigEndian::read_u32(&data[8..]); - assert!(start <= end); + if start > end { + return Err(RevlogError::corrupted("patch cannot be decoded")); + } chunks.push(Chunk { start, end, @@ -76,29 +82,23 @@ }); data = &data[12 + (len as usize)..]; } - PatchList { chunks } - } - - /// Return the final length of data after patching - /// given its initial length . - fn size(&self, initial_size: i32) -> i32 { - self.chunks - .iter() - .fold(initial_size, |acc, chunk| acc + chunk.len_diff()) + Ok(PatchList { chunks }) } /// Apply the patch to some data. - pub fn apply(&self, initial: &[u8]) -> Vec { + pub fn apply( + &self, + buffer: &mut dyn RevisionBuffer, + initial: &[u8], + ) { let mut last: usize = 0; - let mut vec = - Vec::with_capacity(self.size(initial.len() as i32) as usize); for Chunk { start, end, data } in self.chunks.iter() { - vec.extend(&initial[last..(*start as usize)]); - vec.extend(data.iter()); + let slice = &initial[last..(*start as usize)]; + buffer.extend_from_slice(slice); + buffer.extend_from_slice(data); last = *end as usize; } - vec.extend(&initial[last..]); - vec + buffer.extend_from_slice(&initial[last..]); } /// Combine two patch lists into a single patch list. 
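(A worked example of the chunk format `PatchList::new` parses above: each chunk is a 12-byte big-endian header of start, end, and data length, followed by the replacement data. `CoreRevisionBuffer` is the buffer type the updated tests below use; the module paths are assumptions.)

```rust
use hg::revlog::inner_revlog::CoreRevisionBuffer;
use hg::revlog::patch::PatchList;

fn main() {
    // One chunk: replace base[1..3] with two new bytes.
    let mut raw = Vec::new();
    raw.extend_from_slice(&1u32.to_be_bytes()); // start
    raw.extend_from_slice(&3u32.to_be_bytes()); // end
    raw.extend_from_slice(&2u32.to_be_bytes()); // data length
    raw.extend_from_slice(&[0xAA, 0xBB]); // replacement data

    // A header with start > end now surfaces as a corruption error
    // instead of an assertion panic.
    let patch = PatchList::new(&raw).unwrap();

    let base = [0u8, 1, 2, 3];
    let mut buffer = CoreRevisionBuffer::new();
    patch.apply(&mut buffer, &base);
    assert_eq!(buffer.finish(), vec![0u8, 0xAA, 0xBB, 3]);
}
```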
@@ -229,6 +229,8 @@ #[cfg(test)] mod tests { + use crate::revlog::inner_revlog::CoreRevisionBuffer; + use super::*; struct PatchDataBuilder { @@ -264,17 +266,18 @@ let data = vec![0u8, 0u8, 0u8]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(0, 1, &[1, 2]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(2, 4, &[3, 4]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![1u8, 2, 3, 4]); + assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]); } #[test] @@ -282,17 +285,18 @@ let data = vec![0u8, 0u8, 0u8]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(2, 3, &[3]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(1, 2, &[1, 2]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![0u8, 1, 2, 3]); + assert_eq!(buffer.finish(), vec![0u8, 1, 2, 3]); } #[test] @@ -300,17 +304,18 @@ let data = vec![0u8, 0, 0]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(1, 2, &[3, 4]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(1, 4, &[1, 2, 3]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![0u8, 1, 2, 3]); + assert_eq!(buffer.finish(), vec![0u8, 1, 2, 3]); } #[test] @@ -318,17 +323,18 @@ let data = vec![0u8, 0, 0]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(0, 1, &[1, 3]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(1, 4, &[2, 3, 4]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![1u8, 2, 3, 4]); + assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]); } #[test] @@ -336,17 +342,18 @@ let data = vec![0u8, 0, 0]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(1, 3, &[1, 3, 4]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(0, 2, &[1, 2]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = 
CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![1u8, 2, 3, 4]); + assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]); } #[test] @@ -354,16 +361,17 @@ let data = vec![0u8, 0, 0]; let mut patch1_data = PatchDataBuilder::new(); patch1_data.replace(0, 3, &[1, 3, 3, 4]); - let mut patch1 = PatchList::new(patch1_data.get()); + let mut patch1 = PatchList::new(patch1_data.get()).unwrap(); let mut patch2_data = PatchDataBuilder::new(); patch2_data.replace(1, 3, &[2, 3]); - let mut patch2 = PatchList::new(patch2_data.get()); + let mut patch2 = PatchList::new(patch2_data.get()).unwrap(); let patch = patch1.combine(&mut patch2); - let result = patch.apply(&data); + let mut buffer = CoreRevisionBuffer::new(); + patch.apply(&mut buffer, &data); - assert_eq!(result, vec![1u8, 2, 3, 4]); + assert_eq!(buffer.finish(), vec![1u8, 2, 3, 4]); } } diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/transaction.rs --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rust/hg-core/src/transaction.rs Wed Nov 20 15:53:19 2024 +0100 @@ -0,0 +1,11 @@ +use std::path::Path; + +/// The Mercurial transaction system is based on the append-only nature +/// of its core files. This exposes the necessary methods to safely write to +/// the different core datastructures. +pub trait Transaction { + /// Record the state of an append-only file before update + fn add(&mut self, file: impl AsRef, offset: usize); + + // TODO the rest of the methods once we do more in Rust. +} diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/update.rs --- a/rust/hg-core/src/update.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/src/update.rs Wed Nov 20 15:53:19 2024 +0100 @@ -15,13 +15,14 @@ dirstate_map::DirstateEntryReset, on_disk::write_tracked_key, }, errors::{HgError, IoResultExt}, - exit_codes, - filelog::Filelog, - narrow, - node::NULL_NODE, + exit_codes, narrow, operations::{list_rev_tracked_files, ExpandedManifestEntry}, progress::Progress, repo::Repo, + revlog::filelog::Filelog, + revlog::node::NULL_NODE, + revlog::options::{default_revlog_options, RevlogOpenOptions}, + revlog::RevlogError, sparse, utils::{ cap_default_rayon_threads, @@ -30,8 +31,7 @@ path_auditor::PathAuditor, }, vfs::{is_on_nfs_mount, VfsImpl}, - DirstateParents, RevlogError, RevlogOpenOptions, UncheckedRevision, - INTERRUPT_RECEIVED, + DirstateParents, UncheckedRevision, INTERRUPT_RECEIVED, }; use crossbeam_channel::{Receiver, Sender}; use rayon::prelude::*; @@ -93,7 +93,11 @@ return Ok(0); } let store_vfs = &repo.store_vfs(); - let options = repo.default_revlog_options(crate::RevlogType::Filelog)?; + let options = default_revlog_options( + repo.config(), + repo.requirements(), + crate::revlog::RevlogType::Filelog, + )?; let (errors_sender, errors_receiver) = crossbeam_channel::unbounded(); let (files_sender, files_receiver) = crossbeam_channel::unbounded(); let working_directory_path = &repo.working_directory_path(); @@ -149,7 +153,7 @@ fn handle_revlog_error(e: RevlogError) -> HgError { match e { - crate::RevlogError::Other(hg_error) => hg_error, + crate::revlog::RevlogError::Other(hg_error) => hg_error, e => HgError::abort( format!("revlog error: {}", e), exit_codes::ABORT, diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-core/src/vfs.rs --- a/rust/hg-core/src/vfs.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-core/src/vfs.rs Wed Nov 20 15:53:19 2024 +0100 @@ -1,21 +1,75 @@ -use crate::errors::{HgError, IoErrorContext, IoResultExt}; +use crate::errors::{HgError, HgResultExt, IoErrorContext, IoResultExt}; use 
crate::exit_codes; +use crate::fncache::FnCache; +use crate::revlog::path_encode::path_encode; +use crate::utils::files::{get_bytes_from_path, get_path_from_bytes}; use dyn_clone::DynClone; +use format_bytes::format_bytes; use memmap2::{Mmap, MmapOptions}; -use std::fs::File; -use std::io::{ErrorKind, Write}; -use std::os::unix::fs::MetadataExt; +use rand::distributions::{Alphanumeric, DistString}; +use std::fs::{File, Metadata, OpenOptions}; +use std::io::{ErrorKind, Read, Seek, Write}; +use std::os::fd::AsRawFd; +use std::os::unix::fs::{MetadataExt, PermissionsExt}; use std::path::{Path, PathBuf}; +#[cfg(test)] +use std::sync::atomic::AtomicUsize; +#[cfg(test)] +use std::sync::atomic::Ordering; +use std::sync::OnceLock; /// Filesystem access abstraction for the contents of a given "base" diretory #[derive(Clone)] pub struct VfsImpl { pub(crate) base: PathBuf, + pub readonly: bool, + pub mode: Option, } struct FileNotFound(std::io::Error, PathBuf); +/// Store the umask for the whole process since it's expensive to get. +static UMASK: OnceLock = OnceLock::new(); + +fn get_umask() -> u32 { + *UMASK.get_or_init(|| unsafe { + // TODO is there any way of getting the umask without temporarily + // setting it? Doesn't this affect all threads in this tiny window? + let mask = libc::umask(0); + libc::umask(mask); + mask & 0o777 + }) +} + +/// Return the (unix) mode with which we will create/fix files +fn get_mode(base: impl AsRef) -> Option { + match base.as_ref().metadata() { + Ok(meta) => { + // files in .hg/ will be created using this mode + let mode = meta.mode(); + // avoid some useless chmods + if (0o777 & !get_umask()) == (0o777 & mode) { + None + } else { + Some(mode) + } + } + Err(_) => None, + } +} + impl VfsImpl { + pub fn new(base: PathBuf, readonly: bool) -> Self { + let mode = get_mode(&base); + Self { + base, + readonly, + mode, + } + } + + // XXX these methods are probably redundant with VFS trait? + pub fn join(&self, relative_path: impl AsRef) -> PathBuf { self.base.join(relative_path) } @@ -103,26 +157,6 @@ } } - pub fn rename( - &self, - relative_from: impl AsRef, - relative_to: impl AsRef, - ) -> Result<(), HgError> { - let from = self.join(relative_from); - let to = self.join(relative_to); - std::fs::rename(&from, &to) - .with_context(|| IoErrorContext::RenamingFile { from, to }) - } - - pub fn remove_file( - &self, - relative_path: impl AsRef, - ) -> Result<(), HgError> { - let path = self.join(relative_path); - std::fs::remove_file(&path) - .with_context(|| IoErrorContext::RemovingFile(path)) - } - #[cfg(unix)] pub fn create_symlink( &self, @@ -160,7 +194,7 @@ path: impl AsRef, ) -> Result, HgError> { let path = path.as_ref(); - match std::fs::metadata(path) { + match path.metadata() { Ok(meta) => Ok(Some(meta)), Err(error) => match error.kind() { // TODO: when we require a Rust version where `NotADirectory` is @@ -174,6 +208,188 @@ } } +/// Abstraction over the files handled by a [`Vfs`]. +#[derive(Debug)] +pub enum VfsFile { + Atomic(AtomicFile), + + Normal { + file: File, + path: PathBuf, + /// If `Some`, check (and maybe fix) this file's timestamp ambiguity. + /// See [`is_filetime_ambiguous`]. 
+ check_ambig: Option, + }, +} + +impl VfsFile { + pub fn normal(file: File, path: PathBuf) -> Self { + Self::Normal { + file, + check_ambig: None, + path, + } + } + pub fn normal_check_ambig( + file: File, + path: PathBuf, + ) -> Result { + Ok(Self::Normal { + file, + check_ambig: Some(path.metadata().when_reading_file(&path)?), + path, + }) + } + pub fn try_clone(&self) -> Result { + Ok(match self { + VfsFile::Atomic(AtomicFile { + fp, + temp_path, + check_ambig, + target_name, + is_open, + }) => Self::Atomic(AtomicFile { + fp: fp.try_clone().when_reading_file(temp_path)?, + temp_path: temp_path.clone(), + check_ambig: *check_ambig, + target_name: target_name.clone(), + is_open: *is_open, + }), + VfsFile::Normal { + file, + check_ambig, + path, + } => Self::Normal { + file: file.try_clone().when_reading_file(path)?, + check_ambig: check_ambig.clone(), + path: path.to_owned(), + }, + }) + } + pub fn set_len(&self, len: u64) -> Result<(), std::io::Error> { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.set_len(len), + VfsFile::Normal { file, .. } => file.set_len(len), + } + } + + pub fn metadata(&self) -> Result { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.metadata(), + VfsFile::Normal { file, .. } => file.metadata(), + } + } +} + +impl AsRawFd for VfsFile { + fn as_raw_fd(&self) -> std::os::unix::prelude::RawFd { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.as_raw_fd(), + VfsFile::Normal { file, .. } => file.as_raw_fd(), + } + } +} + +impl Seek for VfsFile { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.seek(pos), + VfsFile::Normal { file, .. } => file.seek(pos), + } + } +} + +impl Read for VfsFile { + fn read(&mut self, buf: &mut [u8]) -> std::io::Result { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.read(buf), + VfsFile::Normal { file, .. } => file.read(buf), + } + } +} + +impl Write for VfsFile { + fn write(&mut self, buf: &[u8]) -> std::io::Result { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.write(buf), + VfsFile::Normal { file, .. } => file.write(buf), + } + } + + fn flush(&mut self) -> std::io::Result<()> { + match self { + VfsFile::Atomic(atomic_file) => atomic_file.fp.flush(), + VfsFile::Normal { file, .. } => file.flush(), + } + } +} + +impl Drop for VfsFile { + fn drop(&mut self) { + if let VfsFile::Normal { + path, + check_ambig: Some(old), + .. + } = self + { + avoid_timestamp_ambiguity(path, old) + } + } +} + +/// Records the number of times we've fixed a timestamp ambiguity, only +/// applicable for tests. 
+#[cfg(test)] +static TIMESTAMP_FIXES_CALLS: AtomicUsize = AtomicUsize::new(0); + +fn avoid_timestamp_ambiguity(path: &Path, old: &Metadata) { + if let Ok(new) = path.metadata() { + let is_ambiguous = is_filetime_ambiguous(&new, old); + if is_ambiguous { + let advanced = + filetime::FileTime::from_unix_time(old.mtime() + 1, 0); + if filetime::set_file_times(path, advanced, advanced).is_ok() { + #[cfg(test)] + { + TIMESTAMP_FIXES_CALLS.fetch_add(1, Ordering::Relaxed); + } + } + } + } +} + +/// Examine whether new stat is ambiguous against old one +/// +/// "S[N]" below means stat of a file at N-th change: +/// +/// - S[n-1].ctime < S[n].ctime: can detect change of a file +/// - S[n-1].ctime == S[n].ctime +/// - S[n-1].ctime < S[n].mtime: means natural advancing (*1) +/// - S[n-1].ctime == S[n].mtime: is ambiguous (*2) +/// - S[n-1].ctime > S[n].mtime: never occurs naturally (don't care) +/// - S[n-1].ctime > S[n].ctime: never occurs naturally (don't care) +/// +/// Case (*2) above means that a file was changed twice or more at +/// same time in sec (= S[n-1].ctime), and comparison of timestamp +/// is ambiguous. +/// +/// Base idea to avoid such ambiguity is "advance mtime 1 sec, if +/// timestamp is ambiguous". +/// +/// But advancing mtime only in case (*2) doesn't work as +/// expected, because naturally advanced S[n].mtime in case (*1) +/// might be equal to manually advanced S[n-1 or earlier].mtime. +/// +/// Therefore, all "S[n-1].ctime == S[n].ctime" cases should be +/// treated as ambiguous regardless of mtime, to avoid overlooking +/// by confliction between such mtime. +/// +/// Advancing mtime "if isambig(new, old)" ensures "S[n-1].mtime != +/// S[n].mtime", even if size of a file isn't changed. +fn is_filetime_ambiguous(new: &Metadata, old: &Metadata) -> bool { + new.ctime() == old.ctime() +} + /// Writable file object that atomically updates a file /// /// All writes will go to a temporary copy of the original file. Call @@ -181,6 +397,7 @@ /// the temporary copy to the original name, making the changes /// visible. If the object is destroyed without being closed, all your /// writes are discarded. 
+#[derive(Debug)] pub struct AtomicFile { /// The temporary file to write to fp: std::fs::File, @@ -197,6 +414,48 @@ impl AtomicFile { pub fn new( + target_path: impl AsRef, + empty: bool, + check_ambig: bool, + ) -> Result { + let target_path = target_path.as_ref().to_owned(); + + let random_id = + Alphanumeric.sample_string(&mut rand::thread_rng(), 12); + let filename = + target_path.file_name().expect("target has no filename"); + let filename = get_bytes_from_path(filename); + let temp_filename = + format_bytes!(b".{}-{}~", filename, random_id.as_bytes()); + let temp_path = + target_path.with_file_name(get_path_from_bytes(&temp_filename)); + + if !empty { + std::fs::copy(&target_path, &temp_path) + .with_context(|| IoErrorContext::CopyingFile { + from: target_path.to_owned(), + to: temp_path.to_owned(), + }) + // If it doesn't exist, create it on open + .io_not_found_as_none()?; + } + let fp = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(empty) + .open(&temp_path) + .when_writing_file(&temp_path)?; + + Ok(Self { + fp, + temp_path, + check_ambig, + target_name: target_path, + is_open: true, + }) + } + + pub fn from_file( fp: std::fs::File, check_ambig: bool, temp_name: PathBuf, @@ -228,31 +487,30 @@ self.fp.flush()?; let target = self.target(); if self.check_ambig { - if let Ok(stat) = std::fs::metadata(&target) { + if let Ok(stat) = target.metadata() { std::fs::rename(&self.temp_path, &target)?; - let new_stat = std::fs::metadata(&target)?; - let ctime = new_stat.ctime(); - let is_ambiguous = ctime == stat.ctime(); - if is_ambiguous { - let advanced = - filetime::FileTime::from_unix_time(ctime + 1, 0); - filetime::set_file_times(target, advanced, advanced)?; - } + avoid_timestamp_ambiguity(&target, &stat); } else { std::fs::rename(&self.temp_path, target)?; } } else { - std::fs::rename(&self.temp_path, target).unwrap(); + std::fs::rename(&self.temp_path, target)?; } self.is_open = false; Ok(()) } } +impl Seek for AtomicFile { + fn seek(&mut self, pos: std::io::SeekFrom) -> std::io::Result { + self.fp.seek(pos) + } +} + impl Drop for AtomicFile { fn drop(&mut self) { if self.is_open { - std::fs::remove_file(self.target()).ok(); + std::fs::remove_file(&self.temp_path).ok(); } } } @@ -260,58 +518,178 @@ /// Abstracts over the VFS to allow for different implementations of the /// filesystem layer (like passing one from Python). pub trait Vfs: Sync + Send + DynClone { - fn open(&self, filename: &Path) -> Result; - fn open_read(&self, filename: &Path) -> Result; - fn open_check_ambig( + // TODO make `open` readonly and make `open_read` an `open_write` + /// Open a [`VfsFile::Normal`] for writing and reading the file at + /// `filename`, relative to this VFS's root. + fn open(&self, filename: &Path) -> Result; + /// Open a [`VfsFile::Normal`] for reading the file at `filename`, + /// relative to this VFS's root. + fn open_read(&self, filename: &Path) -> Result; + /// Open a [`VfsFile::Normal`] for reading and writing the file at + /// `filename`, relative to this VFS's root. This file will be checked + /// for an ambiguous mtime on [`drop`]. See [`is_filetime_ambiguous`]. + fn open_check_ambig(&self, filename: &Path) -> Result; + /// Create a [`VfsFile::Normal`] for reading and writing the file at + /// `filename`, relative to this VFS's root. If the file already exists, + /// it will be truncated to 0 bytes. 
+ fn create( &self, filename: &Path, - ) -> Result; - fn create(&self, filename: &Path) -> Result; - /// Must truncate the new file if exist + check_ambig: bool, + ) -> Result; + /// Create a [`VfsFile::Atomic`] for reading and writing the file at + /// `filename`, relative to this VFS's root. If the file already exists, + /// it will be truncated to 0 bytes. fn create_atomic( &self, filename: &Path, check_ambig: bool, - ) -> Result; - fn file_size(&self, file: &File) -> Result; + ) -> Result; + /// Return the total file size in bytes of the open `file`. Errors are + /// usual IO errors (invalid file handle, permissions, etc.) + fn file_size(&self, file: &VfsFile) -> Result; + /// Return `true` if `filename` exists relative to this VFS's root. Errors + /// will coerce to `false`, to this also returns `false` if there are + /// IO problems. This is fine because any operation that actually tries + /// to do anything with this path will get the same error. fn exists(&self, filename: &Path) -> bool; + /// Remove the file at `filename` relative to this VFS's root. Errors + /// are the usual IO errors (lacking permission, file does not exist, etc.) fn unlink(&self, filename: &Path) -> Result<(), HgError>; + /// Rename the file `from` to `to`, both relative to this VFS's root. + /// Errors are the usual IO errors (lacking permission, file does not + /// exist, etc.). If `check_ambig` is `true`, the VFS will check for an + /// ambiguous mtime on rename. See [`is_filetime_ambiguous`]. fn rename( &self, from: &Path, to: &Path, check_ambig: bool, ) -> Result<(), HgError>; + /// Rename the file `from` to `to`, both relative to this VFS's root. + /// Errors are the usual IO errors (lacking permission, file does not + /// exist, etc.). If `check_ambig` is passed, the VFS will check for an + /// ambiguous mtime on rename. See [`is_filetime_ambiguous`]. fn copy(&self, from: &Path, to: &Path) -> Result<(), HgError>; + /// Returns the absolute root path of this VFS, relative to which all + /// operations are done. + fn base(&self) -> &Path; } /// These methods will need to be implemented once `rhg` (and other) non-Python /// users of `hg-core` start doing more on their own, like writing to files. 
impl Vfs for VfsImpl { - fn open(&self, _filename: &Path) -> Result { - todo!() + fn open(&self, filename: &Path) -> Result { + if self.readonly { + return Err(HgError::abort( + "write access in a readonly vfs", + exit_codes::ABORT, + None, + )); + } + // TODO auditpath + let path = self.base.join(filename); + copy_in_place_if_hardlink(&path)?; + + Ok(VfsFile::normal( + OpenOptions::new() + .create(false) + .create_new(false) + .write(true) + .read(true) + .open(&path) + .when_writing_file(&path)?, + path.to_owned(), + )) } - fn open_read(&self, filename: &Path) -> Result { + + fn open_read(&self, filename: &Path) -> Result { + // TODO auditpath let path = self.base.join(filename); - std::fs::File::open(&path).when_reading_file(&path) + Ok(VfsFile::normal( + std::fs::File::open(&path).when_reading_file(&path)?, + filename.to_owned(), + )) } - fn open_check_ambig( + + fn open_check_ambig(&self, filename: &Path) -> Result { + if self.readonly { + return Err(HgError::abort( + "write access in a readonly vfs", + exit_codes::ABORT, + None, + )); + } + + let path = self.base.join(filename); + copy_in_place_if_hardlink(&path)?; + + // TODO auditpath + VfsFile::normal_check_ambig( + OpenOptions::new() + .write(true) + .read(true) // Can be used for reading to save on `open` calls + .create(false) + .open(&path) + .when_reading_file(&path)?, + path.to_owned(), + ) + } + + fn create( &self, - _filename: &Path, - ) -> Result { - todo!() + filename: &Path, + check_ambig: bool, + ) -> Result { + if self.readonly { + return Err(HgError::abort( + "write access in a readonly vfs", + exit_codes::ABORT, + None, + )); + } + // TODO auditpath + let path = self.base.join(filename); + let parent = path.parent().expect("file at root"); + std::fs::create_dir_all(parent).when_writing_file(parent)?; + + let file = OpenOptions::new() + .create(true) + .truncate(true) + .write(true) + .read(true) + .open(&path) + .when_writing_file(&path)?; + + if let Some(mode) = self.mode { + // Creating the file with the right permission (with `.mode()`) + // may not work since umask takes effect for file creation. + // So we need to fix the permission after creating the file. + fix_directory_permissions(&self.base, &path, mode)?; + let perm = std::fs::Permissions::from_mode(mode & 0o666); + std::fs::set_permissions(&path, perm).when_writing_file(&path)?; + } + + Ok(VfsFile::Normal { + file, + check_ambig: if check_ambig { + Some(path.metadata().when_reading_file(&path)?) + } else { + None + }, + path: path.to_owned(), + }) } - fn create(&self, _filename: &Path) -> Result { - todo!() - } + fn create_atomic( &self, _filename: &Path, _check_ambig: bool, - ) -> Result { + ) -> Result { todo!() } - fn file_size(&self, file: &File) -> Result { + + fn file_size(&self, file: &VfsFile) -> Result { Ok(file .metadata() .map_err(|e| { @@ -323,25 +701,268 @@ })? 
.size()) } - fn exists(&self, _filename: &Path) -> bool { - todo!() + + fn exists(&self, filename: &Path) -> bool { + self.base.join(filename).exists() } - fn unlink(&self, _filename: &Path) -> Result<(), HgError> { - todo!() + + fn unlink(&self, filename: &Path) -> Result<(), HgError> { + if self.readonly { + return Err(HgError::abort( + "write access in a readonly vfs", + exit_codes::ABORT, + None, + )); + } + let path = self.base.join(filename); + std::fs::remove_file(&path) + .with_context(|| IoErrorContext::RemovingFile(path)) } + fn rename( &self, - _from: &Path, - _to: &Path, - _check_ambig: bool, + from: &Path, + to: &Path, + check_ambig: bool, + ) -> Result<(), HgError> { + if self.readonly { + return Err(HgError::abort( + "write access in a readonly vfs", + exit_codes::ABORT, + None, + )); + } + let old_stat = if check_ambig { + Some( + from.metadata() + .when_reading_file(from) + .io_not_found_as_none()?, + ) + } else { + None + }; + let from = self.base.join(from); + let to = self.base.join(to); + std::fs::rename(&from, &to).with_context(|| { + IoErrorContext::RenamingFile { + from, + to: to.to_owned(), + } + })?; + if let Some(Some(old)) = old_stat { + avoid_timestamp_ambiguity(&to, &old); + } + Ok(()) + } + + fn copy(&self, from: &Path, to: &Path) -> Result<(), HgError> { + let from = self.base.join(from); + let to = self.base.join(to); + std::fs::copy(&from, &to) + .with_context(|| IoErrorContext::CopyingFile { from, to }) + .map(|_| ()) + } + + fn base(&self) -> &Path { + &self.base + } +} + +fn fix_directory_permissions( + base: &Path, + path: &Path, + mode: u32, +) -> Result<(), HgError> { + let mut ancestors = path.ancestors(); + ancestors.next(); // yields the path itself + + for ancestor in ancestors { + if ancestor == base { + break; + } + let perm = std::fs::Permissions::from_mode(mode); + std::fs::set_permissions(ancestor, perm) + .when_writing_file(ancestor)?; + } + Ok(()) +} + +/// A VFS that understands the `fncache` store layout (file encoding), and +/// adds new entries to the `fncache`. +/// TODO Only works when used from Python for now. +pub struct FnCacheVfs { + inner: VfsImpl, + fncache: Box<dyn FnCache>, +} + +impl Clone for FnCacheVfs { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + fncache: dyn_clone::clone_box(&*self.fncache), + } + } +} + +impl FnCacheVfs { + pub fn new( + base: PathBuf, + readonly: bool, + fncache: Box<dyn FnCache>, + ) -> Self { + let inner = VfsImpl::new(base, readonly); + Self { inner, fncache } + } + + fn maybe_add_to_fncache( + &self, + filename: &Path, + encoded_path: &Path, ) -> Result<(), HgError> { - todo!() + let relevant_file = (filename.starts_with("data/") + || filename.starts_with("meta/")) + && is_revlog_file(filename); + if relevant_file { + let not_load = !self.fncache.is_loaded() + && (self.exists(filename) + && self + .inner + .join(encoded_path) + .metadata() + .when_reading_file(encoded_path)?
+ .size() + != 0); + if !not_load { + self.fncache.add(filename); + } + }; + Ok(()) + } +} + +impl Vfs for FnCacheVfs { + fn open(&self, filename: &Path) -> Result<VfsFile, HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let encoded_path = get_path_from_bytes(&encoded); + self.maybe_add_to_fncache(filename, encoded_path)?; + self.inner.open(encoded_path) + } + + fn open_read(&self, filename: &Path) -> Result<VfsFile, HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let filename = get_path_from_bytes(&encoded); + self.inner.open_read(filename) + } + + fn open_check_ambig(&self, filename: &Path) -> Result<VfsFile, HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let filename = get_path_from_bytes(&encoded); + self.inner.open_check_ambig(filename) + } + + fn create( + &self, + filename: &Path, + check_ambig: bool, + ) -> Result<VfsFile, HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let encoded_path = get_path_from_bytes(&encoded); + self.maybe_add_to_fncache(filename, encoded_path)?; + self.inner.create(encoded_path, check_ambig) + } + + fn create_atomic( + &self, + filename: &Path, + check_ambig: bool, + ) -> Result<VfsFile, HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let filename = get_path_from_bytes(&encoded); + self.inner.create_atomic(filename, check_ambig) + } + + fn file_size(&self, file: &VfsFile) -> Result<u64, HgError> { + self.inner.file_size(file) + } + + fn exists(&self, filename: &Path) -> bool { + let encoded = path_encode(&get_bytes_from_path(filename)); + let filename = get_path_from_bytes(&encoded); + self.inner.exists(filename) } - fn copy(&self, _from: &Path, _to: &Path) -> Result<(), HgError> { - todo!() + + fn unlink(&self, filename: &Path) -> Result<(), HgError> { + let encoded = path_encode(&get_bytes_from_path(filename)); + let filename = get_path_from_bytes(&encoded); + self.inner.unlink(filename) + } + + fn rename( + &self, + from: &Path, + to: &Path, + check_ambig: bool, + ) -> Result<(), HgError> { + let encoded = path_encode(&get_bytes_from_path(from)); + let from = get_path_from_bytes(&encoded); + let encoded = path_encode(&get_bytes_from_path(to)); + let to = get_path_from_bytes(&encoded); + self.inner.rename(from, to, check_ambig) + } + + fn copy(&self, from: &Path, to: &Path) -> Result<(), HgError> { + let encoded = path_encode(&get_bytes_from_path(from)); + let from = get_path_from_bytes(&encoded); + let encoded = path_encode(&get_bytes_from_path(to)); + let to = get_path_from_bytes(&encoded); + self.inner.copy(from, to) + } + fn base(&self) -> &Path { + self.inner.base() + } +} + +/// Detect whether `path` is a hardlink and, if it is, replace it with a +/// temporary copy renamed over the original so that it becomes its own file. +/// Revlogs are usually hardlinked when doing a local clone, and we don't +/// want to modify the original repo. +fn copy_in_place_if_hardlink(path: &Path) -> Result<(), HgError> { + let metadata = path.metadata().when_writing_file(path)?; + if metadata.nlink() > 1 { + // If it's hardlinked, copy it and rename it back before changing it.
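+ // Note: every existing path has at least one link, so `nlink() > 1` is + // the shared-inode case: another directory entry (e.g. from a local + // clone) points at the same data. The copy below is created in the same + // directory on purpose, so the final `rename` stays on one filesystem + // and atomically swaps in the new, un-shared inode.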
+ let tmpdir = path.parent().expect("file at root"); + let name = Alphanumeric.sample_string(&mut rand::thread_rng(), 16); + let tmpfile = tmpdir.join(name); + std::fs::create_dir_all(tmpfile.parent().expect("file at root")) + .with_context(|| IoErrorContext::CopyingFile { + from: path.to_owned(), + to: tmpfile.to_owned(), + })?; + std::fs::copy(path, &tmpfile).with_context(|| { + IoErrorContext::CopyingFile { + from: path.to_owned(), + to: tmpfile.to_owned(), + } + })?; + std::fs::rename(&tmpfile, path).with_context(|| { + IoErrorContext::RenamingFile { + from: tmpfile, + to: path.to_owned(), + } + })?; + } + Ok(()) +} + +pub fn is_revlog_file(path: impl AsRef<Path>) -> bool { + path.as_ref() + .extension() + .map(|ext| { + ["i", "idx", "d", "dat", "n", "nd", "sda"] + .contains(&ext.to_string_lossy().as_ref()) + }) + .unwrap_or(false) +} + pub(crate) fn is_dir(path: impl AsRef<Path>) -> Result<bool, HgError> { Ok(fs_metadata(path)?.map_or(false, |meta| meta.is_dir())) } @@ -380,3 +1001,131 @@ pub(crate) fn is_on_nfs_mount(_path: impl AsRef<Path>) -> bool { false } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_atomic_file() { + let dir = tempfile::tempdir().unwrap().into_path(); + let target_path = dir.join("sometargetname"); + + for empty in [true, false] { + let file = AtomicFile::new(&target_path, empty, false).unwrap(); + assert!(file.is_open); + let filename = + file.temp_path.file_name().unwrap().to_str().unwrap(); + // Make sure we have a coherent temp name + assert_eq!(filename.len(), 29, "{}", filename); + assert!(filename.contains("sometargetname")); + + // Make sure the temp file is created in the same folder + assert_eq!(target_path.parent(), file.temp_path.parent()); + } + + assert!(!target_path.exists()); + std::fs::write(&target_path, "version 1").unwrap(); + let mut file = AtomicFile::new(&target_path, false, false).unwrap(); + file.write_all(b"version 2!").unwrap(); + assert_eq!( + std::fs::read(&target_path).unwrap(), + b"version 1".to_vec() + ); + let temp_path = file.temp_path.to_owned(); + // Dropping the file should discard the temp file and leave the + // target path untouched.
+ drop(file); + assert_eq!( + std::fs::read(&target_path).unwrap(), + b"version 1".to_vec() + ); + assert!(!temp_path.exists()); + + let mut file = AtomicFile::new(&target_path, false, false).unwrap(); + file.write_all(b"version 2!").unwrap(); + assert_eq!( + std::fs::read(&target_path).unwrap(), + b"version 1".to_vec() + ); + file.close().unwrap(); + assert_eq!( + std::fs::read(&target_path).unwrap(), + b"version 2!".to_vec(), + "{}", + std::fs::read_to_string(&target_path).unwrap() + ); + assert!(target_path.exists()); + assert!(!temp_path.exists()); + } + + #[test] + fn test_vfs_file_check_ambig() { + let dir = tempfile::tempdir().unwrap().into_path(); + let file_path = dir.join("file"); + + fn vfs_file_write(file_path: &Path, check_ambig: bool) { + let file = std::fs::OpenOptions::new() + .write(true) + .open(file_path) + .unwrap(); + let old_stat = if check_ambig { + Some(file.metadata().unwrap()) + } else { + None + }; + + let mut vfs_file = VfsFile::Normal { + file, + path: file_path.to_owned(), + check_ambig: old_stat, + }; + vfs_file.write_all(b"contents").unwrap(); + } + + std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(false) + .open(&file_path) + .unwrap(); + + let number_of_writes = 3; + + // Try multiple times, because reproduction of an ambiguity depends + // on "filesystem time" + for _ in 0..5 { + TIMESTAMP_FIXES_CALLS.store(0, Ordering::Relaxed); + vfs_file_write(&file_path, false); + let old_stat = file_path.metadata().unwrap(); + if old_stat.ctime() != old_stat.mtime() { + // subsequent changing never causes ambiguity + continue; + } + + // Repeat atomic write with `check_ambig == true`, to examine + // whether the mtime is advanced multiple times as expected + for _ in 0..number_of_writes { + vfs_file_write(&file_path, true); + } + let new_stat = file_path.metadata().unwrap(); + if !is_filetime_ambiguous(&new_stat, &old_stat) { + // timestamp ambiguity was naturally avoided during the + // repetition + continue; + } + + assert_eq!( + TIMESTAMP_FIXES_CALLS.load(Ordering::Relaxed), + number_of_writes + ); + assert_eq!( + old_stat.mtime() + number_of_writes as i64, + file_path.metadata().unwrap().mtime() + ); + break; + } + // If we've arrived here without breaking, we might not have + // tested anything because the platform is too slow. This test will + // still work on fast platforms. + } +} diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/Cargo.toml --- a/rust/hg-cpython/Cargo.toml Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-cpython/Cargo.toml Wed Nov 20 15:53:19 2024 +0100 @@ -17,3 +17,5 @@ env_logger = "0.9.3" stable_deref_trait = "1.2.0" vcsgraph = "0.2.0" +logging_timer = "1.1.0" +python3-sys = "0.7.1" diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/src/conversion.rs --- a/rust/hg-cpython/src/conversion.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-cpython/src/conversion.rs Wed Nov 20 15:53:19 2024 +0100 @@ -9,7 +9,7 @@ //! `hg-core` crate. From Python, this will be seen as `rustext.ancestor` use cpython::{ObjectProtocol, PyErr, PyObject, PyResult, Python}; -use hg::{Revision, RevlogIndex, UncheckedRevision}; +use hg::{revlog::RevlogIndex, Revision, UncheckedRevision}; use crate::{exceptions::GraphError, PyRevision}; diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/src/lib.rs --- a/rust/hg-cpython/src/lib.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-cpython/src/lib.rs Wed Nov 20 15:53:19 2024 +0100 @@ -47,6 +47,7 @@ pub mod revlog; pub mod update; pub mod utils; +pub mod vfs; /// Revision as exposed to/from the Python layer.
/// diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/src/pybytes_deref.rs --- a/rust/hg-cpython/src/pybytes_deref.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-cpython/src/pybytes_deref.rs Wed Nov 20 15:53:19 2024 +0100 @@ -1,4 +1,7 @@ -use cpython::{PyBytes, Python}; +use crate::cpython::buffer::Element; +use cpython::{ + buffer::PyBuffer, exc::ValueError, PyBytes, PyErr, PyResult, Python, +}; use stable_deref_trait::StableDeref; /// Safe abstraction over a `PyBytes` together with the `&[u8]` slice @@ -55,3 +58,67 @@ // but here sending one to another thread is fine since we ensure it stays // valid. unsafe impl Send for PyBytesDeref {} + +/// Safe abstraction over a `PyBuffer` together with the `&[u8]` slice +/// that borrows it. +/// +/// It also enables using a (wrapped) `PyBuffer` in GIL-unaware generic code. +pub struct PyBufferDeref { + #[allow(unused)] + keep_alive: PyBuffer, + + /// Borrows the buffer inside `self.keep_alive`, + /// but the borrow-checker cannot express self-referential structs. + data: *const [u8], +} + +fn get_buffer<'a>(py: Python, buf: &'a PyBuffer) -> PyResult<&'a [u8]> { + let len = buf.item_count(); + + let cbuf = buf.buf_ptr(); + let has_correct_item_size = std::mem::size_of::<u8>() == buf.item_size(); + let is_valid_buffer = has_correct_item_size + && buf.is_c_contiguous() + && u8::is_compatible_format(buf.format()) + && buf.readonly(); + + let bytes = if is_valid_buffer { + unsafe { std::slice::from_raw_parts(cbuf as *const u8, len) } + } else { + return Err(PyErr::new::<ValueError, _>( + py, + "Buffer has an invalid memory representation", + )); + }; + Ok(bytes) +} + +impl PyBufferDeref { + pub fn new(py: Python, buf: PyBuffer) -> PyResult<Self> { + Ok(Self { + data: get_buffer(py, &buf)?, + keep_alive: buf, + }) + } +} + +impl std::ops::Deref for PyBufferDeref { + type Target = [u8]; + + fn deref(&self) -> &[u8] { + // Safety: the raw pointer is valid as long as the PyBuffer is still + // alive, and the returned slice borrows `self`. + unsafe { &*self.data } + } +} + +unsafe impl StableDeref for PyBufferDeref {} + +#[allow(unused)] +fn static_assert_pybuffer_is_send() { + #[allow(clippy::no_effect)] + require_send::<PyBuffer>; +} + +// Safety: PyBuffer is Send. Raw pointers are not by default, +// but here sending one to another thread is fine since we ensure it stays +// valid. +unsafe impl Send for PyBufferDeref {} diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/src/revlog.rs --- a/rust/hg-cpython/src/revlog.rs Wed Nov 20 15:38:57 2024 +0100 +++ b/rust/hg-cpython/src/revlog.rs Wed Nov 20 15:53:19 2024 +0100 @@ -4,9 +4,11 @@ // // This software may be used and distributed according to the terms of the // GNU General Public License version 2 or any later version.
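The `PyBufferDeref` added to `pybytes_deref.rs` above relies on a self-referential pattern: own the Python buffer, keep a raw slice into it, and only hand the slice back out through `Deref`. A minimal sketch of the same idiom over a plain `Vec<u8>` (hypothetical, for illustration only) shows why the raw pointer stays valid:

struct OwnedSlice {
    // Owns the allocation; dropped together with the struct.
    #[allow(unused)]
    keep_alive: Vec<u8>,
    // Points into `keep_alive`'s heap buffer, which does not move when
    // the `Vec` itself is moved into the struct.
    data: *const [u8],
}

impl OwnedSlice {
    fn new(vec: Vec<u8>) -> Self {
        let data: *const [u8] = vec.as_slice();
        OwnedSlice { keep_alive: vec, data }
    }
}

impl std::ops::Deref for OwnedSlice {
    type Target = [u8];

    fn deref(&self) -> &[u8] {
        // Safety: `keep_alive` is never mutated, so the buffer behind
        // `data` is still allocated and unchanged.
        unsafe { &*self.data }
    }
}

`PyBufferDeref` does the same thing with a `PyBuffer`, plus the up-front `get_buffer` validation and the `StableDeref`/`Send` assertions needed to hand it to GIL-unaware code.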
+#![allow(non_snake_case)] use crate::{ conversion::{rev_pyiter_collect, rev_pyiter_collect_or_else}, + pybytes_deref::{PyBufferDeref, PyBytesDeref}, utils::{node_from_py_bytes, node_from_py_object}, PyRevision, }; @@ -14,39 +16,54 @@ buffer::{Element, PyBuffer}, exc::{IndexError, ValueError}, ObjectProtocol, PyBool, PyBytes, PyClone, PyDict, PyErr, PyInt, PyList, - PyModule, PyObject, PyResult, PySet, PyString, PyTuple, Python, + PyModule, PyObject, PyResult, PySet, PyTuple, PyType, Python, PythonObject, ToPyObject, UnsafePyLeaked, }; use hg::{ errors::HgError, - index::{ - IndexHeader, Phase, RevisionDataParams, SnapshotsCache, - INDEX_ENTRY_SIZE, + fncache::FnCache, + revlog::{ + compression::CompressionConfig, + index::{ + Index, IndexHeader, Phase, RevisionDataParams, SnapshotsCache, + INDEX_ENTRY_SIZE, + }, + inner_revlog::{InnerRevlog as CoreInnerRevlog, RevisionBuffer}, + nodemap::{Block, NodeMap, NodeMapError, NodeTree as CoreNodeTree}, + options::{ + RevlogDataConfig, RevlogDeltaConfig, RevlogFeatureConfig, + RevlogOpenOptions, + }, + Graph, NodePrefix, RevlogError, RevlogIndex, RevlogType, }, - nodemap::{Block, NodeMapError, NodeTree as CoreNodeTree}, - revlog::{nodemap::NodeMap, Graph, NodePrefix, RevlogError, RevlogIndex}, + transaction::Transaction, + utils::files::{get_bytes_from_path, get_path_from_bytes}, + vfs::FnCacheVfs, BaseRevision, Node, Revision, UncheckedRevision, NULL_REVISION, }; use std::{ - cell::RefCell, + cell::{Cell, RefCell}, collections::{HashMap, HashSet}, + sync::atomic::{AtomicBool, AtomicUsize, Ordering}, + sync::OnceLock, }; use vcsgraph::graph::Graph as VCSGraph; pub struct PySharedIndex { /// The underlying hg-core index - pub(crate) inner: &'static hg::index::Index, + pub(crate) inner: &'static Index, } /// Return a Struct implementing the Graph trait pub(crate) fn py_rust_index_to_graph( py: Python, - index: PyObject, + index_proxy: PyObject, ) -> PyResult<UnsafePyLeaked<PySharedIndex>> { - let midx = index.extract::<Index>(py)?; - let leaked = midx.index(py).leak_immutable(); + let inner_revlog = index_proxy.getattr(py, "inner")?; + let inner_revlog = inner_revlog.extract::<InnerRevlog>(py)?; + let leaked = inner_revlog.inner(py).leak_immutable(); // Safety: we don't leak the "faked" reference out of the `UnsafePyLeaked` - Ok(unsafe { leaked.map(py, |idx| PySharedIndex { inner: idx }) }) + Ok(unsafe { leaked.map(py, |idx| PySharedIndex { inner: &idx.index }) }) } impl Clone for PySharedIndex { @@ -91,398 +108,6 @@ } } -py_class!(pub class Index |py| { - @shared data index: hg::index::Index; - data nt: RefCell<Option<CoreNodeTree>>; - data docket: RefCell<Option<PyObject>>; - // Holds a reference to the mmap'ed persistent nodemap data - data nodemap_mmap: RefCell<Option<PyBuffer>>; - // Holds a reference to the mmap'ed persistent index data - data index_mmap: RefCell<Option<PyBuffer>>; - data head_revs_py_list: RefCell<Option<PyList>>; - data head_node_ids_py_list: RefCell<Option<PyList>>; - - def __new__( - _cls, - data: PyObject, - default_header: u32, - ) -> PyResult<Index> { - Self::new(py, data, default_header) - } - - /// Compatibility layer used for Python consumers needing access to the C index - /// - /// Only use case so far is `scmutil.shortesthexnodeidprefix`, - /// that may need to build a custom `nodetree`, based on a specified revset. - /// With a Rust implementation of the nodemap, we will be able to get rid of - /// this, by exposing our own standalone nodemap class, - /// ready to accept `Index`.
-/* def get_cindex(&self) -> PyResult { - Ok(self.cindex(py).borrow().inner().clone_ref(py)) - } -*/ - // Index API involving nodemap, as defined in mercurial/pure/parsers.py - - /// Return Revision if found, raises a bare `error.RevlogError` - /// in case of ambiguity, same as C version does - def get_rev(&self, node: PyBytes) -> PyResult> { - let opt = self.get_nodetree(py)?.borrow(); - let nt = opt.as_ref().unwrap(); - let ridx = &*self.index(py).borrow(); - let node = node_from_py_bytes(py, &node)?; - let rust_rev = - nt.find_bin(ridx, node.into()).map_err(|e| nodemap_error(py, e))?; - Ok(rust_rev.map(Into::into)) - - } - - /// same as `get_rev()` but raises a bare `error.RevlogError` if node - /// is not found. - /// - /// No need to repeat `node` in the exception, `mercurial/revlog.py` - /// will catch and rewrap with it - def rev(&self, node: PyBytes) -> PyResult { - self.get_rev(py, node)?.ok_or_else(|| revlog_error(py)) - } - - /// return True if the node exist in the index - def has_node(&self, node: PyBytes) -> PyResult { - // TODO OPTIM we could avoid a needless conversion here, - // to do when scaffolding for pure Rust switch is removed, - // as `get_rev()` currently does the necessary assertions - self.get_rev(py, node).map(|opt| opt.is_some()) - } - - /// find length of shortest hex nodeid of a binary ID - def shortest(&self, node: PyBytes) -> PyResult { - let opt = self.get_nodetree(py)?.borrow(); - let nt = opt.as_ref().unwrap(); - let idx = &*self.index(py).borrow(); - match nt.unique_prefix_len_node(idx, &node_from_py_bytes(py, &node)?) - { - Ok(Some(l)) => Ok(l), - Ok(None) => Err(revlog_error(py)), - Err(e) => Err(nodemap_error(py, e)), - } - } - - def partialmatch(&self, node: PyObject) -> PyResult> { - let opt = self.get_nodetree(py)?.borrow(); - let nt = opt.as_ref().unwrap(); - let idx = &*self.index(py).borrow(); - - let node_as_string = if cfg!(feature = "python3-sys") { - node.cast_as::(py)?.to_string(py)?.to_string() - } - else { - let node = node.extract::(py)?; - String::from_utf8_lossy(node.data(py)).to_string() - }; - - let prefix = NodePrefix::from_hex(&node_as_string) - .map_err(|_| PyErr::new::( - py, format!("Invalid node or prefix '{}'", node_as_string)) - )?; - - nt.find_bin(idx, prefix) - // TODO make an inner API returning the node directly - .map(|opt| opt.map( - |rev| PyBytes::new(py, idx.node(rev).unwrap().as_bytes()))) - .map_err(|e| nodemap_error(py, e)) - - } - - /// append an index entry - def append(&self, tup: PyTuple) -> PyResult { - if tup.len(py) < 8 { - // this is better than the panic promised by tup.get_item() - return Err( - PyErr::new::(py, "tuple index out of range")) - } - let node_bytes = tup.get_item(py, 7).extract(py)?; - let node = node_from_py_object(py, &node_bytes)?; - - let rev = self.len(py)? as BaseRevision; - - // This is ok since we will just add the revision to the index - let rev = Revision(rev); - self.index(py) - .borrow_mut() - .append(py_tuple_to_revision_data_params(py, tup)?) - .unwrap(); - let idx = &*self.index(py).borrow(); - self.get_nodetree(py)?.borrow_mut().as_mut().unwrap() - .insert(idx, &node, rev) - .map_err(|e| nodemap_error(py, e))?; - Ok(py.None()) - } - - def __delitem__(&self, key: PyObject) -> PyResult<()> { - // __delitem__ is both for `del idx[r]` and `del idx[r1:r2]` - let start = if let Ok(rev) = key.extract(py) { - UncheckedRevision(rev) - } else { - let start = key.getattr(py, "start")?; - UncheckedRevision(start.extract(py)?) 
- }; - let start = self.index(py) - .borrow() - .check_revision(start) - .ok_or_else(|| { - nodemap_error(py, NodeMapError::RevisionNotInIndex(start)) - })?; - self.index(py).borrow_mut().remove(start).unwrap(); - let mut opt = self.get_nodetree(py)?.borrow_mut(); - let nt = opt.as_mut().unwrap(); - nt.invalidate_all(); - self.fill_nodemap(py, nt)?; - Ok(()) - } - - // - // Index methods previously reforwarded to C index (tp_methods) - // Same ordering as in revlog.c - // - - /// return the gca set of the given revs - def ancestors(&self, *args, **_kw) -> PyResult { - let rust_res = self.inner_ancestors(py, args)?; - Ok(rust_res) - } - - /// return the heads of the common ancestors of the given revs - def commonancestorsheads(&self, *args, **_kw) -> PyResult { - let rust_res = self.inner_commonancestorsheads(py, args)?; - Ok(rust_res) - } - - /// Clear the index caches and inner py_class data. - /// It is Python's responsibility to call `update_nodemap_data` again. - def clearcaches(&self) -> PyResult { - self.nt(py).borrow_mut().take(); - self.docket(py).borrow_mut().take(); - self.nodemap_mmap(py).borrow_mut().take(); - self.head_revs_py_list(py).borrow_mut().take(); - self.head_node_ids_py_list(py).borrow_mut().take(); - self.index(py).borrow().clear_caches(); - Ok(py.None()) - } - - /// return the raw binary string representing a revision - def entry_binary(&self, *args, **_kw) -> PyResult { - let rindex = self.index(py).borrow(); - let rev = UncheckedRevision(args.get_item(py, 0).extract(py)?); - let rust_bytes = rindex.check_revision(rev).and_then( - |r| rindex.entry_binary(r)) - .ok_or_else(|| rev_not_in_index(py, rev))?; - let rust_res = PyBytes::new(py, rust_bytes).into_object(); - Ok(rust_res) - } - - /// return a binary packed version of the header - def pack_header(&self, *args, **_kw) -> PyResult { - let rindex = self.index(py).borrow(); - let packed = rindex.pack_header(args.get_item(py, 0).extract(py)?); - let rust_res = PyBytes::new(py, &packed).into_object(); - Ok(rust_res) - } - - /// compute phases - def computephasesmapsets(&self, *args, **_kw) -> PyResult { - let py_roots = args.get_item(py, 0).extract::(py)?; - let rust_res = self.inner_computephasesmapsets(py, py_roots)?; - Ok(rust_res) - } - - /// reachableroots - def reachableroots2(&self, *args, **_kw) -> PyResult { - let rust_res = self.inner_reachableroots2( - py, - UncheckedRevision(args.get_item(py, 0).extract(py)?), - args.get_item(py, 1), - args.get_item(py, 2), - args.get_item(py, 3).extract(py)?, - )?; - Ok(rust_res) - } - - /// get head revisions - def headrevs(&self, *args, **_kw) -> PyResult { - let (filtered_revs, stop_rev) = match &args.len(py) { - 0 => Ok((py.None(), py.None())), - 1 => Ok((args.get_item(py, 0), py.None())), - 2 => Ok((args.get_item(py, 0), args.get_item(py, 1))), - _ => Err(PyErr::new::(py, "too many arguments")), - }?; - self.inner_headrevs(py, &filtered_revs, &stop_rev) - } - - /// get head nodeids - def head_node_ids(&self) -> PyResult { - let rust_res = self.inner_head_node_ids(py)?; - Ok(rust_res) - } - - /// get diff in head revisions - def headrevsdiff(&self, *args, **_kw) -> PyResult { - let rust_res = self.inner_headrevsdiff( - py, - &args.get_item(py, 0), - &args.get_item(py, 1))?; - Ok(rust_res) - } - - /// True if the object is a snapshot - def issnapshot(&self, *args, **_kw) -> PyResult { - let index = self.index(py).borrow(); - let result = index - .is_snapshot(UncheckedRevision(args.get_item(py, 0).extract(py)?)) - .map_err(|e| { - PyErr::new::(py, e.to_string()) - 
})?; - Ok(result) - } - - /// Gather snapshot data in a cache dict - def findsnapshots(&self, *args, **_kw) -> PyResult { - let index = self.index(py).borrow(); - let cache: PyDict = args.get_item(py, 0).extract(py)?; - // this methods operates by setting new values in the cache, - // hence we will compare results by letting the C implementation - // operate over a deepcopy of the cache, and finally compare both - // caches. - let c_cache = PyDict::new(py); - for (k, v) in cache.items(py) { - c_cache.set_item(py, k, PySet::new(py, v)?)?; - } - - let start_rev = UncheckedRevision(args.get_item(py, 1).extract(py)?); - let end_rev = UncheckedRevision(args.get_item(py, 2).extract(py)?); - let mut cache_wrapper = PySnapshotsCache{ py, dict: cache }; - index.find_snapshots( - start_rev, - end_rev, - &mut cache_wrapper, - ).map_err(|_| revlog_error(py))?; - Ok(py.None()) - } - - /// determine revisions with deltas to reconstruct fulltext - def deltachain(&self, *args, **_kw) -> PyResult { - let index = self.index(py).borrow(); - let rev = args.get_item(py, 0).extract::(py)?.into(); - let stop_rev = - args.get_item(py, 1).extract::>(py)?; - let rev = index.check_revision(rev).ok_or_else(|| { - nodemap_error(py, NodeMapError::RevisionNotInIndex(rev)) - })?; - let stop_rev = if let Some(stop_rev) = stop_rev { - let stop_rev = UncheckedRevision(stop_rev); - Some(index.check_revision(stop_rev).ok_or_else(|| { - nodemap_error(py, NodeMapError::RevisionNotInIndex(stop_rev)) - })?) - } else {None}; - let using_general_delta = args.get_item(py, 2) - .extract::>(py)? - .map(|i| i != 0); - let (chain, stopped) = index.delta_chain( - rev, stop_rev, using_general_delta - ).map_err(|e| { - PyErr::new::(py, e.to_string()) - })?; - - let chain: Vec<_> = chain.into_iter().map(|r| r.0).collect(); - Ok( - PyTuple::new( - py, - &[ - chain.into_py_object(py).into_object(), - stopped.into_py_object(py).into_object() - ] - ).into_object() - ) - - } - - /// slice planned chunk read to reach a density threshold - def slicechunktodensity(&self, *args, **_kw) -> PyResult { - let rust_res = self.inner_slicechunktodensity( - py, - args.get_item(py, 0), - args.get_item(py, 1).extract(py)?, - args.get_item(py, 2).extract(py)? - )?; - Ok(rust_res) - } - - // index_sequence_methods and index_mapping_methods. - // - // Since we call back through the high level Python API, - // there's no point making a distinction between index_get - // and index_getitem. - // gracinet 2023: this above is no longer true for the pure Rust impl - - def __len__(&self) -> PyResult { - self.len(py) - } - - def __getitem__(&self, key: PyObject) -> PyResult { - let rust_res = self.inner_getitem(py, key.clone_ref(py))?; - Ok(rust_res) - } - - def __contains__(&self, item: PyObject) -> PyResult { - // ObjectProtocol does not seem to provide contains(), so - // this is an equivalent implementation of the index_contains() - // defined in revlog.c - match item.extract::(py) { - Ok(rev) => { - Ok(rev >= -1 && rev < self.len(py)? 
as BaseRevision) - } - Err(_) => { - let item_bytes: PyBytes = item.extract(py)?; - let rust_res = self.has_node(py, item_bytes)?; - Ok(rust_res) - } - } - } - - def nodemap_data_all(&self) -> PyResult { - self.inner_nodemap_data_all(py) - } - - def nodemap_data_incremental(&self) -> PyResult { - self.inner_nodemap_data_incremental(py) - } - def update_nodemap_data( - &self, - docket: PyObject, - nm_data: PyObject - ) -> PyResult { - self.inner_update_nodemap_data(py, docket, nm_data) - } - - @property - def entry_size(&self) -> PyResult { - let rust_res: PyInt = INDEX_ENTRY_SIZE.to_py_object(py); - Ok(rust_res) - } - - @property - def rust_ext_compat(&self) -> PyResult { - // will be entirely removed when the Rust index yet useful to - // implement in Rust to detangle things when removing `self.cindex` - let rust_res: PyInt = 1.to_py_object(py); - Ok(rust_res) - } - - @property - def is_rust(&self) -> PyResult { - Ok(false.to_py_object(py)) - } - -}); - /// Take a (potentially) mmap'ed buffer, and return the underlying Python /// buffer along with the Rust slice into said buffer. We need to keep the /// Python buffer around, otherwise we'd get a dangling pointer once the buffer @@ -513,8 +138,7 @@ } else { return Err(PyErr::new::( py, - "Nodemap data buffer has an invalid memory representation" - .to_string(), + "buffer has an invalid memory representation".to_string(), )); }; @@ -538,7 +162,7 @@ .extract::(py)? .data(py) .try_into() - .unwrap(); + .expect("nodeid should be set"); let flags = (offset_or_flags & 0xFFFF) as u16; let data_offset = offset_or_flags >> 16; Ok(RevisionDataParams { @@ -622,36 +246,1252 @@ } } -impl Index { - fn new(py: Python, data: PyObject, header: u32) -> PyResult { - // Safety: we keep the buffer around inside the class as `index_mmap` - let (buf, bytes) = unsafe { mmap_keeparound(py, data)? }; +// There are no static generics in Rust (because their implementation is hard, +// I'm guessing it's due to different compilation stages, etc.). +// So manually generate all three caches and use them in `with_filelog_cache`. +static DELTA_CONFIG_CACHE: OnceLock<(PyObject, RevlogDeltaConfig)> = + OnceLock::new(); +static DATA_CONFIG_CACHE: OnceLock<(PyObject, RevlogDataConfig)> = + OnceLock::new(); +static FEATURE_CONFIG_CACHE: OnceLock<(PyObject, RevlogFeatureConfig)> = + OnceLock::new(); + +/// Cache the first conversion from Python -> Rust config for all filelogs to +/// save on conversion time when called in a loop. +fn with_filelog_cache( + py: Python, + py_config: &PyObject, + revlog_type: RevlogType, + cache: &OnceLock<(PyObject, T)>, + callback: impl Fn() -> PyResult, +) -> PyResult { + let mut was_cached = false; + if revlog_type == RevlogType::Filelog { + if let Some((cached_py_config, rust_config)) = cache.get() { + was_cached = true; + // All filelogs in a given repository *most likely* have the + // exact same config, but it's not impossible that some extensions + // do some magic with configs or that this code will be used + // for longer-running processes. So compare the source `PyObject` + // in case the source changed, at the cost of some overhead. + // We can't use `py_config.eq(cached_py_config)` because all config + // objects are different in Python and `a is b` is false. + if py_config.compare(py, cached_py_config)?.is_eq() { + return Ok(*rust_config); + } + } + } + let config = callback()?; + // Do not call the lock unnecessarily if it's already been set. 
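+ // `OnceLock::set` returns `Err` when a value is already stored (for + // instance if two threads raced to fill the cache); the `.ok()` below + // discards that error on purpose, since whichever `(py_config, config)` + // pair won the race is an equally valid cache entry.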
+ if !was_cached && revlog_type == RevlogType::Filelog { + cache.set((py_config.clone_ref(py), config)).ok(); + } + Ok(config) +} + +fn extract_delta_config( + py: Python, + py_config: PyObject, + revlog_type: RevlogType, +) -> PyResult { + let get_delta_config = || { + let max_deltachain_span = py_config + .getattr(py, "max_deltachain_span")? + .extract::(py)?; + + let revlog_delta_config = RevlogDeltaConfig { + general_delta: py_config + .getattr(py, "general_delta")? + .extract(py)?, + sparse_revlog: py_config + .getattr(py, "sparse_revlog")? + .extract(py)?, + max_chain_len: py_config + .getattr(py, "max_chain_len")? + .extract(py)?, + max_deltachain_span: if max_deltachain_span < 0 { + None + } else { + Some(max_deltachain_span as u64) + }, + upper_bound_comp: py_config + .getattr(py, "upper_bound_comp")? + .extract(py)?, + delta_both_parents: py_config + .getattr(py, "delta_both_parents")? + .extract(py)?, + candidate_group_chunk_size: py_config + .getattr(py, "candidate_group_chunk_size")? + .extract(py)?, + debug_delta: py_config.getattr(py, "debug_delta")?.extract(py)?, + lazy_delta: py_config.getattr(py, "lazy_delta")?.extract(py)?, + lazy_delta_base: py_config + .getattr(py, "lazy_delta_base")? + .extract(py)?, + }; + Ok(revlog_delta_config) + }; + with_filelog_cache( + py, + &py_config, + revlog_type, + &DELTA_CONFIG_CACHE, + get_delta_config, + ) +} + +fn extract_data_config( + py: Python, + py_config: PyObject, + revlog_type: RevlogType, +) -> PyResult { + let get_data_config = || { + Ok(RevlogDataConfig { + try_pending: py_config.getattr(py, "try_pending")?.extract(py)?, + try_split: py_config.getattr(py, "try_split")?.extract(py)?, + check_ambig: py_config.getattr(py, "check_ambig")?.extract(py)?, + mmap_large_index: py_config + .getattr(py, "mmap_large_index")? + .extract(py)?, + mmap_index_threshold: py_config + .getattr(py, "mmap_index_threshold")? + .extract(py)?, + chunk_cache_size: py_config + .getattr(py, "chunk_cache_size")? + .extract(py)?, + uncompressed_cache_factor: py_config + .getattr(py, "uncompressed_cache_factor")? + .extract(py)?, + uncompressed_cache_count: py_config + .getattr(py, "uncompressed_cache_count")? + .extract(py)?, + with_sparse_read: py_config + .getattr(py, "with_sparse_read")? + .extract(py)?, + sr_density_threshold: py_config + .getattr(py, "sr_density_threshold")? + .extract(py)?, + sr_min_gap_size: py_config + .getattr(py, "sr_min_gap_size")? + .extract(py)?, + general_delta: py_config + .getattr(py, "generaldelta")? + .extract(py)?, + }) + }; + + with_filelog_cache( + py, + &py_config, + revlog_type, + &DATA_CONFIG_CACHE, + get_data_config, + ) +} + +fn extract_feature_config( + py: Python, + py_config: PyObject, + revlog_type: RevlogType, +) -> PyResult { + let get_feature_config = || { + let engine_bytes = &py_config + .getattr(py, "compression_engine")? + .extract::(py)?; + let compression_engine = engine_bytes.data(py); + let compression_engine = match compression_engine { + b"zlib" => { + let compression_options = &py_config + .getattr(py, "compression_engine_options")? + .extract::(py)?; + let zlib_level = compression_options + .get_item(py, PyBytes::new(py, &b"zlib.level"[..])); + let level = if let Some(level) = zlib_level { + if level.is_none(py) { + None + } else { + Some(level.extract(py)?) 
+ } + } else { + None + }; + let mut engine = CompressionConfig::default(); + if let Some(level) = level { + engine + .set_level(level) + .expect("invalid compression level from Python"); + } + engine + } + b"zstd" => { + let compression_options = &py_config + .getattr(py, "compression_engine_options")? + .extract::(py)?; + let zstd_level = compression_options + .get_item(py, PyBytes::new(py, &b"zstd.level"[..])); + let level = if let Some(level) = zstd_level { + if level.is_none(py) { + None + } else { + Some(level.extract(py)?) + } + } else { + let level = compression_options + .get_item(py, PyBytes::new(py, &b"level"[..])); + if let Some(level) = level { + if level.is_none(py) { + None + } else { + Some(level.extract(py)?) + } + } else { + None + } + }; + CompressionConfig::zstd(level) + .expect("invalid compression level from Python") + } + b"none" => CompressionConfig::None, + e => { + return Err(PyErr::new::( + py, + format!( + "invalid compression engine {}", + String::from_utf8_lossy(e) + ), + )) + } + }; + let revlog_feature_config = RevlogFeatureConfig { + compression_engine, + censorable: py_config.getattr(py, "censorable")?.extract(py)?, + has_side_data: py_config + .getattr(py, "has_side_data")? + .extract(py)?, + compute_rank: py_config + .getattr(py, "compute_rank")? + .extract(py)?, + canonical_parent_order: py_config + .getattr(py, "canonical_parent_order")? + .extract(py)?, + enable_ellipsis: py_config + .getattr(py, "enable_ellipsis")? + .extract(py)?, + }; + Ok(revlog_feature_config) + }; + with_filelog_cache( + py, + &py_config, + revlog_type, + &FEATURE_CONFIG_CACHE, + get_feature_config, + ) +} + +fn revlog_error_from_msg(py: Python, e: impl ToString) -> PyErr { + let msg = e.to_string(); - Self::create_instance( + match py + .import("mercurial.error") + .and_then(|m| m.get(py, "RevlogError")) + { + Err(e) => e, + Ok(cls) => { + let msg = PyBytes::new(py, msg.as_bytes()); + PyErr::from_instance( + py, + cls.call(py, (msg,), None).ok().into_py_object(py), + ) + } + } +} + +py_class!(pub class ReadingContextManager |py| { + data inner_revlog: RefCell; + + def __enter__(&self) -> PyResult { + let res = self.inner_revlog(py) + .borrow() + .inner(py) + .borrow() + .enter_reading_context() + .map_err(|e| revlog_error_from_msg(py, e)); + if let Err(e) = res { + // `__exit__` is not called from Python if `__enter__` fails + self.inner_revlog(py) + .borrow() + .inner(py) + .borrow() + .exit_reading_context(); + return Err(e) + } + Ok(py.None()) + } + + def __exit__( + &self, + ty: Option, + value: PyObject, + traceback: PyObject + ) -> PyResult { + // unused arguments, keep clippy from complaining without adding + // a general rule + let _ = ty; + let _ = value; + let _ = traceback; + + self.inner_revlog(py) + .borrow() + .inner(py) + .borrow() + .exit_reading_context(); + Ok(py.None()) + } +}); + +// Only used from Python *tests* +py_class!(pub class PyFileHandle |py| { + data inner_file: RefCell; + + def tell(&self) -> PyResult { + let locals = PyDict::new(py); + locals.set_item(py, "os", py.import("os")?)?; + locals.set_item(py, "fd", *self.inner_file(py).borrow())?; + let f = py.eval("os.fdopen(fd)", None, Some(&locals))?; + + // Prevent Python from closing the file after garbage collecting. + // This is fine since Rust is still holding on to the actual File. + // (and also because it's only used in tests). 
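+ // `clone_ref` bumps the Python reference count and `forget` then leaks + // that extra reference, so the refcount can never reach zero and Python + // will not finalize the file object (which would close the fd that the + // Rust side still owns).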
+ std::mem::forget(f.clone_ref(py)); + + locals.set_item(py, "f", f)?; + let res = py.eval("f.tell()", None, Some(&locals))?; + Ok(res) + } +}); + +/// Wrapper around a Python transaction object, to keep `hg-core` oblivious +/// of the fact it's being called from Python. +pub struct PyTransaction { + inner: PyObject, +} + +impl PyTransaction { + pub fn new(inner: PyObject) -> Self { + Self { inner } + } +} + +impl Clone for PyTransaction { + fn clone(&self) -> Self { + let gil = &Python::acquire_gil(); + let py = gil.python(); + Self { + inner: self.inner.clone_ref(py), + } + } +} + +impl Transaction for PyTransaction { + fn add(&mut self, file: impl AsRef, offset: usize) { + let gil = &Python::acquire_gil(); + let py = gil.python(); + let file = PyBytes::new(py, &get_bytes_from_path(file.as_ref())); + self.inner + .call_method(py, "add", (file, offset), None) + .expect("transaction add failed"); + } +} + +py_class!(pub class WritingContextManager |py| { + data inner_revlog: RefCell; + data transaction: RefCell; + data data_end: Cell>; + + def __enter__(&self) -> PyResult { + let res = self.inner_revlog(py) + .borrow_mut() + .inner(py) + .borrow_mut() + .enter_writing_context( + self.data_end(py).get(), + &mut *self.transaction(py).borrow_mut() + ).map_err(|e| revlog_error_from_msg(py, e)); + if let Err(e) = res { + // `__exit__` is not called from Python if `__enter__` fails + self.inner_revlog(py) + .borrow_mut() + .inner(py) + .borrow_mut() + .exit_writing_context(); + return Err(e) + } + Ok(py.None()) + } + + def __exit__( + &self, + ty: Option, + value: PyObject, + traceback: PyObject + ) -> PyResult { + // unused arguments, keep clippy from complaining without adding + // a general rule + let _ = ty; + let _ = value; + let _ = traceback; + + self.inner_revlog(py) + .borrow_mut() + .inner(py) + .borrow_mut() + .exit_writing_context(); + Ok(py.None()) + } +}); + +struct PyFnCache { + fncache: PyObject, +} +impl PyFnCache { + fn new(fncache: PyObject) -> Self { + Self { fncache } + } +} + +impl Clone for PyFnCache { + fn clone(&self) -> Self { + let gil = Python::acquire_gil(); + let py = gil.python(); + Self { + fncache: self.fncache.clone_ref(py), + } + } +} + +/// Cache whether the fncache is loaded to avoid Python round-trip every time. +/// Once the fncache is loaded, it stays loaded unless we're in a very +/// long-running process, none of which we actually support for now. +static FN_CACHE_IS_LOADED: AtomicBool = AtomicBool::new(false); + +impl FnCache for PyFnCache { + fn is_loaded(&self) -> bool { + if FN_CACHE_IS_LOADED.load(Ordering::Relaxed) { + return true; + } + let gil = Python::acquire_gil(); + let py = gil.python(); + // TODO raise in case of error? + let is_loaded = self + .fncache + .getattr(py, "is_loaded") + .ok() + .map(|o| { + o.extract::(py) + .expect("is_loaded returned something other than a bool") + }) + .unwrap_or(false); + if is_loaded { + FN_CACHE_IS_LOADED.store(true, Ordering::Relaxed); + } + is_loaded + } + fn add(&self, path: &std::path::Path) { + let gil = Python::acquire_gil(); + let py = gil.python(); + // TODO raise in case of error? 
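+ // As with `is_loaded` above, a Python-side exception is silently dropped + // by the trailing `.ok()`: `FnCache::add` returns `()`, so there is + // currently no channel to report the failure back to the Rust caller.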
+ self.fncache + .call_method( + py, + "add", + (PyBytes::new(py, &get_bytes_from_path(path)),), + None, + ) + .ok(); + } +} + +py_class!(pub class InnerRevlog |py| { + @shared data inner: CoreInnerRevlog; + data nt: RefCell>; + data docket: RefCell>; + // Holds a reference to the mmap'ed persistent nodemap data + data nodemap_mmap: RefCell>; + // Holds a reference to the mmap'ed persistent index data + data index_mmap: RefCell; + data head_revs_py_list: RefCell>; + data head_node_ids_py_list: RefCell>; + data revision_cache: RefCell>; + data use_persistent_nodemap: bool; + data nodemap_queries: AtomicUsize; + + def __new__( + _cls, + vfs_base: PyObject, + fncache: PyObject, + vfs_is_readonly: bool, + index_data: PyObject, + index_file: PyObject, + data_file: PyObject, + sidedata_file: PyObject, + inline: bool, + data_config: PyObject, + delta_config: PyObject, + feature_config: PyObject, + chunk_cache: PyObject, + default_compression_header: PyObject, + revlog_type: usize, + use_persistent_nodemap: bool, + ) -> PyResult { + Self::inner_new( py, - hg::index::Index::new( - bytes, - IndexHeader::parse(&header.to_be_bytes()) - .expect("default header is broken") - .unwrap(), - ) - .map_err(|e| { - revlog_error_with_msg(py, e.to_string().as_bytes()) - })?, - RefCell::new(None), - RefCell::new(None), - RefCell::new(None), - RefCell::new(Some(buf)), - RefCell::new(None), - RefCell::new(None), + vfs_base, + fncache, + vfs_is_readonly, + index_data, + index_file, + data_file, + sidedata_file, + inline, + data_config, + delta_config, + feature_config, + chunk_cache, + default_compression_header, + revlog_type, + use_persistent_nodemap ) } + def clear_cache(&self) -> PyResult { + assert!(!self.is_delaying(py)?); + self.revision_cache(py).borrow_mut().take(); + self.inner(py).borrow_mut().clear_cache(); + self.nodemap_queries(py).store(0, Ordering::Relaxed); + Ok(py.None()) + } + + @property def canonical_index_file(&self) -> PyResult { + let path = self.inner(py).borrow().canonical_index_file(); + Ok(PyBytes::new(py, &get_bytes_from_path(path))) + } + + @property def is_delaying(&self) -> PyResult { + Ok(self.inner(py).borrow().is_delaying()) + } + + @property def _revisioncache(&self) -> PyResult { + let cache = &*self.revision_cache(py).borrow(); + match cache { + None => Ok(py.None()), + Some(cache) => { + Ok(cache.clone_ref(py)) + } + } + + } + + @property def _writinghandles(&self) -> PyResult { + use std::os::fd::AsRawFd; + + let inner = self.inner(py).borrow(); + let handles = inner.python_writing_handles(); + + match handles.as_ref() { + None => Ok(py.None()), + Some(handles) => { + let d_handle = if let Some(d_handle) = &handles.data_handle { + let handle = RefCell::new(d_handle.file.as_raw_fd()); + Some(PyFileHandle::create_instance(py, handle)?) + } else { + None + }; + let handle = + RefCell::new(handles.index_handle.file.as_raw_fd()); + Ok( + ( + PyFileHandle::create_instance(py, handle)?, + d_handle, + py.None(), // Sidedata handle + + ).to_py_object(py).into_object() + ) + } + } + + } + + @_revisioncache.setter def set_revision_cache( + &self, + value: Option + ) -> PyResult<()> { + *self.revision_cache(py).borrow_mut() = value.clone_ref(py); + match value { + None => { + // This means the property has been deleted, *not* that the + // property has been set to `None`. Whatever happens is up + // to the implementation. Here we just set it to `None`. 
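+ // (rust-cpython delivers `del obj._revisioncache` as `value == None`, + // while an explicit `obj._revisioncache = None` arrives as + // `Some(py.None())` and is handled by the `is_none` check below.)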
+ self + .inner(py) + .borrow() + .last_revision_cache + .lock() + .expect("lock should not be held") + .take(); + }, + Some(tuple) => { + if tuple.is_none(py) { + self + .inner(py) + .borrow() + .last_revision_cache + .lock() + .expect("lock should not be held") + .take(); + return Ok(()) + } + let node = tuple.get_item(py, 0)?.extract::(py)?; + let node = node_from_py_bytes(py, &node)?; + let rev = tuple.get_item(py, 1)?.extract::(py)?; + // Ok because Python only sets this if the revision has been + // checked + let rev = Revision(rev); + let data = tuple.get_item(py, 2)?.extract::(py)?; + let inner = self.inner(py).borrow(); + let mut last_revision_cache = inner + .last_revision_cache + .lock() + .expect("lock should not be held"); + *last_revision_cache = + Some((node, rev, Box::new(PyBytesDeref::new(py, data)))); + } + } + Ok(()) + } + + @property def inline(&self) -> PyResult { + Ok(self.inner(py).borrow().is_inline()) + } + + @inline.setter def set_inline( + &self, + value: Option + ) -> PyResult<()> { + if let Some(v) = value { + self.inner(py).borrow_mut().inline = v.extract(py)?; + }; + Ok(()) + } + + @property def index_file(&self) -> PyResult { + Ok( + PyBytes::new( + py, + &get_bytes_from_path(&self.inner(py).borrow().index_file) + ) + ) + } + + @index_file.setter def set_index_file( + &self, + value: Option + ) -> PyResult<()> { + let path = get_path_from_bytes( + value + .expect("don't delete the index path") + .extract::(py)? + .data(py) + ).to_owned(); + self.inner(py).borrow_mut().index_file = path; + Ok(()) + } + + @property def is_writing(&self) -> PyResult { + Ok(self.inner(py).borrow().is_writing()) + } + + @property def is_open(&self) -> PyResult { + Ok(self.inner(py).borrow().is_open()) + } + + def issnapshot(&self, rev: PyRevision) -> PyResult { + self.inner_issnapshot(py, UncheckedRevision(rev.0)) + } + + def _deltachain(&self, *args, **kw) -> PyResult { + let inner = self.inner(py).borrow(); + let general_delta = inner.index.uses_generaldelta(); + let args = PyTuple::new( + py, + &[ + args.get_item(py, 0), + kw.and_then(|d| d.get_item(py, "stoprev")).to_py_object(py), + general_delta.to_py_object(py).into_object(), + ] + ); + self._index_deltachain(py, &args, kw) + } + + def compress(&self, data: PyObject) -> PyResult { + let inner = self.inner(py).borrow(); + let py_buffer = PyBuffer::get(py, &data)?; + let deref = PyBufferDeref::new(py, py_buffer)?; + let compressed = inner.compress(&deref) + .map_err(|e| revlog_error_from_msg(py, e))?; + let compressed = compressed.as_deref(); + let header = if compressed.is_some() { + PyBytes::new(py, &b""[..]) + } else { + PyBytes::new(py, &b"u"[..]) + }; + Ok( + ( + header, + PyBytes::new(py, compressed.unwrap_or(&deref)) + ).to_py_object(py) + ) + } + + def reading(&self) -> PyResult { + ReadingContextManager::create_instance( + py, + RefCell::new(self.clone_ref(py)), + ) + } + + def writing( + &self, + transaction: PyObject, + data_end: Option, + sidedata_end: Option, + ) -> PyResult { + // Silence unused argument (only relevant for changelog v2) + let _ = sidedata_end; + WritingContextManager::create_instance( + py, + RefCell::new(self.clone_ref(py)), + RefCell::new(PyTransaction::new(transaction)), + Cell::new(data_end) + ) + } + + def split_inline( + &self, + _tr: PyObject, + header: i32, + new_index_file_path: Option + ) -> PyResult { + let mut inner = self.inner(py).borrow_mut(); + let new_index_file_path = match new_index_file_path { + Some(path) => { + let path = path.extract::(py)?; + 
Some(get_path_from_bytes(path.data(py)).to_owned()) + }, + None => None, + }; + let header = IndexHeader::parse(&header.to_be_bytes()); + let header = header.expect("invalid header bytes"); + let path = inner + .split_inline(header, new_index_file_path) + .map_err(|e| revlog_error_from_msg(py, e))?; + Ok(PyBytes::new(py, &get_bytes_from_path(path))) + } + + def get_segment_for_revs( + &self, + startrev: PyRevision, + endrev: PyRevision, + ) -> PyResult { + let inner = self.inner(py).borrow(); + let (offset, data) = inner + .get_segment_for_revs(Revision(startrev.0), Revision(endrev.0)) + .map_err(|e| revlog_error_from_msg(py, e))?; + let data = PyBytes::new(py, &data); + Ok((offset, data).to_py_object(py)) + } + + def raw_text( + &self, + _node: PyObject, + rev: PyRevision + ) -> PyResult { + let inner = self.inner(py).borrow(); + let mut py_bytes = PyBytes::new(py, &[]); + inner + .raw_text(Revision(rev.0), |size, f| { + py_bytes = with_pybytes_buffer(py, size, f)?; + Ok(()) + }).map_err(|e| revlog_error_from_msg(py, e))?; + Ok(py_bytes) + } + + def _chunk( + &self, + rev: PyRevision, + ) -> PyResult { + let inner = self.inner(py).borrow(); + let chunk = inner + .chunk_for_rev(Revision(rev.0)) + .map_err(|e| revlog_error_from_msg(py, e))?; + let chunk = PyBytes::new(py, &chunk); + Ok(chunk) + } + + def write_entry( + &self, + transaction: PyObject, + entry: PyObject, + data: PyTuple, + _link: PyObject, + offset: usize, + _sidedata: PyObject, + _sidedata_offset: PyInt, + index_end: Option, + data_end: Option, + _sidedata_end: Option, + ) -> PyResult { + let mut inner = self.inner(py).borrow_mut(); + let transaction = PyTransaction::new(transaction); + let py_bytes = entry.extract(py)?; + let entry = PyBytesDeref::new(py, py_bytes); + let header = data.get_item(py, 0).extract::(py)?; + let header = header.data(py); + let data = data.get_item(py, 1); + let py_bytes = data.extract(py)?; + let data = PyBytesDeref::new(py, py_bytes); + Ok( + inner.write_entry( + transaction, + &entry, + (header, &data), + offset, + index_end, + data_end + ).map_err(|e| revlog_error_from_msg(py, e))? + .to_py_object(py) + ) + } + + def delay(&self) -> PyResult> { + let path = self.inner(py) + .borrow_mut() + .delay() + .map_err(|e| revlog_error_from_msg(py, e))?; + Ok(path.map(|p| PyBytes::new(py, &get_bytes_from_path(p)))) + } + + def write_pending(&self) -> PyResult { + let (path, any_pending) = self.inner(py) + .borrow_mut() + .write_pending() + .map_err(|e| revlog_error_from_msg(py, e))?; + let maybe_path = match path { + Some(path) => { + PyBytes::new(py, &get_bytes_from_path(path)).into_object() + }, + None => { + py.None() + } + }; + Ok( + ( + maybe_path, + any_pending + ).to_py_object(py) + ) + } + + def finalize_pending(&self) -> PyResult { + let path = self.inner(py) + .borrow_mut() + .finalize_pending() + .map_err(|e| revlog_error_from_msg(py, e))?; + Ok(PyBytes::new(py, &get_bytes_from_path(path))) + } + + // -- forwarded index methods -- + + def _index_get_rev(&self, node: PyBytes) -> PyResult> { + let node = node_from_py_bytes(py, &node)?; + // Filelogs have no persistent nodemaps and are often small, use a + // brute force lookup from the end backwards. If there is a very large + // filelog (automation file that changes every commit etc.), it also + // seems to work quite well for all measured purposes so far. 
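+ // Concretely: the first four lookups on a revlog without a persistent + // nodemap fall through to the brute-force reverse scan; from the fifth + // lookup on, `get_nodetree` builds (and then reuses) an in-memory + // nodetree, so repeated O(len) scans stop once a caller looks busy + // enough to amortize building it.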
+ let mut nodemap_queries = + self.nodemap_queries(py).fetch_add(1, Ordering::Relaxed); + // Still need to add since `fetch_add` returns the old value + nodemap_queries += 1; + if !*self.use_persistent_nodemap(py) && nodemap_queries <= 4 { + let idx = &self.inner(py).borrow().index; + let res = + idx.rev_from_node_no_persistent_nodemap(node.into()).ok(); + return Ok(res.map(Into::into)) + } + let opt = self.get_nodetree(py)?.borrow(); + let nt = opt.as_ref().expect("nodetree should be set"); + let ridx = &self.inner(py).borrow().index; + let rust_rev = + nt.find_bin(ridx, node.into()).map_err(|e| nodemap_error(py, e))?; + Ok(rust_rev.map(Into::into)) + } + + /// same as `_index_get_rev()` but raises a bare `error.RevlogError` if node + /// is not found. + /// + /// No need to repeat `node` in the exception, `mercurial/revlog.py` + /// will catch and rewrap with it + def _index_rev(&self, node: PyBytes) -> PyResult { + self._index_get_rev(py, node)?.ok_or_else(|| revlog_error(py)) + } + + /// return True if the node exist in the index + def _index_has_node(&self, node: PyBytes) -> PyResult { + // TODO OPTIM we could avoid a needless conversion here, + // to do when scaffolding for pure Rust switch is removed, + // as `_index_get_rev()` currently does the necessary assertions + self._index_get_rev(py, node).map(|opt| opt.is_some()) + } + + /// find length of shortest hex nodeid of a binary ID + def _index_shortest(&self, node: PyBytes) -> PyResult { + let opt = self.get_nodetree(py)?.borrow(); + let nt = opt.as_ref().expect("nodetree should be set"); + let idx = &self.inner(py).borrow().index; + match nt.unique_prefix_len_node(idx, &node_from_py_bytes(py, &node)?) + { + Ok(Some(l)) => Ok(l), + Ok(None) => Err(revlog_error(py)), + Err(e) => Err(nodemap_error(py, e)), + } + } + + def _index_partialmatch( + &self, + node: PyObject + ) -> PyResult> { + let opt = self.get_nodetree(py)?.borrow(); + let nt = opt.as_ref().expect("nodetree should be set"); + let idx = &self.inner(py).borrow().index; + + let node = node.extract::(py)?; + let node_as_string = String::from_utf8_lossy(node.data(py)); + + let prefix = NodePrefix::from_hex(node_as_string.to_string()) + .map_err(|_| PyErr::new::( + py, format!("Invalid node or prefix '{}'", node_as_string)) + )?; + + nt.find_bin(idx, prefix) + // TODO make an inner API returning the node directly + .map(|opt| opt.map(|rev| { + PyBytes::new( + py, + idx.node(rev).expect("node should exist").as_bytes() + ) + })) + .map_err(|e| nodemap_error(py, e)) + + } + + /// append an index entry + def _index_append(&self, tup: PyTuple) -> PyResult { + if tup.len(py) < 8 { + // this is better than the panic promised by tup.get_item() + return Err( + PyErr::new::(py, "tuple index out of range")) + } + let node_bytes = tup.get_item(py, 7).extract(py)?; + let node = node_from_py_object(py, &node_bytes)?; + + let rev = self.len(py)? as BaseRevision; + + // This is ok since we will just add the revision to the index + let rev = Revision(rev); + self.inner(py) + .borrow_mut() + .index + .append(py_tuple_to_revision_data_params(py, tup)?) + .map_err(|e| revlog_error_from_msg(py, e))?; + let idx = &self.inner(py).borrow().index; + self.get_nodetree(py)? 
+ .borrow_mut() + .as_mut() + .expect("nodetree should be set") + .insert(idx, &node, rev) + .map_err(|e| nodemap_error(py, e))?; + Ok(py.None()) + } + + def _index___delitem__(&self, key: PyObject) -> PyResult { + // __delitem__ is both for `del idx[r]` and `del idx[r1:r2]` + let start = if let Ok(rev) = key.extract(py) { + UncheckedRevision(rev) + } else { + let start = key.getattr(py, "start")?; + UncheckedRevision(start.extract(py)?) + }; + let mut borrow = self.inner(py).borrow_mut(); + let start = borrow + .index + .check_revision(start) + .ok_or_else(|| { + nodemap_error(py, NodeMapError::RevisionNotInIndex(start)) + })?; + borrow.index + .remove(start) + .map_err(|e| revlog_error_from_msg(py, e))?; + drop(borrow); + let mut opt = self.get_nodetree(py)?.borrow_mut(); + let nt = opt.as_mut().expect("nodetree should be set"); + nt.invalidate_all(); + self.fill_nodemap(py, nt)?; + Ok(py.None()) + } + + /// return the gca set of the given revs + def _index_ancestors(&self, *args, **_kw) -> PyResult { + let rust_res = self.inner_ancestors(py, args)?; + Ok(rust_res) + } + + /// return the heads of the common ancestors of the given revs + def _index_commonancestorsheads( + &self, + *args, + **_kw + ) -> PyResult { + let rust_res = self.inner_commonancestorsheads(py, args)?; + Ok(rust_res) + } + + /// Clear the index caches and inner py_class data. + /// It is Python's responsibility to call `update_nodemap_data` again. + def _index_clearcaches(&self) -> PyResult { + self.nt(py).borrow_mut().take(); + self.docket(py).borrow_mut().take(); + self.nodemap_mmap(py).borrow_mut().take(); + self.head_revs_py_list(py).borrow_mut().take(); + self.head_node_ids_py_list(py).borrow_mut().take(); + self.inner(py).borrow_mut().index.clear_caches(); + Ok(py.None()) + } + + /// return the raw binary string representing a revision + def _index_entry_binary(&self, *args, **_kw) -> PyResult { + let rindex = &self.inner(py).borrow().index; + let rev = UncheckedRevision(args.get_item(py, 0).extract(py)?); + let rust_bytes = rindex.check_revision(rev).and_then( + |r| rindex.entry_binary(r)).ok_or_else(|| rev_not_in_index(py, rev) + )?; + let rust_res = PyBytes::new(py, rust_bytes).into_object(); + Ok(rust_res) + } + + + /// return a binary packed version of the header + def _index_pack_header(&self, *args, **_kw) -> PyResult { + let rindex = &self.inner(py).borrow().index; + let packed = rindex.pack_header(args.get_item(py, 0).extract(py)?); + let rust_res = PyBytes::new(py, &packed).into_object(); + Ok(rust_res) + } + + /// compute phases + def _index_computephasesmapsets( + &self, + *args, + **_kw + ) -> PyResult { + let py_roots = args.get_item(py, 0).extract::(py)?; + let rust_res = self.inner_computephasesmapsets(py, py_roots)?; + Ok(rust_res) + } + + /// reachableroots + def _index_reachableroots2(&self, *args, **_kw) -> PyResult { + let rust_res = self.inner_reachableroots2( + py, + UncheckedRevision(args.get_item(py, 0).extract(py)?), + args.get_item(py, 1), + args.get_item(py, 2), + args.get_item(py, 3).extract(py)?, + )?; + Ok(rust_res) + } + + /// get head revisions + def _index_headrevs(&self, *args, **_kw) -> PyResult { + let (filtered_revs, stop_rev) = match &args.len(py) { + 0 => Ok((py.None(), py.None())), + 1 => Ok((args.get_item(py, 0), py.None())), + 2 => Ok((args.get_item(py, 0), args.get_item(py, 1))), + _ => Err(PyErr::new::(py, "too many arguments")), + }?; + self.inner_headrevs(py, &filtered_revs, &stop_rev) + } + + /// get head nodeids + def _index_head_node_ids(&self) -> PyResult { + 
+
+    /// get head nodeids
+    def _index_head_node_ids(&self) -> PyResult<PyObject> {
+        let rust_res = self.inner_head_node_ids(py)?;
+        Ok(rust_res)
+    }
+
+    /// get diff in head revisions
+    def _index_headrevsdiff(&self, *args, **_kw) -> PyResult<PyObject> {
+        let rust_res = self.inner_headrevsdiff(
+            py,
+            &args.get_item(py, 0),
+            &args.get_item(py, 1))?;
+        Ok(rust_res)
+    }
+
+    /// True if the object is a snapshot
+    def _index_issnapshot(&self, *args, **_kw) -> PyResult<bool> {
+        let rev = UncheckedRevision(args.get_item(py, 0).extract(py)?);
+        self.inner_issnapshot(py, rev)
+    }
+
+    /// Gather snapshot data in a cache dict
+    def _index_findsnapshots(&self, *args, **_kw) -> PyResult<PyObject> {
+        let index = &self.inner(py).borrow().index;
+        let cache: PyDict = args.get_item(py, 0).extract(py)?;
+        // this method operates by setting new values in the cache,
+        // hence we will compare results by letting the C implementation
+        // operate over a deepcopy of the cache, and finally compare both
+        // caches.
+        let c_cache = PyDict::new(py);
+        for (k, v) in cache.items(py) {
+            c_cache.set_item(py, k, PySet::new(py, v)?)?;
+        }
+
+        let start_rev = UncheckedRevision(args.get_item(py, 1).extract(py)?);
+        let end_rev = UncheckedRevision(args.get_item(py, 2).extract(py)?);
+        let mut cache_wrapper = PySnapshotsCache { py, dict: cache };
+        index.find_snapshots(
+            start_rev,
+            end_rev,
+            &mut cache_wrapper,
+        ).map_err(|_| revlog_error(py))?;
+        Ok(py.None())
+    }
+
+    /// determine revisions with deltas to reconstruct fulltext
+    def _index_deltachain(&self, *args, **_kw) -> PyResult<PyObject> {
+        let index = &self.inner(py).borrow().index;
+        let rev = args.get_item(py, 0).extract::<BaseRevision>(py)?.into();
+        let stop_rev =
+            args.get_item(py, 1).extract::<Option<BaseRevision>>(py)?;
+        let rev = index.check_revision(rev).ok_or_else(|| {
+            nodemap_error(py, NodeMapError::RevisionNotInIndex(rev))
+        })?;
+        let stop_rev = if let Some(stop_rev) = stop_rev {
+            let stop_rev = UncheckedRevision(stop_rev);
+            Some(index.check_revision(stop_rev).ok_or_else(|| {
+                nodemap_error(py, NodeMapError::RevisionNotInIndex(stop_rev))
+            })?)
+        } else {
+            None
+        };
+        let using_general_delta = args.get_item(py, 2)
+            .extract::<Option<u32>>(py)?
+            .map(|i| i != 0);
+        let (chain, stopped) = index.delta_chain(
+            rev, stop_rev, using_general_delta
+        ).map_err(|e| {
+            PyErr::new::<ValueError, _>(py, e.to_string())
+        })?;
+
+        let chain: Vec<_> = chain.into_iter().map(|r| r.0).collect();
+        Ok(
+            PyTuple::new(
+                py,
+                &[
+                    chain.into_py_object(py).into_object(),
+                    stopped.into_py_object(py).into_object()
+                ]
+            ).into_object()
+        )
+    }
+
+    /// slice planned chunk read to reach a density threshold
+    def _index_slicechunktodensity(&self, *args, **_kw) -> PyResult<PyObject> {
+        let rust_res = self.inner_slicechunktodensity(
+            py,
+            args.get_item(py, 0),
+            args.get_item(py, 1).extract(py)?,
+            args.get_item(py, 2).extract(py)?
+        )?;
+        Ok(rust_res)
+    }
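+    // E.g. asked to read revs 0..=100 with a density target of 0.5, the
+    // slicing above may hand back [0..=30] and [60..=100] when the bytes
+    // for 31..=59 would mostly be dead weight (values purely illustrative).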
+
+    def _index___len__(&self) -> PyResult<usize> {
+        self.len(py)
+    }
+
+    def _index___getitem__(&self, key: PyObject) -> PyResult<PyObject> {
+        let rust_res = self.inner_getitem(py, key.clone_ref(py))?;
+        Ok(rust_res)
+    }
+
+    def _index___contains__(&self, item: PyObject) -> PyResult<bool> {
+        // ObjectProtocol does not seem to provide contains(), so
+        // this is an equivalent implementation of the index_contains()
+        // defined in revlog.c
+        match item.extract::<BaseRevision>(py) {
+            Ok(rev) => {
+                Ok(rev >= -1 && rev < self.len(py)? as BaseRevision)
+            }
+            Err(_) => {
+                let item_bytes: PyBytes = item.extract(py)?;
+                let rust_res = self._index_has_node(py, item_bytes)?;
+                Ok(rust_res)
+            }
+        }
+    }
+
+    def _index_nodemap_data_all(&self) -> PyResult<PyBytes> {
+        self.inner_nodemap_data_all(py)
+    }
+
+    def _index_nodemap_data_incremental(&self) -> PyResult<PyObject> {
+        self.inner_nodemap_data_incremental(py)
+    }
+
+    def _index_update_nodemap_data(
+        &self,
+        docket: PyObject,
+        nm_data: PyObject
+    ) -> PyResult<PyObject> {
+        self.inner_update_nodemap_data(py, docket, nm_data)
+    }
+
+    @property
+    def _index_entry_size(&self) -> PyResult<PyInt> {
+        let rust_res: PyInt = INDEX_ENTRY_SIZE.to_py_object(py);
+        Ok(rust_res)
+    }
+
+    @property
+    def _index_rust_ext_compat(&self) -> PyResult<PyInt> {
+        // compatibility shim for callers that still probe the C index;
+        // it will be removed entirely once the detangling from the old
+        // `self.cindex` is finished
+        let rust_res: PyInt = 1.to_py_object(py);
+        Ok(rust_res)
+    }
+
+    @property
+    def _index_is_rust(&self) -> PyResult<PyBool> {
+        Ok(false.to_py_object(py))
+    }
+
+});
+
+/// Forwarded index methods
+impl InnerRevlog {
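+    // The `_index_*` methods in the py_class block above are looked up by
+    // name from the Python side, so their exact names are part of the
+    // interface: renaming one silently breaks the Python-level forwarding.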
     fn len(&self, py: Python) -> PyResult<usize> {
-        let rust_index_len = self.index(py).borrow().len();
+        let rust_index_len = self.inner(py).borrow().index.len();
         Ok(rust_index_len)
     }
-
     /// This is scaffolding at this point, but it could also become
     /// a way to start a persistent nodemap or perform a
     /// vacuum / repack operation
@@ -660,11 +1500,11 @@
         py: Python,
         nt: &mut CoreNodeTree,
     ) -> PyResult<PyObject> {
-        let index = self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         for r in 0..self.len(py)? {
             let rev = Revision(r as BaseRevision);
             // in this case node() won't ever return None
-            nt.insert(&*index, index.node(rev).unwrap(), rev)
+            nt.insert(index, index.node(rev).expect("node should exist"), rev)
                 .map_err(|e| nodemap_error(py, e))?
         }
         Ok(py.None())
@@ -685,7 +1525,11 @@
 
     /// Returns the full nodemap bytes to be written as-is to disk
     fn inner_nodemap_data_all(&self, py: Python) -> PyResult<PyBytes> {
-        let nodemap = self.get_nodetree(py)?.borrow_mut().take().unwrap();
+        let nodemap = self
+            .get_nodetree(py)?
+            .borrow_mut()
+            .take()
+            .expect("nodetree should exist");
         let (readonly, bytes) = nodemap.into_readonly_and_added_bytes();
 
         // If there's anything readonly, we need to build the data again from
@@ -718,7 +1562,11 @@
             None => return Ok(py.None()),
         };
 
-        let node_tree = self.get_nodetree(py)?.borrow_mut().take().unwrap();
+        let node_tree = self
+            .get_nodetree(py)?
+            .borrow_mut()
+            .take()
+            .expect("nodetree should exist");
         let masked_blocks = node_tree.masked_readonly_blocks();
         let (_, data) = node_tree.into_readonly_and_added_bytes();
         let changed = masked_blocks * std::mem::size_of::<Block>();
@@ -748,7 +1596,7 @@
             .extract::<BaseRevision>(py)?
             .into();
         self.docket(py).borrow_mut().replace(docket.clone_ref(py));
-        let idx = self.index(py).borrow();
+        let idx = &self.inner(py).borrow().index;
         let data_tip = idx.check_revision(data_tip).ok_or_else(|| {
             nodemap_error(py, NodeMapError::RevisionNotInIndex(data_tip))
         })?;
@@ -757,7 +1605,7 @@
         for r in (data_tip.0 + 1)..current_tip as BaseRevision {
             let rev = Revision(r);
             // in this case node() won't ever return None
-            nt.insert(&*idx, idx.node(rev).unwrap(), rev)
+            nt.insert(idx, idx.node(rev).expect("node should exist"), rev)
                 .map_err(|e| nodemap_error(py, e))?
         }
@@ -767,7 +1615,7 @@
     }
 
     fn inner_getitem(&self, py: Python, key: PyObject) -> PyResult<PyObject> {
-        let idx = self.index(py).borrow();
+        let idx = &self.inner(py).borrow().index;
         Ok(match key.extract::<BaseRevision>(py) {
             Ok(key_as_int) => {
                 let entry_params = if key_as_int == NULL_REVISION.0 {
@@ -787,15 +1635,17 @@
                 revision_data_params_to_py_tuple(py, entry_params)
                     .into_object()
             }
-            _ => self.get_rev(py, key.extract::<PyBytes>(py)?)?.map_or_else(
-                || py.None(),
-                |py_rev| py_rev.into_py_object(py).into_object(),
-            ),
+            _ => self
+                ._index_get_rev(py, key.extract::<PyBytes>(py)?)?
+                .map_or_else(
+                    || py.None(),
+                    |py_rev| py_rev.into_py_object(py).into_object(),
+                ),
         })
     }
 
     fn inner_head_node_ids(&self, py: Python) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
 
         // We don't use the shortcut here, as it's actually slower to loop
         // through the cached `PyList` than to re-do the whole computation for
@@ -827,7 +1677,7 @@
         filtered_revs: &PyObject,
         stop_rev: &PyObject,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let stop_rev = if stop_rev.is_none(py) {
             None
         } else {
@@ -883,7 +1733,7 @@
     }
 
     fn check_revision(
-        index: &hg::index::Index,
+        index: &Index,
         rev: UncheckedRevision,
         py: Python,
    ) -> PyResult<Revision> {
@@ -900,7 +1750,7 @@
     ) -> PyResult<PyObject> {
         let begin = begin.extract::<BaseRevision>(py)?;
         let end = end.extract::<BaseRevision>(py)?;
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let begin =
             Self::check_revision(index, UncheckedRevision(begin - 1), py)?;
         let end = Self::check_revision(index, UncheckedRevision(end - 1), py)?;
@@ -919,7 +1769,7 @@
         new_heads: &[Revision],
         py: Python<'_>,
     ) -> PyList {
-        let index = self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let as_vec: Vec<PyObject> = new_heads
             .iter()
             .map(|r| {
@@ -959,7 +1809,7 @@
         py: Python,
         py_revs: &PyTuple,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let revs: Vec<_> = rev_pyiter_collect(py, py_revs.as_object(), index)?;
         let as_vec: Vec<_> = index
             .ancestors(&revs)
@@ -975,7 +1825,7 @@
         py: Python,
         py_revs: &PyTuple,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let revs: Vec<_> = rev_pyiter_collect(py, py_revs.as_object(), index)?;
         let as_vec: Vec<_> = index
             .common_ancestor_heads(&revs)
@@ -991,7 +1841,7 @@
         py: Python,
         py_roots: PyDict,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let roots: Result<HashMap<Phase, Vec<Revision>>, PyErr> = py_roots
             .items_list(py)
             .iter(py)
@@ -1038,7 +1888,7 @@
         target_density: f64,
         min_gap_size: usize,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let revs: Vec<_> = rev_pyiter_collect(py, &revs, index)?;
         let as_nested_vec =
             index.slice_chunk_to_density(&revs, target_density, min_gap_size);
@@ -1070,7 +1920,7 @@
         roots: PyObject,
         include_path: bool,
     ) -> PyResult<PyObject> {
-        let index = &*self.index(py).borrow();
+        let index = &self.inner(py).borrow().index;
         let heads = rev_pyiter_collect_or_else(py, &heads, index, |_rev| {
             PyErr::new::<IndexError, _>(py, "head out of range")
         })?;
@@ -1092,6 +1942,96 @@
             .collect();
         Ok(PyList::new(py, &as_vec).into_object())
     }
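+    // Snapshot status depends on the delta chain, so `inner_issnapshot`
+    // below defers to the core inner revlog instead of recomputing it here
+    // from the index alone.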
+    fn inner_issnapshot(
+        &self,
+        py: Python,
+        rev: UncheckedRevision,
+    ) -> PyResult<bool> {
+        let inner = &self.inner(py).borrow();
+        let index = &self.inner(py).borrow().index;
+        let rev = index
+            .check_revision(rev)
+            .ok_or_else(|| rev_not_in_index(py, rev))?;
+        let result = inner.is_snapshot(rev).map_err(|e| {
+            PyErr::new::<ValueError, _>(py, e.to_string())
+        })?;
+        Ok(result)
+    }
+}
+
+impl InnerRevlog {
+    pub fn inner_new(
+        py: Python,
+        vfs_base: PyObject,
+        fncache: PyObject,
+        vfs_is_readonly: bool,
+        index_data: PyObject,
+        index_file: PyObject,
+        data_file: PyObject,
+        _sidedata_file: PyObject,
+        inline: bool,
+        data_config: PyObject,
+        delta_config: PyObject,
+        feature_config: PyObject,
+        _chunk_cache: PyObject,
+        _default_compression_header: PyObject,
+        revlog_type: usize,
+        use_persistent_nodemap: bool,
+    ) -> PyResult<Self> {
+        let index_file =
+            get_path_from_bytes(index_file.extract::<PyBytes>(py)?.data(py))
+                .to_owned();
+        let data_file =
+            get_path_from_bytes(data_file.extract::<PyBytes>(py)?.data(py))
+                .to_owned();
+        let revlog_type = RevlogType::try_from(revlog_type)
+            .map_err(|e| revlog_error_from_msg(py, e))?;
+        let data_config = extract_data_config(py, data_config, revlog_type)?;
+        let delta_config =
+            extract_delta_config(py, delta_config, revlog_type)?;
+        let feature_config =
+            extract_feature_config(py, feature_config, revlog_type)?;
+        let options = RevlogOpenOptions::new(
+            inline,
+            data_config,
+            delta_config,
+            feature_config,
+        );
+
+        // Safety: we keep the buffer around inside the class as `index_mmap`
+        let (buf, bytes) = unsafe { mmap_keeparound(py, index_data)? };
+        let index = Index::new(bytes, options.index_header())
+            .map_err(|e| revlog_error_from_msg(py, e))?;
+
+        let base = &vfs_base.extract::<PyBytes>(py)?;
+        let base = get_path_from_bytes(base.data(py)).to_owned();
+        let core = CoreInnerRevlog::new(
+            Box::new(FnCacheVfs::new(
+                base,
+                vfs_is_readonly,
+                Box::new(PyFnCache::new(fncache)),
+            )),
+            index,
+            index_file,
+            data_file,
+            data_config,
+            delta_config,
+            feature_config,
+        );
+        Self::create_instance(
+            py,
+            core,
+            RefCell::new(None),
+            RefCell::new(None),
+            RefCell::new(None),
+            RefCell::new(buf),
+            RefCell::new(None),
+            RefCell::new(None),
+            RefCell::new(None),
+            use_persistent_nodemap,
+            AtomicUsize::new(0),
+        )
+    }
+}
 
 py_class!(pub class NodeTree |py| {
@@ -1112,7 +2052,7 @@
     /// (generation-based guard), same as iterating on a `dict` that has
     /// been meanwhile mutated.
     def is_invalidated(&self) -> PyResult<bool> {
-        let leaked = self.index(py).borrow();
+        let leaked = &self.index(py).borrow();
         // Safety: we don't leak the "faked" reference out of `UnsafePyLeaked`
         let result = unsafe { leaked.try_borrow(py) };
         // two cases for result to be an error:
@@ -1124,7 +2064,7 @@
     }
 
     def insert(&self, rev: PyRevision) -> PyResult<PyObject> {
-        let leaked = self.index(py).borrow();
+        let leaked = &self.index(py).borrow();
         // Safety: we don't leak the "faked" reference out of `UnsafePyLeaked`
         let index = &*unsafe { leaked.try_borrow(py)? };
@@ -1136,7 +2076,7 @@
             return Err(rev_not_in_index(py, rev.into()))
         }
 
-        let entry = index.inner.get_entry(rev).unwrap();
+        let entry = index.inner.get_entry(rev).expect("entry should exist");
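+        // The revision was bounds-checked against the index right above,
+        // so a missing entry here would be a programming error: expect()
+        // is the documented crash, not a recoverable failure.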
         let mut nt = self.nt(py).borrow_mut();
         nt.insert(index, entry.hash(), rev).map_err(|e| nodemap_error(py, e))?;
@@ -1159,7 +2099,7 @@
         )?;
 
         let nt = self.nt(py).borrow();
-        let leaked = self.index(py).borrow();
+        let leaked = &self.index(py).borrow();
         // Safety: we don't leak the "faked" reference out of `UnsafePyLeaked`
         let index = &*unsafe { leaked.try_borrow(py)? };
@@ -1171,7 +2111,7 @@
 
     def shortest(&self, node: PyBytes) -> PyResult<usize> {
         let nt = self.nt(py).borrow();
-        let leaked = self.index(py).borrow();
+        let leaked = &self.index(py).borrow();
         // Safety: we don't leak the "faked" reference out of `UnsafePyLeaked`
         let idx = &*unsafe { leaked.try_borrow(py)? };
         match nt.unique_prefix_len_node(idx, &node_from_py_bytes(py, &node)?)
@@ -1183,6 +2123,120 @@
     }
 });
 
+fn panic_after_error(_py: Python) -> ! {
+    unsafe {
+        python3_sys::PyErr_Print();
+    }
+    panic!("Python API call failed");
+}
+
+/// # Safety
+///
+/// Don't call this. Its only caller is taken from `PyO3`.
+unsafe fn cast_from_owned_ptr_or_panic<T>(
+    py: Python,
+    p: *mut python3_sys::PyObject,
+) -> T
+where
+    T: cpython::PythonObjectWithCheckedDowncast,
+{
+    if p.is_null() {
+        panic_after_error(py);
+    } else {
+        PyObject::from_owned_ptr(py, p).cast_into(py).unwrap()
+    }
+}
+
+fn with_pybytes_buffer<F>(
+    py: Python,
+    len: usize,
+    init: F,
+) -> Result<PyBytes, RevlogError>
+where
+    F: FnOnce(
+        &mut dyn RevisionBuffer<Target = PyBytes>,
+    ) -> Result<(), RevlogError>,
+{
+    // Largely inspired by code in PyO3
+    // https://pyo3.rs/main/doc/pyo3/types/struct.pybytes#method.new_bound_with
+    unsafe {
+        let pyptr = python3_sys::PyBytes_FromStringAndSize(
+            std::ptr::null(),
+            len as python3_sys::Py_ssize_t,
+        );
+        let pybytes = cast_from_owned_ptr_or_panic::<PyBytes>(py, pyptr);
+        let buffer: *mut u8 = python3_sys::PyBytes_AsString(pyptr).cast();
+        debug_assert!(!buffer.is_null());
+        let mut rev_buf = PyRevisionBuffer::new(pybytes, buffer, len);
+        // Initialise the bytestring in init
+        // If init returns an Err, the buffer is deallocated by `pybytes`
+        init(&mut rev_buf).map(|_| rev_buf.finish())
+    }
+}
+
+/// Wrapper around a Python-provided buffer into which the revision contents
+/// will be written. Done for speed in order to save a large allocation + copy.
+struct PyRevisionBuffer {
+    py_bytes: PyBytes,
+    _buf: *mut u8,
+    len: usize,
+    current_buf: *mut u8,
+    current_len: usize,
+}
+
+impl PyRevisionBuffer {
+    /// # Safety
+    ///
+    /// `buf` should be the start of the allocated bytes of `bytes`, and `len`
+    /// exactly the length of said allocated bytes.
+    #[inline]
+    unsafe fn new(bytes: PyBytes, buf: *mut u8, len: usize) -> Self {
+        Self {
+            py_bytes: bytes,
+            _buf: buf,
+            len,
+            current_len: 0,
+            current_buf: buf,
+        }
+    }
+
+    /// Number of bytes that have been copied to. Will be different to the
+    /// total allocated length of the buffer unless the revision is done being
+    /// written.
+    #[inline]
+    fn current_len(&self) -> usize {
+        self.current_len
+    }
+}
+
+impl RevisionBuffer for PyRevisionBuffer {
+    type Target = PyBytes;
+
+    #[inline]
+    fn extend_from_slice(&mut self, slice: &[u8]) {
+        assert!(self.current_len + slice.len() <= self.len);
+        unsafe {
+            // We cannot use `copy_from_nonoverlapping` since it's *possible*
+            // to create a slice from the same Python memory region using
+            // [`PyBytesDeref`]. LLVM probably optimizes this away anyway.
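+            // (`copy_from` is the memmove-flavored primitive: unlike
+            // `copy_from_nonoverlapping`, it remains defined even if the
+            // source and destination regions alias.)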
+            self.current_buf.copy_from(slice.as_ptr(), slice.len());
+            self.current_buf = self.current_buf.add(slice.len());
+        }
+        self.current_len += slice.len()
+    }
+
+    #[inline]
+    fn finish(self) -> Self::Target {
+        // catch unzeroed bytes before they become undefined behavior
+        assert_eq!(
+            self.current_len(),
+            self.len,
+            "not enough bytes read for revision"
+        );
+        self.py_bytes
+    }
+}
+
 fn revlog_error(py: Python) -> PyErr {
     match py
         .import("mercurial.error")
@@ -1196,21 +2250,6 @@
     }
 }
 
-fn revlog_error_with_msg(py: Python, msg: &[u8]) -> PyErr {
-    match py
-        .import("mercurial.error")
-        .and_then(|m| m.get(py, "RevlogError"))
-    {
-        Err(e) => e,
-        Ok(cls) => PyErr::from_instance(
-            py,
-            cls.call(py, (PyBytes::new(py, msg),), None)
-                .ok()
-                .into_py_object(py),
-        ),
-    }
-}
-
 fn graph_error(py: Python, _err: hg::GraphError) -> PyErr {
     // ParentOutOfRange is currently the only alternative
     // in `hg::GraphError`. The C index always raises this simple ValueError.
@@ -1250,8 +2289,8 @@
     m.add(py, "__package__", package)?;
     m.add(py, "__doc__", "RevLog - Rust implementations")?;
 
-    m.add_class::<Index>(py)?;
     m.add_class::<NodeTree>(py)?;
+    m.add_class::<InnerRevlog>(py)?;
 
     let sys = PyModule::import(py, "sys")?;
     let sys_modules: PyDict = sys.get(py, "modules")?.extract(py)?;
diff -r 854e2b9bca57 -r 51a350a22d0c rust/hg-cpython/src/vfs.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-cpython/src/vfs.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -0,0 +1,303 @@
+use std::{
+    cell::Cell,
+    fs::File,
+    io::Error,
+    os::fd::{AsRawFd, FromRawFd},
+    path::{Path, PathBuf},
+};
+
+use cpython::{
+    ObjectProtocol, PyBytes, PyClone, PyDict, PyErr, PyInt, PyObject,
+    PyResult, PyTuple, Python, PythonObject, ToPyObject,
+};
+use hg::{
+    errors::{HgError, IoResultExt},
+    exit_codes,
+    utils::files::{get_bytes_from_path, get_path_from_bytes},
+    vfs::{Vfs, VfsFile},
+};
+
+/// Wrapper around a Python VFS object to call back into Python from `hg-core`.
+pub struct PyVfs {
+    inner: PyObject,
+    base: PathBuf,
+}
+
+impl Clone for PyVfs {
+    fn clone(&self) -> Self {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        Self {
+            inner: self.inner.clone_ref(py),
+            base: self.base.clone(),
+        }
+    }
+}
+
+impl PyVfs {
+    pub fn new(
+        _py: Python,
+        py_vfs: PyObject,
+        base: PathBuf,
+    ) -> PyResult<Self> {
+        Ok(Self {
+            inner: py_vfs,
+            base,
+        })
+    }
+
+    fn inner_open(
+        &self,
+        filename: &Path,
+        create: bool,
+        check_ambig: bool,
+        atomic_temp: bool,
+        write: bool,
+    ) -> Result<(File, Option<PathBuf>), HgError> {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        let mode = if atomic_temp {
+            PyBytes::new(py, b"w")
+        } else if create {
+            PyBytes::new(py, b"w+")
+        } else if write {
+            PyBytes::new(py, b"r+")
+        } else {
+            PyBytes::new(py, b"rb")
+        };
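+        // Assumed mapping onto Python's `open` modes: "w" truncates the
+        // atomic temp file, "w+" creates/truncates, "r+" opens for
+        // read/write without truncating, "rb" is plain read-only.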
+        let res = self.inner.call(
+            py,
+            (
+                PyBytes::new(py, &get_bytes_from_path(filename)),
+                mode,
+                atomic_temp,
+                check_ambig,
+            ),
+            None,
+        );
+        match res {
+            Ok(tup) => {
+                let tup = tup
+                    .extract::<PyTuple>(py)
+                    .map_err(|e| vfs_error("vfs did not return a tuple", e))?;
+                let fileno = tup.get_item(py, 0).extract(py).map_err(|e| {
+                    vfs_error("vfs did not return a valid fileno", e)
+                })?;
+                let temp_name = tup.get_item(py, 1);
+                // Safety: this must be a valid owned file descriptor, and
+                // Python has just given it to us, it will only exist here now
+                let file = unsafe { File::from_raw_fd(fileno) };
+                let temp_name = if atomic_temp {
+                    Some(
+                        get_path_from_bytes(
+                            temp_name
+                                .extract::<PyBytes>(py)
+                                .map_err(|e| vfs_error("invalid tempname", e))?
+                                .data(py),
+                        )
+                        .to_owned(),
+                    )
+                } else {
+                    None
+                };
+                Ok((file, temp_name))
+            }
+            Err(mut e) => {
+                // TODO surely there is a better way of comparing
+                if e.instance(py).get_type(py).name(py) == "FileNotFoundError"
+                {
+                    return Err(HgError::IoError {
+                        error: Error::new(
+                            std::io::ErrorKind::NotFound,
+                            e.instance(py).to_string(),
+                        ),
+                        context: hg::errors::IoErrorContext::ReadingFile(
+                            filename.to_owned(),
+                        ),
+                    });
+                }
+                Err(vfs_error("failed to call opener", e))
+            }
+        }
+    }
+}
+
+fn vfs_error(reason: impl Into<String>, mut error: PyErr) -> HgError {
+    let gil = &Python::acquire_gil();
+    let py = gil.python();
+    HgError::abort(
+        format!("{}: {}", reason.into(), error.instance(py)),
+        exit_codes::ABORT,
+        None,
+    )
+}
+
+py_class!(pub class PyFile |py| {
+    data number: Cell<i32>;
+
+    def fileno(&self) -> PyResult<PyInt> {
+        Ok(self.number(py).get().to_py_object(py))
+    }
+});
+
+impl Vfs for PyVfs {
+    fn open(&self, filename: &Path) -> Result<VfsFile, HgError> {
+        self.inner_open(filename, false, false, false, true)
+            .map(|(f, _)| VfsFile::normal(f, filename.to_owned()))
+    }
+
+    fn open_read(&self, filename: &Path) -> Result<VfsFile, HgError> {
+        self.inner_open(filename, false, false, false, false)
+            .map(|(f, _)| VfsFile::normal(f, filename.to_owned()))
+    }
+
+    fn open_check_ambig(&self, filename: &Path) -> Result<VfsFile, HgError> {
+        self.inner_open(filename, false, true, false, true)
+            .map(|(f, _)| VfsFile::normal(f, filename.to_owned()))
+    }
+
+    fn create(
+        &self,
+        filename: &Path,
+        check_ambig: bool,
+    ) -> Result<VfsFile, HgError> {
+        self.inner_open(filename, true, check_ambig, false, true)
+            .map(|(f, _)| VfsFile::normal(f, filename.to_owned()))
+    }
+
+    fn create_atomic(
+        &self,
+        filename: &Path,
+        check_ambig: bool,
+    ) -> Result<VfsFile, HgError> {
+        self.inner_open(filename, true, false, true, true).map(
+            |(fp, temp_name)| {
+                VfsFile::Atomic(hg::vfs::AtomicFile::from_file(
+                    fp,
+                    check_ambig,
+                    temp_name.expect("temp name should exist"),
+                    filename.to_owned(),
+                ))
+            },
+        )
+    }
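+    // For atomic writes the Python opener returns the temp file's name,
+    // so the final rename into place (plus the optional ambiguity check)
+    // can be performed when the `AtomicFile` is closed on the Rust side.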
+
+    fn file_size(&self, file: &VfsFile) -> Result<u64, HgError> {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        let raw_fd = file.as_raw_fd();
+        let py_fd = PyFile::create_instance(py, Cell::new(raw_fd))
+            .expect("create_instance cannot fail");
+        let fstat = self
+            .inner
+            .call_method(py, "fstat", (py_fd,), None)
+            .map_err(|e| {
+                vfs_error(format!("failed to fstat fd '{}'", raw_fd), e)
+            })?;
+        fstat
+            .getattr(py, "st_size")
+            .map(|v| {
+                v.extract(py).map_err(|e| {
+                    vfs_error(format!("invalid size for fd '{}'", raw_fd), e)
+                })
+            })
+            .map_err(|e| {
+                vfs_error(format!("failed to get size of fd '{}'", raw_fd), e)
+            })?
+    }
+
+    fn exists(&self, filename: &Path) -> bool {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        self.inner
+            .call_method(
+                py,
+                "exists",
+                (PyBytes::new(py, &get_bytes_from_path(filename)),),
+                None,
+            )
+            .unwrap_or_else(|_| false.into_py_object(py).into_object())
+            .extract(py)
+            .unwrap()
+    }
+
+    fn unlink(&self, filename: &Path) -> Result<(), HgError> {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        if let Err(e) = self.inner.call_method(
+            py,
+            "unlink",
+            (PyBytes::new(py, &get_bytes_from_path(filename)),),
+            None,
+        ) {
+            return Err(vfs_error(
+                format!("failed to unlink '{}'", filename.display()),
+                e,
+            ));
+        }
+        Ok(())
+    }
+
+    fn rename(
+        &self,
+        from: &Path,
+        to: &Path,
+        check_ambig: bool,
+    ) -> Result<(), HgError> {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        let kwargs = PyDict::new(py);
+        kwargs
+            .set_item(py, "checkambig", check_ambig)
+            .map_err(|e| vfs_error("dict setitem failed", e))?;
+        if let Err(e) = self.inner.call_method(
+            py,
+            "rename",
+            (
+                PyBytes::new(py, &get_bytes_from_path(from)),
+                PyBytes::new(py, &get_bytes_from_path(to)),
+            ),
+            Some(&kwargs),
+        ) {
+            let msg = format!(
+                "failed to rename '{}' to '{}'",
+                from.display(),
+                to.display()
+            );
+            return Err(vfs_error(msg, e));
+        }
+        Ok(())
+    }
+
+    fn copy(&self, from: &Path, to: &Path) -> Result<(), HgError> {
+        let gil = &Python::acquire_gil();
+        let py = gil.python();
+        let from = self
+            .inner
+            .call_method(
+                py,
+                "join",
+                (PyBytes::new(py, &get_bytes_from_path(from)),),
+                None,
+            )
+            .unwrap();
+        let from = from.extract::<PyBytes>(py).unwrap();
+        let from = get_path_from_bytes(from.data(py));
+        let to = self
+            .inner
+            .call_method(
+                py,
+                "join",
+                (PyBytes::new(py, &get_bytes_from_path(to)),),
+                None,
+            )
+            .unwrap();
+        let to = to.extract::<PyBytes>(py).unwrap();
+        let to = get_path_from_bytes(to.data(py));
+        std::fs::copy(from, to).when_writing_file(to)?;
+        Ok(())
+    }
+
+    fn base(&self) -> &Path {
+        &self.base
+    }
+}
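+
+// Note: every callback above re-acquires the GIL on entry, since hg-core
+// may call into this VFS from threads that do not currently hold it;
+// acquiring an already-held GIL should be a cheap reentrant operation.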
diff -r 854e2b9bca57 -r 51a350a22d0c rust/rhg/src/commands/debugdata.rs
--- a/rust/rhg/src/commands/debugdata.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/rhg/src/commands/debugdata.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -2,7 +2,7 @@
 use clap::Arg;
 use clap::ArgGroup;
 use hg::operations::debug_data;
-use hg::RevlogType;
+use hg::revlog::RevlogType;
 
 pub const HELP_TEXT: &str = "
 Dump the contents of a data file revision
diff -r 854e2b9bca57 -r 51a350a22d0c rust/rhg/src/commands/status.rs
--- a/rust/rhg/src/commands/status.rs	Wed Nov 20 15:38:57 2024 +0100
+++ b/rust/rhg/src/commands/status.rs	Wed Nov 20 15:53:19 2024 +0100
@@ -20,20 +20,22 @@
 use hg::errors::{HgError, IoResultExt};
 use hg::filepatterns::parse_pattern_args;
 use hg::lock::LockError;
-use hg::manifest::Manifest;
 use hg::matchers::{AlwaysMatcher, IntersectionMatcher};
 use hg::repo::Repo;
+use hg::revlog::manifest::Manifest;
+use hg::revlog::options::{default_revlog_options, RevlogOpenOptions};
+use hg::revlog::RevlogType;
 use hg::utils::debug::debug_wait_for_file;
 use hg::utils::files::{
     get_bytes_from_os_str, get_bytes_from_os_string, get_path_from_bytes,
 };
 use hg::utils::hg_path::{hg_path_to_path_buf, HgPath};
+use hg::DirstateStatus;
+use hg::PatternFileWarning;
 use hg::Revision;
 use hg::StatusError;
 use hg::StatusOptions;
 use hg::{self, narrow, sparse};
-use hg::{DirstateStatus, RevlogOpenOptions};
-use hg::{PatternFileWarning, RevlogType};
 use log::info;
 use rayon::prelude::*;
 use std::borrow::Cow;
@@ -383,8 +385,11 @@
     })?;
     let working_directory_vfs = repo.working_directory_vfs();
     let store_vfs = repo.store_vfs();
-    let filelog_open_options =
-        repo.default_revlog_options(RevlogType::Filelog)?;
+    let filelog_open_options = default_revlog_options(
+        repo.config(),
+        repo.requirements(),
+        RevlogType::Filelog,
+    )?;
     let res: Vec<_> = take(&mut ds_status.unsure)
         .into_par_iter()
         .map(|to_check| {
@@ -785,7 +790,7 @@
     if entry_flags.map(|f| f.into()) != fs_flags {
         return Ok(UnsureOutcome::Modified);
     }
-    let filelog = hg::filelog::Filelog::open_vfs(
+    let filelog = hg::revlog::filelog::Filelog::open_vfs(
         store_vfs,
         hg_path,
         revlog_open_options,
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-bundle.t
--- a/tests/test-bundle.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-bundle.t	Wed Nov 20 15:53:19 2024 +0100
@@ -294,7 +294,7 @@
   packed.hg: size=2865, sha1=353d10311f4befa195d9a1ca4b8e26518115c702 (no-rust !)
   0000: 48 47 53 31 55 4e 00 00 00 00 00 00 00 06 00 00 |HGS1UN..........| (no-rust !)
   0010: 00 00 00 00 0a 69 00 3b 67 65 6e 65 72 61 6c 64 |.....i.;generald| (no-rust !)
-  packed.hg: size=3181, sha1=b202787710a1c109246554be589506cd2916acb7 (rust !)
+  packed.hg: size=3181, sha1=3e865df183d388222969c5b19c844dd8697c85c6 (rust !)
   0000: 48 47 53 31 55 4e 00 00 00 00 00 00 00 09 00 00 |HGS1UN..........| (rust !)
   0010: 00 00 00 00 0b 67 00 3b 67 65 6e 65 72 61 6c 64 |.....g.;generald| (rust !)
   0020: 65 6c 74 61 2c 72 65 76 6c 6f 67 2d 63 6f 6d 70 |elta,revlog-comp|
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-contrib-perf.t
--- a/tests/test-contrib-perf.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-contrib-perf.t	Wed Nov 20 15:53:19 2024 +0100
@@ -283,8 +283,15 @@
 #if reporevlogstore
   $ hg perfrevlogrevisions .hg/store/data/a.i
 #endif
+
+#if no-rust
+These cannot be tested with the Rust revlog because they are highly invasive
+and expect a certain structure from the Python code.
+
   $ hg perfrevlogrevision -m 0
   $ hg perfrevlogchunks -c
+#endif
+
   $ hg perfrevrange
   $ hg perfrevset 'all()'
   $ hg perfstartup
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-journal-exists.t
--- a/tests/test-journal-exists.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-journal-exists.t	Wed Nov 20 15:53:19 2024 +0100
@@ -45,12 +45,21 @@
 
   $ hg bundle -qa repo.hg
   $ chmod -w foo/.hg/store/00changelog.i
 
+#if rust
+  $ hg -R foo unbundle repo.hg
+  adding changesets
+  transaction abort!
+  rollback completed
+  abort: abort: when writing $TESTTMP/repo/foo/.hg/store/00changelog.i: $EACCES$
+  [50]
+#else
   $ hg -R foo unbundle repo.hg
   adding changesets
   transaction abort!
   rollback completed
   abort: $EACCES$: '$TESTTMP/repo/foo/.hg/store/.00changelog.i-*' (glob)
   [255]
+#endif
 
   $ if test -f foo/.hg/store/journal; then echo 'journal exists :-('; fi
 #endif
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-permissions.t
--- a/tests/test-permissions.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-permissions.t	Wed Nov 20 15:53:19 2024 +0100
@@ -34,10 +34,16 @@
 
   $ chmod -w .hg/store/data/a.i
   $ echo barber > a
+#if rust
+  $ hg commit -m "2"
+  abort: abort: when writing $TESTTMP/t/.hg/store/data/a.i: $EACCES$
+  [50]
+#else
   $ hg commit -m "2"
   trouble committing a!
   abort: $EACCES$: '$TESTTMP/t/.hg/store/data/a.i'
   [255]
+#endif
 
   $ chmod -w .
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-remotefilelog-bgprefetch.t
--- a/tests/test-remotefilelog-bgprefetch.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-remotefilelog-bgprefetch.t	Wed Nov 20 15:53:19 2024 +0100
@@ -33,8 +33,8 @@
   transferred 776 bytes in * seconds (*/sec) (glob) (no-zstd !)
   3 files to transfer, 784 bytes of data (zstd no-rust !)
   transferred 784 bytes in * seconds (*/sec) (glob) (zstd no-rust !)
-  5 files to transfer, 910 bytes of data (rust !)
-  transferred 910 bytes in * seconds (*/sec) (glob) (rust !)
+  5 files to transfer, 911 bytes of data (rust !)
+  transferred 911 bytes in * seconds (*/sec) (glob) (rust !)
   searching for changes
   no changes found
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-remotefilelog-prefetch.t
--- a/tests/test-remotefilelog-prefetch.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-remotefilelog-prefetch.t	Wed Nov 20 15:53:19 2024 +0100
@@ -26,8 +26,8 @@
   transferred 528 bytes in * seconds (* */sec) (glob) (no-zstd !)
   3 files to transfer, 532 bytes of data (zstd no-rust !)
   transferred 532 bytes in * seconds (* */sec) (glob) (zstd no-rust !)
-  5 files to transfer, 658 bytes of data (zstd rust !)
-  transferred 658 bytes in * seconds (*/sec) (glob) (zstd rust !)
+  5 files to transfer, 659 bytes of data (zstd rust !)
+  transferred 659 bytes in * seconds (*/sec) (glob) (zstd rust !)
   searching for changes
   no changes found
   $ cd shallow
@@ -172,8 +172,8 @@
   transferred 528 bytes in * seconds * (glob) (no-zstd !)
   3 files to transfer, 532 bytes of data (zstd no-rust !)
   transferred 532 bytes in * seconds (* */sec) (glob) (zstd no-rust !)
-  5 files to transfer, 658 bytes of data (zstd rust !)
-  transferred 658 bytes in * seconds (*/sec) (glob) (zstd rust !)
+  5 files to transfer, 659 bytes of data (zstd rust !)
+  transferred 659 bytes in * seconds (*/sec) (glob) (zstd rust !)
   searching for changes
   no changes found
   updating to branch default
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-repo-compengines.t
--- a/tests/test-repo-compengines.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-repo-compengines.t	Wed Nov 20 15:53:19 2024 +0100
@@ -194,9 +194,11 @@
   > done
 
   $ $RUNTESTDIR/f -s zstd-*/.hg/store/data/*
-  zstd-level-1/.hg/store/data/a.i: size=4114
+  zstd-level-1/.hg/store/data/a.i: size=4114 (no-rust !)
+  zstd-level-1/.hg/store/data/a.i: size=4112 (rust !)
   zstd-level-22/.hg/store/data/a.i: size=4091
-  zstd-level-default/\.hg/store/data/a\.i: size=(4094|4102) (re)
+  zstd-level-default/\.hg/store/data/a\.i: size=(4094|4102) (re) (no-rust !)
+  zstd-level-default/.hg/store/data/a.i: size=4097 (rust !)
 
 Test error cases
 
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-rust-revlog.py
--- a/tests/test-rust-revlog.py	Wed Nov 20 15:38:57 2024 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,90 +0,0 @@
-import struct
-import unittest
-
-from mercurial.node import hex
-
-try:
-    from mercurial import rustext
-
-    rustext.__name__  # trigger immediate actual import
-except ImportError:
-    rustext = None
-else:
-    from mercurial.rustext import revlog
-
-    # this would fail already without appropriate ancestor.__package__
-    from mercurial.rustext.ancestor import LazyAncestors
-
-from mercurial.testing import revlog as revlogtesting
-
-header = struct.unpack(">I", revlogtesting.data_non_inlined[:4])[0]
-
-
-@unittest.skipIf(
-    rustext is None,
-    "rustext module revlog relies on is not available",
-)
-class RustRevlogIndexTest(revlogtesting.RevlogBasedTestBase):
-    def test_heads(self):
-        idx = self.parseindex()
-        rustidx = revlog.Index(revlogtesting.data_non_inlined, header)
-        self.assertEqual(rustidx.headrevs(), idx.headrevs())
-
-    def test_len(self):
-        idx = self.parseindex()
-        rustidx = revlog.Index(revlogtesting.data_non_inlined, header)
-        self.assertEqual(len(rustidx), len(idx))
-
-    def test_ancestors(self):
-        rustidx = revlog.Index(revlogtesting.data_non_inlined, header)
-        lazy = LazyAncestors(rustidx, [3], 0, True)
-        # we have two more references to the index:
-        # - in its inner iterator for __contains__ and __bool__
-        # - in the LazyAncestors instance itself (to spawn new iterators)
-        self.assertTrue(2 in lazy)
-        self.assertTrue(bool(lazy))
-        self.assertEqual(list(lazy), [3, 2, 1, 0])
-        # a second time to validate that we spawn new iterators
-        self.assertEqual(list(lazy), [3, 2, 1, 0])
-
-        # let's check bool for an empty one
-        self.assertFalse(LazyAncestors(rustidx, [0], 0, False))
-
-
-@unittest.skipIf(
-    rustext is None,
-    "rustext module revlog relies on is not available",
-)
-class RustRevlogNodeTreeClassTest(revlogtesting.RustRevlogBasedTestBase):
-    def test_standalone_nodetree(self):
-        idx = self.parserustindex()
-        nt = revlog.NodeTree(idx)
-        for i in range(4):
-            nt.insert(i)
-
-        bin_nodes = [entry[7] for entry in idx]
-        hex_nodes = [hex(n) for n in bin_nodes]
-
-        for i, node in enumerate(hex_nodes):
-            self.assertEqual(nt.prefix_rev_lookup(node), i)
-            self.assertEqual(nt.prefix_rev_lookup(node[:5]), i)
-
-        # all 4 revisions in idx (standard data set) have different
-        # first nybbles in their Node IDs,
-        # hence `nt.shortest()` should return 1 for them, except when
-        # the leading nybble is 0 (ambiguity with NULL_NODE)
-        for i, (bin_node, hex_node) in enumerate(zip(bin_nodes, hex_nodes)):
-            shortest = nt.shortest(bin_node)
-            expected = 2 if hex_node[0] == ord('0') else 1
-            self.assertEqual(shortest, expected)
-            self.assertEqual(nt.prefix_rev_lookup(hex_node[:shortest]), i)
-
-        # test invalidation (generation poisoning) detection
-        del idx[3]
-        self.assertTrue(nt.is_invalidated())
-
-
-if __name__ == '__main__':
-    import silenttestrunner
-
-    silenttestrunner.main(__name__)
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-treemanifest.t
--- a/tests/test-treemanifest.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-treemanifest.t	Wed Nov 20 15:53:19 2024 +0100
@@ -869,11 +869,13 @@
   > done
   $ hg ci -m 'have some content'
   $ f -s .hg/store/00manifest.*
-  .hg/store/00manifest.i: size=798 (no-pure !)
-  .hg/store/00manifest.i: size=784 (pure !)
+  .hg/store/00manifest.i: size=798 (no-pure no-rust !)
+  .hg/store/00manifest.i: size=800 (rust !)
+  .hg/store/00manifest.i: size=784 (pure no-rust !)
   $ f -s .hg/store/meta/dir/00manifest*
-  .hg/store/meta/dir/00manifest.i: size=556 (no-pure !)
-  .hg/store/meta/dir/00manifest.i: size=544 (pure !)
+  .hg/store/meta/dir/00manifest.i: size=556 (no-pure no-rust !)
+  .hg/store/meta/dir/00manifest.i: size=557 (rust !)
+  .hg/store/meta/dir/00manifest.i: size=544 (pure no-rust !)
 
   $ hg debugupgraderepo --config format.revlog-compression=none --config experimental.treemanifest=yes --run --quiet --no-backup
   upgrade will perform the following actions:
diff -r 854e2b9bca57 -r 51a350a22d0c tests/test-verify.t
--- a/tests/test-verify.t	Wed Nov 20 15:38:57 2024 +0100
+++ b/tests/test-verify.t	Wed Nov 20 15:53:19 2024 +0100
@@ -321,7 +321,8 @@
   $ cat start b > .hg/store/data/a.i
 
   $ hg verify -q
-  a@1: broken revlog! (index a is corrupted)
+  a@1: broken revlog! (index a is corrupted) (no-rust !)
+  a@1: broken revlog! (abort: unexpected inline revlog length) (rust !)
   warning: orphan data file 'data/a.i'
   not checking dirstate because of previous errors
   1 warnings encountered!