view mercurial/branching/rev_cache.py @ 52167:7346f93be7a4
revlog: add the glue to use the Rust `InnerRevlog` from Python
The performance of this has been looked at for quite some time, and some
workflows are actually quite a bit faster than with the Python + C code.
However, we are still slower (by up to 20%) in some crucial places, like
cloning certain repos, log, and cat, which makes this an incomplete rewrite. This is
mostly due to the high amount of overhead in Python <-> Rust FFI, especially
around the VFS code. A future patch series will rewrite the VFS code in
pure Rust, which should hopefully get us up to par with current performance,
if not better in all important cases.
This is a "save state" of sorts, as this is a ton of code, and I don't want
to pile up even more things in a single review.
Continuing to try to match the current performance would take an extremely
long time, if it is possible at all, without the aforementioned VFS work.
author    Raphaël Gomès <rgomes@octobus.net>
date      Wed, 19 Jun 2024 19:10:49 +0200
parents   db1980a361cb
children
# rev_cache.py - caching branch information per revision
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import annotations

import os
import struct

from ..node import (
    nullrev,
)

from .. import (
    encoding,
    error,
    pycompat,
    util,
)

from ..utils import (
    stringutil,
)

calcsize = struct.calcsize
pack_into = struct.pack_into
unpack_from = struct.unpack_from


# Revision branch info cache

# The "V2" version uses the same format as the "V1" but guarantees it won't be
# truncated, preventing SIGBUS when it is mmap-ed
_rbcversion = b'-v2'
_rbcnames = b'rbc-names' + _rbcversion
_rbcrevs = b'rbc-revs' + _rbcversion
_rbc_legacy_version = b'-v1'
_rbc_legacy_names = b'rbc-names' + _rbc_legacy_version
_rbc_legacy_revs = b'rbc-revs' + _rbc_legacy_version
# [4 byte hash prefix][4 byte branch name number with sign bit indicating open]
_rbcrecfmt = b'>4sI'
_rbcrecsize = calcsize(_rbcrecfmt)
_rbcmininc = 64 * _rbcrecsize
_rbcnodelen = 4
_rbcbranchidxmask = 0x7FFFFFFF
_rbccloseflag = 0x80000000

# with atomic replacement.
REWRITE_RATIO = 0.2


class rbcrevs:
    """a byte string consisting of an immutable prefix followed by a mutable
    suffix"""

    def __init__(self, revs):
        self._prefix = revs
        self._rest = bytearray()

    @property
    def len_prefix(self):
        size = len(self._prefix)
        return size - (size % _rbcrecsize)

    def __len__(self):
        return self.len_prefix + len(self._rest)

    def unpack_record(self, rbcrevidx):
        if rbcrevidx < self.len_prefix:
            return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx)
        else:
            return unpack_from(
                _rbcrecfmt,
                util.buffer(self._rest),
                rbcrevidx - self.len_prefix,
            )

    def make_mutable(self):
        if self.len_prefix > 0:
            entirety = bytearray()
            entirety[:] = self._prefix[: self.len_prefix]
            entirety.extend(self._rest)
            self._rest = entirety
            self._prefix = bytearray()

    def truncate(self, pos):
        self.make_mutable()
        del self._rest[pos:]

    def pack_into(self, rbcrevidx, node, branchidx):
        if rbcrevidx < self.len_prefix:
            self.make_mutable()
        buf = self._rest
        start_offset = rbcrevidx - self.len_prefix
        end_offset = start_offset + _rbcrecsize

        if len(self._rest) < end_offset:
            # bytearray doesn't allocate extra space at least in Python 3.7.
            # When multiple changesets are added in a row, precise resize would
            # result in quadratic complexity. Overallocate to compensate by
            # using the classic doubling technique for dynamic arrays instead.
            # If there was a gap in the map before, less space will be reserved.
            self._rest.extend(b'\0' * end_offset)
        return pack_into(
            _rbcrecfmt,
            buf,
            start_offset,
            node,
            branchidx,
        )

    def extend(self, extension):
        return self._rest.extend(extension)

    def slice(self, begin, end):
        if begin < self.len_prefix:
            acc = bytearray()
            acc[:] = self._prefix[begin : min(end, self.len_prefix)]
            acc.extend(
                self._rest[begin - self.len_prefix : end - self.len_prefix]
            )
            return acc
        return self._rest[begin - self.len_prefix : end - self.len_prefix]


class revbranchcache:
    """Persistent cache, mapping from revision number to branch name and close.
    This is a low level cache, independent of filtering.

    Branch names are stored in rbc-names in internal encoding separated by 0.
    rbc-names is append-only, and each branch name is only stored once and will
    thus have a unique index.

    The branch info for each revision is stored in rbc-revs as constant size
    records. The whole file is read into memory, but it is only 'parsed' on
    demand. The file is usually append-only but will be truncated if repo
    modification is detected.

    The record for each revision contains the first 4 bytes of the
    corresponding node hash, and the record is only used if it still matches.
    Even a completely trashed rbc-revs file will thus still give the right
    result while converging towards full recovery ... assuming no incorrectly
    matching node hashes.

    The record also contains 4 bytes where 31 bits contain the index of the
    branch and the last bit indicates that it is a branch close commit.
    The usage pattern for rbc-revs is thus somewhat similar to 00changelog.i
    and will grow with it but be 1/8th of its size.
    """

    def __init__(self, repo, readonly=True):
        assert repo.filtername is None
        self._repo = repo
        self._names = []  # branch names in local encoding with static index
        self._rbcrevs = rbcrevs(bytearray())
        self._rbcsnameslen = 0  # length of names read at _rbcsnameslen
        self._force_overwrite = False
        v1_fallback = False
        try:
            try:
                bndata = repo.cachevfs.read(_rbcnames)
            except (IOError, OSError):
                # If we don't have "v2" data, we might have "v1" data worth
                # using.
                #
                # consider stopping this many versions after the hg-6.9 release
                bndata = repo.cachevfs.read(_rbc_legacy_names)
                v1_fallback = True
                self._force_overwrite = True
            self._rbcsnameslen = len(bndata)  # for verification before writing
            if bndata:
                self._names = [
                    encoding.tolocal(bn) for bn in bndata.split(b'\0')
                ]
        except (IOError, OSError):
            if readonly:
                # don't try to use cache - fall back to the slow path
                self.branchinfo = self._branchinfo

        if self._names:
            try:
                # In order to rename the atomictempfile in _writerevs(), the
                # existing file needs to be removed. The Windows code
                # (successfully) renames it to a temp file first, before moving
                # the temp file into its place. But the removal of the original
                # file then fails, because it's still mapped. The mmap object
                # needs to be closed in order to remove the file, but in order
                # to do that, the memoryview returned by util.buffer needs to be
                # released.
                usemmap = repo.ui.configbool(
                    b'storage',
                    b'revbranchcache.mmap',
                    default=not pycompat.iswindows,
                )
                if not v1_fallback:
                    with repo.cachevfs(_rbcrevs) as fp:
                        if usemmap and repo.cachevfs.is_mmap_safe(_rbcrevs):
                            data = util.buffer(util.mmapread(fp))
                        else:
                            data = fp.read()
                else:
                    # If we don't have "v2" data, we might have "v1" data worth
                    # using.
                    #
                    # Consider stopping this many versions after the hg-6.9
                    # release.
                    with repo.cachevfs(_rbc_legacy_revs) as fp:
                        data = fp.read()
                self._rbcrevs = rbcrevs(data)
            except (IOError, OSError) as inst:
                repo.ui.debug(
                    b"couldn't read revision branch cache: %s\n"
                    % stringutil.forcebytestr(inst)
                )
        # remember number of good records on disk
        self._rbcrevslen = min(
            len(self._rbcrevs) // _rbcrecsize, len(repo.changelog)
        )
        if self._rbcrevslen == 0:
            self._names = []
        self._rbcnamescount = len(self._names)  # number of names read at
        # _rbcsnameslen

    def _clear(self):
        self._rbcsnameslen = 0
        del self._names[:]
        self._rbcnamescount = 0
        self._rbcrevslen = len(self._repo.changelog)
        self._rbcrevs = rbcrevs(bytearray(self._rbcrevslen * _rbcrecsize))
        util.clearcachedproperty(self, b'_namesreverse')
        self._force_overwrite = True

    def invalidate(self, rev=0):
        self._rbcrevslen = rev
        self._rbcrevs.truncate(rev)
        self._force_overwrite = True

    @util.propertycache
    def _namesreverse(self):
        return {b: r for r, b in enumerate(self._names)}

    def branchinfo(self, rev):
        """Return branch name and close flag for rev, using and updating
        persistent cache."""
        changelog = self._repo.changelog
        rbcrevidx = rev * _rbcrecsize

        # avoid negative index, changelog.read(nullrev) is fast without cache
        if rev == nullrev:
            return changelog.branchinfo(rev)

        # if requested rev isn't allocated, grow and cache the rev info
        if len(self._rbcrevs) < rbcrevidx + _rbcrecsize:
            return self._branchinfo(rev)

        # fast path: extract data from cache, use it if node is matching
        reponode = changelog.node(rev)[:_rbcnodelen]
        cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx)
        close = bool(branchidx & _rbccloseflag)
        if close:
            branchidx &= _rbcbranchidxmask
        if cachenode == b'\0\0\0\0':
            pass
        elif cachenode == reponode:
            try:
                return self._names[branchidx], close
            except IndexError:
                # recover from invalid reference to unknown branch
                self._repo.ui.debug(
                    b"referenced branch names not found"
                    b" - rebuilding revision branch cache from scratch\n"
                )
                self._clear()
        else:
            # rev/node map has changed, invalidate the cache from here up
            self._repo.ui.debug(
                b"history modification detected - truncating "
                b"revision branch cache to revision %d\n" % rev
            )
            truncate = rbcrevidx + _rbcrecsize
            self._rbcrevs.truncate(truncate)
            self._rbcrevslen = min(self._rbcrevslen, truncate)

        # fall back to slow path and make sure it will be written to disk
        return self._branchinfo(rev)

    def _branchinfo(self, rev):
        """Retrieve branch info from changelog and update _rbcrevs"""
        changelog = self._repo.changelog
        b, close = changelog.branchinfo(rev)
        if b in self._namesreverse:
            branchidx = self._namesreverse[b]
        else:
            branchidx = len(self._names)
            self._names.append(b)
            self._namesreverse[b] = branchidx
        reponode = changelog.node(rev)
        if close:
            branchidx |= _rbccloseflag
        self._setcachedata(rev, reponode, branchidx)
        return b, close

    def setdata(self, rev, changelogrevision):
        """add new data information to the cache"""
        branch, close = changelogrevision.branchinfo

        if branch in self._namesreverse:
            branchidx = self._namesreverse[branch]
        else:
            branchidx = len(self._names)
            self._names.append(branch)
            self._namesreverse[branch] = branchidx
        if close:
            branchidx |= _rbccloseflag
        self._setcachedata(rev, self._repo.changelog.node(rev), branchidx)
        # If no cache data were readable (file does not exist, bad permissions,
        # etc.), the cache was bypassing itself by setting:
        #
        #   self.branchinfo = self._branchinfo
        #
        # Since we now have data in the cache, we need to drop this bypassing.
        if 'branchinfo' in vars(self):
            del self.branchinfo

    def _setcachedata(self, rev, node, branchidx):
        """Writes the node's branch data to the in-memory cache data."""
        if rev == nullrev:
            return
        rbcrevidx = rev * _rbcrecsize
        self._rbcrevs.pack_into(rbcrevidx, node, branchidx)
        self._rbcrevslen = min(self._rbcrevslen, rev)

        tr = self._repo.currenttransaction()
        if tr:
            tr.addfinalize(b'write-revbranchcache', self.write)

    def write(self, tr=None):
        """Save branch cache if it is dirty."""
        repo = self._repo
        wlock = None
        step = b''
        try:
            # write the new names
            if self._force_overwrite or self._rbcnamescount < len(self._names):
                wlock = repo.wlock(wait=False)
                step = b' names'
                self._writenames(repo)

            # write the new revs
            start = self._rbcrevslen * _rbcrecsize
            if self._force_overwrite or start != len(self._rbcrevs):
                step = b''
                if wlock is None:
                    wlock = repo.wlock(wait=False)
                self._writerevs(repo, start)
        except (IOError, OSError, error.Abort, error.LockError) as inst:
            repo.ui.debug(
                b"couldn't write revision branch cache%s: %s\n"
                % (step, stringutil.forcebytestr(inst))
            )
        finally:
            if wlock is not None:
                wlock.release()

    def _writenames(self, repo):
        """write the new branch names to revbranchcache"""
        f = None
        if self._force_overwrite:
            self._rbcsnameslen = 0
            self._rbcnamescount = 0
        try:
            if self._force_overwrite or self._rbcnamescount != 0:
                f = repo.cachevfs.open(_rbcnames, b'ab')
                current_size = f.tell()
                if current_size == self._rbcsnameslen:
                    f.write(b'\0')
                else:
                    f.close()
                    if self._force_overwrite:
                        dbg = b"resetting content of %s\n"
                    elif current_size > 0:
                        dbg = b"%s changed - rewriting it\n"
                    else:
                        dbg = b"%s is missing - rewriting it\n"
                    repo.ui.debug(dbg % _rbcnames)
                    self._rbcnamescount = 0
                    self._rbcrevslen = 0
            if self._rbcnamescount == 0:
                # before rewriting names, make sure references are removed
                repo.cachevfs.unlinkpath(_rbcrevs, ignoremissing=True)
                f = repo.cachevfs.open(_rbcnames, b'wb')
            names = self._names[self._rbcnamescount :]
            from_local = encoding.fromlocal
            data = b'\0'.join(from_local(b) for b in names)
            f.write(data)
            self._rbcsnameslen = f.tell()
        finally:
            if f is not None:
                f.close()
        self._rbcnamescount = len(self._names)

    def _writerevs(self, repo, start):
        """write the new revs to revbranchcache"""
        revs = min(len(repo.changelog), len(self._rbcrevs) // _rbcrecsize)

        end = revs * _rbcrecsize
        if self._force_overwrite:
            start = 0

        # align start on entry boundary
        start = _rbcrecsize * (start // _rbcrecsize)

        with repo.cachevfs.open(_rbcrevs, b'a+b') as f:
            pass  # this makes sure the file exists
        with repo.cachevfs.open(_rbcrevs, b'r+b') as f:
            f.seek(0, os.SEEK_END)
            current_size = f.tell()
            if current_size < start:
                start = 0
            if current_size != start:
                threshold = current_size * REWRITE_RATIO
                overwritten = min(end, current_size) - start
                if (max(end, current_size) - start) >= threshold:
                    start = 0
                    dbg = b"resetting content of cache/%s\n" % _rbcrevs
                    repo.ui.debug(dbg)
                elif overwritten > 0:
                    # end affected, let us overwrite the bad value
                    dbg = b"overwriting %d bytes from %d in cache/%s"
                    dbg %= (current_size - start, start, _rbcrevs)
                    if end < current_size:
                        extra = b" leaving (%d trailing bytes)"
                        extra %= current_size - end
                        dbg += extra
                    dbg += b'\n'
                    repo.ui.debug(dbg)
                else:
                    # extra untouched data at the end, let's warn about them
                    assert start == end  # since we don't write anything
                    dbg = b"cache/%s contains %d unknown trailing bytes\n"
                    dbg %= (_rbcrevs, current_size - start)
                    repo.ui.debug(dbg)

            if start > 0:
                f.seek(start)
                f.write(self._rbcrevs.slice(start, end))
            else:
                f.close()
                with repo.cachevfs.open(
                    _rbcrevs,
                    b'wb',
                    atomictemp=True,
                ) as rev_file:
                    rev_file.write(self._rbcrevs.slice(start, end))
        self._rbcrevslen = revs
        self._force_overwrite = False
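

# A minimal illustrative sketch (not exercised by Mercurial itself) of how one
# rbc-revs record round-trips through the format constants defined above:
# a 4-byte node-hash prefix followed by a big-endian 32-bit word whose high bit
# (_rbccloseflag) marks a branch-close commit and whose low 31 bits
# (_rbcbranchidxmask) index into rbc-names. The helper name and the sample
# node prefix below are assumptions for illustration, not part of the API.
def _demo_record_roundtrip():
    # pack a record for a branch-close commit on the branch with index 3
    buf = bytearray(_rbcrecsize)
    pack_into(_rbcrecfmt, buf, 0, b'\xde\xad\xbe\xef', 3 | _rbccloseflag)

    # unpack it the same way branchinfo() does
    node_prefix, word = unpack_from(_rbcrecfmt, buf, 0)
    close = bool(word & _rbccloseflag)
    branchidx = word & _rbcbranchidxmask
    assert (node_prefix, branchidx, close) == (b'\xde\xad\xbe\xef', 3, True)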