Mercurial > hg
changeset 51901:f0e07efc199f
rev-branch-cache: move the code in a dedicated module
The branchmap module is getting huge and the rev branch cache is fully
independent, lets move it elsewhere.
author | Pierre-Yves David <pierre-yves.david@octobus.net> |
---|---|
date | Sun, 22 Sep 2024 15:55:46 +0200 |
parents | 77a9c7d8a7ba |
children | 1eb2317c1762 |
files | mercurial/branching/__init__.py mercurial/branching/rev_cache.py mercurial/branchmap.py mercurial/localrepo.py setup.py |
diffstat | 4 files changed, 357 insertions(+), 332 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mercurial/branching/rev_cache.py Sun Sep 22 15:55:46 2024 +0200 @@ -0,0 +1,350 @@ +# rev_cache.py - caching branch information per revision +# +# This software may be used and distributed according to the terms of the +# GNU General Public License version 2 or any later version. +from __future__ import annotations + +import struct + +from ..node import ( + nullrev, +) + +from .. import ( + encoding, + error, + util, +) + +from ..utils import ( + stringutil, +) + +calcsize = struct.calcsize +pack_into = struct.pack_into +unpack_from = struct.unpack_from + + +# Revision branch info cache + +_rbcversion = b'-v1' +_rbcnames = b'rbc-names' + _rbcversion +_rbcrevs = b'rbc-revs' + _rbcversion +# [4 byte hash prefix][4 byte branch name number with sign bit indicating open] +_rbcrecfmt = b'>4sI' +_rbcrecsize = calcsize(_rbcrecfmt) +_rbcmininc = 64 * _rbcrecsize +_rbcnodelen = 4 +_rbcbranchidxmask = 0x7FFFFFFF +_rbccloseflag = 0x80000000 + + +class rbcrevs: + """a byte string consisting of an immutable prefix followed by a mutable suffix""" + + def __init__(self, revs): + self._prefix = revs + self._rest = bytearray() + + def __len__(self): + return len(self._prefix) + len(self._rest) + + def unpack_record(self, rbcrevidx): + if rbcrevidx < len(self._prefix): + return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx) + else: + return unpack_from( + _rbcrecfmt, + util.buffer(self._rest), + rbcrevidx - len(self._prefix), + ) + + def make_mutable(self): + if len(self._prefix) > 0: + entirety = bytearray() + entirety[:] = self._prefix + entirety.extend(self._rest) + self._rest = entirety + self._prefix = bytearray() + + def truncate(self, pos): + self.make_mutable() + del self._rest[pos:] + + def pack_into(self, rbcrevidx, node, branchidx): + if rbcrevidx < len(self._prefix): + self.make_mutable() + buf = self._rest + start_offset = rbcrevidx - len(self._prefix) + end_offset = start_offset + _rbcrecsize + + if len(self._rest) < end_offset: + # bytearray doesn't allocate extra space at least in Python 3.7. + # When multiple changesets are added in a row, precise resize would + # result in quadratic complexity. Overallocate to compensate by + # using the classic doubling technique for dynamic arrays instead. + # If there was a gap in the map before, less space will be reserved. + self._rest.extend(b'\0' * end_offset) + return pack_into( + _rbcrecfmt, + buf, + start_offset, + node, + branchidx, + ) + + def extend(self, extension): + return self._rest.extend(extension) + + def slice(self, begin, end): + if begin < len(self._prefix): + acc = bytearray() + acc[:] = self._prefix[begin:end] + acc.extend( + self._rest[begin - len(self._prefix) : end - len(self._prefix)] + ) + return acc + return self._rest[begin - len(self._prefix) : end - len(self._prefix)] + + +class revbranchcache: + """Persistent cache, mapping from revision number to branch name and close. + This is a low level cache, independent of filtering. + + Branch names are stored in rbc-names in internal encoding separated by 0. + rbc-names is append-only, and each branch name is only stored once and will + thus have a unique index. + + The branch info for each revision is stored in rbc-revs as constant size + records. The whole file is read into memory, but it is only 'parsed' on + demand. The file is usually append-only but will be truncated if repo + modification is detected. + The record for each revision contains the first 4 bytes of the + corresponding node hash, and the record is only used if it still matches. + Even a completely trashed rbc-revs fill thus still give the right result + while converging towards full recovery ... assuming no incorrectly matching + node hashes. + The record also contains 4 bytes where 31 bits contains the index of the + branch and the last bit indicate that it is a branch close commit. + The usage pattern for rbc-revs is thus somewhat similar to 00changelog.i + and will grow with it but be 1/8th of its size. + """ + + def __init__(self, repo, readonly=True): + assert repo.filtername is None + self._repo = repo + self._names = [] # branch names in local encoding with static index + self._rbcrevs = rbcrevs(bytearray()) + self._rbcsnameslen = 0 # length of names read at _rbcsnameslen + try: + bndata = repo.cachevfs.read(_rbcnames) + self._rbcsnameslen = len(bndata) # for verification before writing + if bndata: + self._names = [ + encoding.tolocal(bn) for bn in bndata.split(b'\0') + ] + except (IOError, OSError): + if readonly: + # don't try to use cache - fall back to the slow path + self.branchinfo = self._branchinfo + + if self._names: + try: + usemmap = repo.ui.configbool(b'storage', b'revbranchcache.mmap') + with repo.cachevfs(_rbcrevs) as fp: + if usemmap and repo.cachevfs.is_mmap_safe(_rbcrevs): + data = util.buffer(util.mmapread(fp)) + else: + data = fp.read() + self._rbcrevs = rbcrevs(data) + except (IOError, OSError) as inst: + repo.ui.debug( + b"couldn't read revision branch cache: %s\n" + % stringutil.forcebytestr(inst) + ) + # remember number of good records on disk + self._rbcrevslen = min( + len(self._rbcrevs) // _rbcrecsize, len(repo.changelog) + ) + if self._rbcrevslen == 0: + self._names = [] + self._rbcnamescount = len(self._names) # number of names read at + # _rbcsnameslen + + def _clear(self): + self._rbcsnameslen = 0 + del self._names[:] + self._rbcnamescount = 0 + self._rbcrevslen = len(self._repo.changelog) + self._rbcrevs = rbcrevs(bytearray(self._rbcrevslen * _rbcrecsize)) + util.clearcachedproperty(self, b'_namesreverse') + + @util.propertycache + def _namesreverse(self): + return {b: r for r, b in enumerate(self._names)} + + def branchinfo(self, rev): + """Return branch name and close flag for rev, using and updating + persistent cache.""" + changelog = self._repo.changelog + rbcrevidx = rev * _rbcrecsize + + # avoid negative index, changelog.read(nullrev) is fast without cache + if rev == nullrev: + return changelog.branchinfo(rev) + + # if requested rev isn't allocated, grow and cache the rev info + if len(self._rbcrevs) < rbcrevidx + _rbcrecsize: + return self._branchinfo(rev) + + # fast path: extract data from cache, use it if node is matching + reponode = changelog.node(rev)[:_rbcnodelen] + cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx) + close = bool(branchidx & _rbccloseflag) + if close: + branchidx &= _rbcbranchidxmask + if cachenode == b'\0\0\0\0': + pass + elif cachenode == reponode: + try: + return self._names[branchidx], close + except IndexError: + # recover from invalid reference to unknown branch + self._repo.ui.debug( + b"referenced branch names not found" + b" - rebuilding revision branch cache from scratch\n" + ) + self._clear() + else: + # rev/node map has changed, invalidate the cache from here up + self._repo.ui.debug( + b"history modification detected - truncating " + b"revision branch cache to revision %d\n" % rev + ) + truncate = rbcrevidx + _rbcrecsize + self._rbcrevs.truncate(truncate) + self._rbcrevslen = min(self._rbcrevslen, truncate) + + # fall back to slow path and make sure it will be written to disk + return self._branchinfo(rev) + + def _branchinfo(self, rev): + """Retrieve branch info from changelog and update _rbcrevs""" + changelog = self._repo.changelog + b, close = changelog.branchinfo(rev) + if b in self._namesreverse: + branchidx = self._namesreverse[b] + else: + branchidx = len(self._names) + self._names.append(b) + self._namesreverse[b] = branchidx + reponode = changelog.node(rev) + if close: + branchidx |= _rbccloseflag + self._setcachedata(rev, reponode, branchidx) + return b, close + + def setdata(self, rev, changelogrevision): + """add new data information to the cache""" + branch, close = changelogrevision.branchinfo + + if branch in self._namesreverse: + branchidx = self._namesreverse[branch] + else: + branchidx = len(self._names) + self._names.append(branch) + self._namesreverse[branch] = branchidx + if close: + branchidx |= _rbccloseflag + self._setcachedata(rev, self._repo.changelog.node(rev), branchidx) + # If no cache data were readable (non exists, bad permission, etc) + # the cache was bypassing itself by setting: + # + # self.branchinfo = self._branchinfo + # + # Since we now have data in the cache, we need to drop this bypassing. + if 'branchinfo' in vars(self): + del self.branchinfo + + def _setcachedata(self, rev, node, branchidx): + """Writes the node's branch data to the in-memory cache data.""" + if rev == nullrev: + return + rbcrevidx = rev * _rbcrecsize + self._rbcrevs.pack_into(rbcrevidx, node, branchidx) + self._rbcrevslen = min(self._rbcrevslen, rev) + + tr = self._repo.currenttransaction() + if tr: + tr.addfinalize(b'write-revbranchcache', self.write) + + def write(self, tr=None): + """Save branch cache if it is dirty.""" + repo = self._repo + wlock = None + step = b'' + try: + # write the new names + if self._rbcnamescount < len(self._names): + wlock = repo.wlock(wait=False) + step = b' names' + self._writenames(repo) + + # write the new revs + start = self._rbcrevslen * _rbcrecsize + if start != len(self._rbcrevs): + step = b'' + if wlock is None: + wlock = repo.wlock(wait=False) + self._writerevs(repo, start) + + except (IOError, OSError, error.Abort, error.LockError) as inst: + repo.ui.debug( + b"couldn't write revision branch cache%s: %s\n" + % (step, stringutil.forcebytestr(inst)) + ) + finally: + if wlock is not None: + wlock.release() + + def _writenames(self, repo): + """write the new branch names to revbranchcache""" + if self._rbcnamescount != 0: + f = repo.cachevfs.open(_rbcnames, b'ab') + if f.tell() == self._rbcsnameslen: + f.write(b'\0') + else: + f.close() + repo.ui.debug(b"%s changed - rewriting it\n" % _rbcnames) + self._rbcnamescount = 0 + self._rbcrevslen = 0 + if self._rbcnamescount == 0: + # before rewriting names, make sure references are removed + repo.cachevfs.unlinkpath(_rbcrevs, ignoremissing=True) + f = repo.cachevfs.open(_rbcnames, b'wb') + f.write( + b'\0'.join( + encoding.fromlocal(b) + for b in self._names[self._rbcnamescount :] + ) + ) + self._rbcsnameslen = f.tell() + f.close() + self._rbcnamescount = len(self._names) + + def _writerevs(self, repo, start): + """write the new revs to revbranchcache""" + revs = min(len(repo.changelog), len(self._rbcrevs) // _rbcrecsize) + with repo.cachevfs.open(_rbcrevs, b'ab') as f: + if f.tell() != start: + repo.ui.debug( + b"truncating cache/%s to %d\n" % (_rbcrevs, start) + ) + f.seek(start) + if f.tell() != start: + start = 0 + f.seek(start) + f.truncate() + end = revs * _rbcrecsize + f.write(self._rbcrevs.slice(start, end)) + self._rbcrevslen = revs
--- a/mercurial/branchmap.py Wed Sep 25 01:16:47 2024 -0400 +++ b/mercurial/branchmap.py Sun Sep 22 15:55:46 2024 +0200 @@ -7,8 +7,6 @@ from __future__ import annotations -import struct - from .node import ( bin, hex, @@ -49,10 +47,6 @@ subsettable = repoviewutil.subsettable -calcsize = struct.calcsize -pack_into = struct.pack_into -unpack_from = struct.unpack_from - class BranchMapCache: """mapping of filtered views of repo with their branchcache""" @@ -1087,328 +1081,3 @@ closednodes: Optional[Set[bytes]] = None, ) -> None: super().__init__(repo=repo, entries=entries, closed_nodes=closednodes) - - -# Revision branch info cache - -_rbcversion = b'-v1' -_rbcnames = b'rbc-names' + _rbcversion -_rbcrevs = b'rbc-revs' + _rbcversion -# [4 byte hash prefix][4 byte branch name number with sign bit indicating open] -_rbcrecfmt = b'>4sI' -_rbcrecsize = calcsize(_rbcrecfmt) -_rbcmininc = 64 * _rbcrecsize -_rbcnodelen = 4 -_rbcbranchidxmask = 0x7FFFFFFF -_rbccloseflag = 0x80000000 - - -class rbcrevs: - """a byte string consisting of an immutable prefix followed by a mutable suffix""" - - def __init__(self, revs): - self._prefix = revs - self._rest = bytearray() - - def __len__(self): - return len(self._prefix) + len(self._rest) - - def unpack_record(self, rbcrevidx): - if rbcrevidx < len(self._prefix): - return unpack_from(_rbcrecfmt, util.buffer(self._prefix), rbcrevidx) - else: - return unpack_from( - _rbcrecfmt, - util.buffer(self._rest), - rbcrevidx - len(self._prefix), - ) - - def make_mutable(self): - if len(self._prefix) > 0: - entirety = bytearray() - entirety[:] = self._prefix - entirety.extend(self._rest) - self._rest = entirety - self._prefix = bytearray() - - def truncate(self, pos): - self.make_mutable() - del self._rest[pos:] - - def pack_into(self, rbcrevidx, node, branchidx): - if rbcrevidx < len(self._prefix): - self.make_mutable() - buf = self._rest - start_offset = rbcrevidx - len(self._prefix) - end_offset = start_offset + _rbcrecsize - - if len(self._rest) < end_offset: - # bytearray doesn't allocate extra space at least in Python 3.7. - # When multiple changesets are added in a row, precise resize would - # result in quadratic complexity. Overallocate to compensate by - # using the classic doubling technique for dynamic arrays instead. - # If there was a gap in the map before, less space will be reserved. - self._rest.extend(b'\0' * end_offset) - return pack_into( - _rbcrecfmt, - buf, - start_offset, - node, - branchidx, - ) - - def extend(self, extension): - return self._rest.extend(extension) - - def slice(self, begin, end): - if begin < len(self._prefix): - acc = bytearray() - acc[:] = self._prefix[begin:end] - acc.extend( - self._rest[begin - len(self._prefix) : end - len(self._prefix)] - ) - return acc - return self._rest[begin - len(self._prefix) : end - len(self._prefix)] - - -class revbranchcache: - """Persistent cache, mapping from revision number to branch name and close. - This is a low level cache, independent of filtering. - - Branch names are stored in rbc-names in internal encoding separated by 0. - rbc-names is append-only, and each branch name is only stored once and will - thus have a unique index. - - The branch info for each revision is stored in rbc-revs as constant size - records. The whole file is read into memory, but it is only 'parsed' on - demand. The file is usually append-only but will be truncated if repo - modification is detected. - The record for each revision contains the first 4 bytes of the - corresponding node hash, and the record is only used if it still matches. - Even a completely trashed rbc-revs fill thus still give the right result - while converging towards full recovery ... assuming no incorrectly matching - node hashes. - The record also contains 4 bytes where 31 bits contains the index of the - branch and the last bit indicate that it is a branch close commit. - The usage pattern for rbc-revs is thus somewhat similar to 00changelog.i - and will grow with it but be 1/8th of its size. - """ - - def __init__(self, repo, readonly=True): - assert repo.filtername is None - self._repo = repo - self._names = [] # branch names in local encoding with static index - self._rbcrevs = rbcrevs(bytearray()) - self._rbcsnameslen = 0 # length of names read at _rbcsnameslen - try: - bndata = repo.cachevfs.read(_rbcnames) - self._rbcsnameslen = len(bndata) # for verification before writing - if bndata: - self._names = [ - encoding.tolocal(bn) for bn in bndata.split(b'\0') - ] - except (IOError, OSError): - if readonly: - # don't try to use cache - fall back to the slow path - self.branchinfo = self._branchinfo - - if self._names: - try: - usemmap = repo.ui.configbool(b'storage', b'revbranchcache.mmap') - with repo.cachevfs(_rbcrevs) as fp: - if usemmap and repo.cachevfs.is_mmap_safe(_rbcrevs): - data = util.buffer(util.mmapread(fp)) - else: - data = fp.read() - self._rbcrevs = rbcrevs(data) - except (IOError, OSError) as inst: - repo.ui.debug( - b"couldn't read revision branch cache: %s\n" - % stringutil.forcebytestr(inst) - ) - # remember number of good records on disk - self._rbcrevslen = min( - len(self._rbcrevs) // _rbcrecsize, len(repo.changelog) - ) - if self._rbcrevslen == 0: - self._names = [] - self._rbcnamescount = len(self._names) # number of names read at - # _rbcsnameslen - - def _clear(self): - self._rbcsnameslen = 0 - del self._names[:] - self._rbcnamescount = 0 - self._rbcrevslen = len(self._repo.changelog) - self._rbcrevs = rbcrevs(bytearray(self._rbcrevslen * _rbcrecsize)) - util.clearcachedproperty(self, b'_namesreverse') - - @util.propertycache - def _namesreverse(self): - return {b: r for r, b in enumerate(self._names)} - - def branchinfo(self, rev): - """Return branch name and close flag for rev, using and updating - persistent cache.""" - changelog = self._repo.changelog - rbcrevidx = rev * _rbcrecsize - - # avoid negative index, changelog.read(nullrev) is fast without cache - if rev == nullrev: - return changelog.branchinfo(rev) - - # if requested rev isn't allocated, grow and cache the rev info - if len(self._rbcrevs) < rbcrevidx + _rbcrecsize: - return self._branchinfo(rev) - - # fast path: extract data from cache, use it if node is matching - reponode = changelog.node(rev)[:_rbcnodelen] - cachenode, branchidx = self._rbcrevs.unpack_record(rbcrevidx) - close = bool(branchidx & _rbccloseflag) - if close: - branchidx &= _rbcbranchidxmask - if cachenode == b'\0\0\0\0': - pass - elif cachenode == reponode: - try: - return self._names[branchidx], close - except IndexError: - # recover from invalid reference to unknown branch - self._repo.ui.debug( - b"referenced branch names not found" - b" - rebuilding revision branch cache from scratch\n" - ) - self._clear() - else: - # rev/node map has changed, invalidate the cache from here up - self._repo.ui.debug( - b"history modification detected - truncating " - b"revision branch cache to revision %d\n" % rev - ) - truncate = rbcrevidx + _rbcrecsize - self._rbcrevs.truncate(truncate) - self._rbcrevslen = min(self._rbcrevslen, truncate) - - # fall back to slow path and make sure it will be written to disk - return self._branchinfo(rev) - - def _branchinfo(self, rev): - """Retrieve branch info from changelog and update _rbcrevs""" - changelog = self._repo.changelog - b, close = changelog.branchinfo(rev) - if b in self._namesreverse: - branchidx = self._namesreverse[b] - else: - branchidx = len(self._names) - self._names.append(b) - self._namesreverse[b] = branchidx - reponode = changelog.node(rev) - if close: - branchidx |= _rbccloseflag - self._setcachedata(rev, reponode, branchidx) - return b, close - - def setdata(self, rev, changelogrevision): - """add new data information to the cache""" - branch, close = changelogrevision.branchinfo - - if branch in self._namesreverse: - branchidx = self._namesreverse[branch] - else: - branchidx = len(self._names) - self._names.append(branch) - self._namesreverse[branch] = branchidx - if close: - branchidx |= _rbccloseflag - self._setcachedata(rev, self._repo.changelog.node(rev), branchidx) - # If no cache data were readable (non exists, bad permission, etc) - # the cache was bypassing itself by setting: - # - # self.branchinfo = self._branchinfo - # - # Since we now have data in the cache, we need to drop this bypassing. - if 'branchinfo' in vars(self): - del self.branchinfo - - def _setcachedata(self, rev, node, branchidx): - """Writes the node's branch data to the in-memory cache data.""" - if rev == nullrev: - return - rbcrevidx = rev * _rbcrecsize - self._rbcrevs.pack_into(rbcrevidx, node, branchidx) - self._rbcrevslen = min(self._rbcrevslen, rev) - - tr = self._repo.currenttransaction() - if tr: - tr.addfinalize(b'write-revbranchcache', self.write) - - def write(self, tr=None): - """Save branch cache if it is dirty.""" - repo = self._repo - wlock = None - step = b'' - try: - # write the new names - if self._rbcnamescount < len(self._names): - wlock = repo.wlock(wait=False) - step = b' names' - self._writenames(repo) - - # write the new revs - start = self._rbcrevslen * _rbcrecsize - if start != len(self._rbcrevs): - step = b'' - if wlock is None: - wlock = repo.wlock(wait=False) - self._writerevs(repo, start) - - except (IOError, OSError, error.Abort, error.LockError) as inst: - repo.ui.debug( - b"couldn't write revision branch cache%s: %s\n" - % (step, stringutil.forcebytestr(inst)) - ) - finally: - if wlock is not None: - wlock.release() - - def _writenames(self, repo): - """write the new branch names to revbranchcache""" - if self._rbcnamescount != 0: - f = repo.cachevfs.open(_rbcnames, b'ab') - if f.tell() == self._rbcsnameslen: - f.write(b'\0') - else: - f.close() - repo.ui.debug(b"%s changed - rewriting it\n" % _rbcnames) - self._rbcnamescount = 0 - self._rbcrevslen = 0 - if self._rbcnamescount == 0: - # before rewriting names, make sure references are removed - repo.cachevfs.unlinkpath(_rbcrevs, ignoremissing=True) - f = repo.cachevfs.open(_rbcnames, b'wb') - f.write( - b'\0'.join( - encoding.fromlocal(b) - for b in self._names[self._rbcnamescount :] - ) - ) - self._rbcsnameslen = f.tell() - f.close() - self._rbcnamescount = len(self._names) - - def _writerevs(self, repo, start): - """write the new revs to revbranchcache""" - revs = min(len(repo.changelog), len(self._rbcrevs) // _rbcrecsize) - with repo.cachevfs.open(_rbcrevs, b'ab') as f: - if f.tell() != start: - repo.ui.debug( - b"truncating cache/%s to %d\n" % (_rbcrevs, start) - ) - f.seek(start) - if f.tell() != start: - start = 0 - f.seek(start) - f.truncate() - end = revs * _rbcrecsize - f.write(self._rbcrevs.slice(start, end)) - self._rbcrevslen = revs
--- a/mercurial/localrepo.py Wed Sep 25 01:16:47 2024 -0400 +++ b/mercurial/localrepo.py Sun Sep 22 15:55:46 2024 +0200 @@ -77,6 +77,10 @@ wireprototypes, ) +from .branching import ( + rev_cache as rev_branch_cache, +) + from .interfaces import ( repository, util as interfaceutil, @@ -2281,7 +2285,8 @@ @unfilteredmethod def revbranchcache(self): if not self._revbranchcache: - self._revbranchcache = branchmap.revbranchcache(self.unfiltered()) + unfi = self.unfiltered() + self._revbranchcache = rev_branch_cache.revbranchcache(unfi) return self._revbranchcache def register_changeset(self, rev, changelogrevision):