Mercurial > hg
changeset 46709:3d740058b467
sidedata: move to new sidedata storage in revlogv2
The current (experimental) sidedata system uses flagprocessors to signify the
presence and store/retrieve sidedata from the raw revlog data. This proved to be
quite fragile from an exchange perspective and a lot more complex than simply
having a dedicated space in the new revlog format.
This change does not handle exchange (ironically), so the test for amend - that
uses a bundle - is broken. This functionality is split into the next patches.
Differential Revision: https://phab.mercurial-scm.org/D9993
author | Raphaël Gomès <rgomes@octobus.net> |
---|---|
date | Mon, 18 Jan 2021 11:44:51 +0100 |
parents | 358737abeeef |
children | 4cd214c9948d |
files | hgext/lfs/wrapper.py hgext/remotefilelog/remotefilelog.py mercurial/revlog.py mercurial/revlogutils/flagutil.py mercurial/revlogutils/sidedata.py tests/flagprocessorext.py tests/simplestorerepo.py tests/test-copies-in-changeset.t tests/test-revlog-raw.py tests/testlib/ext-sidedata.py |
diffstat | 10 files changed, 124 insertions(+), 94 deletions(-) [+] |
line wrap: on
line diff
--- a/hgext/lfs/wrapper.py Wed Jan 20 18:35:12 2021 +0100 +++ b/hgext/lfs/wrapper.py Mon Jan 18 11:44:51 2021 +0100 @@ -116,10 +116,10 @@ if hgmeta or text.startswith(b'\1\n'): text = storageutil.packmeta(hgmeta, text) - return (text, True, {}) + return (text, True) -def writetostore(self, text, sidedata): +def writetostore(self, text): # hg filelog metadata (includes rename, etc) hgmeta, offset = storageutil.parsemeta(text) if offset and offset > 0:
--- a/hgext/remotefilelog/remotefilelog.py Wed Jan 20 18:35:12 2021 +0100 +++ b/hgext/remotefilelog/remotefilelog.py Mon Jan 18 11:44:51 2021 +0100 @@ -155,12 +155,12 @@ # text passed to "addrevision" includes hg filelog metadata header if node is None: node = storageutil.hashrevisionsha1(text, p1, p2) - if sidedata is None: - sidedata = {} meta, metaoffset = storageutil.parsemeta(text) rawtext, validatehash = flagutil.processflagswrite( - self, text, flags, sidedata=sidedata + self, + text, + flags, ) return self.addrawrevision( rawtext,
--- a/mercurial/revlog.py Wed Jan 20 18:35:12 2021 +0100 +++ b/mercurial/revlog.py Mon Jan 18 11:44:51 2021 +0100 @@ -120,10 +120,10 @@ # Flag processors for REVIDX_ELLIPSIS. def ellipsisreadprocessor(rl, text): - return text, False, {} - - -def ellipsiswriteprocessor(rl, text, sidedata): + return text, False + + +def ellipsiswriteprocessor(rl, text): return text, False @@ -554,8 +554,6 @@ if self._mmaplargeindex and b'mmapindexthreshold' in opts: mmapindexthreshold = opts[b'mmapindexthreshold'] self.hassidedata = bool(opts.get(b'side-data', False)) - if self.hassidedata: - self._flagprocessors[REVIDX_SIDEDATA] = sidedatautil.processors self._sparserevlog = bool(opts.get(b'sparse-revlog', False)) withsparseread = bool(opts.get(b'with-sparse-read', False)) # sparse-revlog forces sparse-read @@ -856,6 +854,11 @@ def length(self, rev): return self.index[rev][1] + def sidedata_length(self, rev): + if self.version & 0xFFFF != REVLOGV2: + return 0 + return self.index[rev][9] + def rawsize(self, rev): """return the length of the uncompressed text for a given revision""" l = self.index[rev][2] @@ -917,7 +920,7 @@ # Derived from index values. def end(self, rev): - return self.start(rev) + self.length(rev) + return self.start(rev) + self.length(rev) + self.sidedata_length(rev) def parents(self, node): i = self.index @@ -1853,7 +1856,7 @@ elif operation == b'read': return flagutil.processflagsread(self, text, flags) else: # write operation - return flagutil.processflagswrite(self, text, flags, None) + return flagutil.processflagswrite(self, text, flags) def revision(self, nodeorrev, _df=None, raw=False): """return an uncompressed revision of a given node or revision @@ -1898,10 +1901,17 @@ # revision or might need to be processed to retrieve the revision. rev, rawtext, validated = self._rawtext(node, rev, _df=_df) + if self.version & 0xFFFF == REVLOGV2: + if rev is None: + rev = self.rev(node) + sidedata = self._sidedata(rev) + else: + sidedata = {} + if raw and validated: # if we don't want to process the raw text and that raw # text is cached, we can exit early. - return rawtext, {} + return rawtext, sidedata if rev is None: rev = self.rev(node) # the revlog's flag for this revision @@ -1910,20 +1920,14 @@ if validated and flags == REVIDX_DEFAULT_FLAGS: # no extra flags set, no flag processor runs, text = rawtext - return rawtext, {} - - sidedata = {} + return rawtext, sidedata + if raw: validatehash = flagutil.processflagsraw(self, rawtext, flags) text = rawtext else: - try: - r = flagutil.processflagsread(self, rawtext, flags) - except error.SidedataHashError as exc: - msg = _(b"integrity check failed on %s:%s sidedata key %d") - msg %= (self.indexfile, pycompat.bytestr(rev), exc.sidedatakey) - raise error.RevlogError(msg) - text, validatehash, sidedata = r + r = flagutil.processflagsread(self, rawtext, flags) + text, validatehash = r if validatehash: self.checkhash(text, node, rev=rev) if not validated: @@ -1974,6 +1978,21 @@ del basetext # let us have a chance to free memory early return (rev, rawtext, False) + def _sidedata(self, rev): + """Return the sidedata for a given revision number.""" + index_entry = self.index[rev] + sidedata_offset = index_entry[8] + sidedata_size = index_entry[9] + + if self._inline: + sidedata_offset += self._io.size * (1 + rev) + if sidedata_size == 0: + return {} + + segment = self._getsegment(sidedata_offset, sidedata_size) + sidedata = sidedatautil.deserialize_sidedata(segment) + return sidedata + def rawdata(self, nodeorrev, _df=None): """return an uncompressed raw data of a given node or revision number. @@ -2107,20 +2126,15 @@ if sidedata is None: sidedata = {} - flags = flags & ~REVIDX_SIDEDATA elif not self.hassidedata: raise error.ProgrammingError( _(b"trying to add sidedata to a revlog who don't support them") ) - else: - flags |= REVIDX_SIDEDATA if flags: node = node or self.hash(text, p1, p2) - rawtext, validatehash = flagutil.processflagswrite( - self, text, flags, sidedata=sidedata - ) + rawtext, validatehash = flagutil.processflagswrite(self, text, flags) # If the flag processor modifies the revision data, ignore any provided # cachedelta. @@ -2153,6 +2167,7 @@ flags, cachedelta=cachedelta, deltacomputer=deltacomputer, + sidedata=sidedata, ) def addrawrevision( @@ -2166,6 +2181,7 @@ flags, cachedelta=None, deltacomputer=None, + sidedata=None, ): """add a raw revision with known flags, node and parents useful when reusing a revision not stored in this revlog (ex: received @@ -2188,6 +2204,7 @@ ifh, dfh, deltacomputer=deltacomputer, + sidedata=sidedata, ) finally: if dfh: @@ -2281,6 +2298,7 @@ dfh, alwayscache=False, deltacomputer=None, + sidedata=None, ): """internal function to add revisions to the log @@ -2350,6 +2368,16 @@ deltainfo = deltacomputer.finddeltainfo(revinfo, fh) + if sidedata: + serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) + sidedata_offset = offset + deltainfo.deltalen + else: + serialized_sidedata = b"" + # Don't store the offset if the sidedata is empty, that way + # we can easily detect empty sidedata and they will be no different + # than ones we manually add. + sidedata_offset = 0 + e = ( offset_type(offset, flags), deltainfo.deltalen, @@ -2359,18 +2387,24 @@ p1r, p2r, node, - 0, - 0, + sidedata_offset, + len(serialized_sidedata), ) if self.version & 0xFFFF != REVLOGV2: e = e[:8] self.index.append(e) - entry = self._io.packentry(e, self.node, self.version, curr) self._writeentry( - transaction, ifh, dfh, entry, deltainfo.data, link, offset + transaction, + ifh, + dfh, + entry, + deltainfo.data, + link, + offset, + serialized_sidedata, ) rawtext = btext[0] @@ -2383,7 +2417,9 @@ self._chainbasecache[curr] = deltainfo.chainbase return curr - def _writeentry(self, transaction, ifh, dfh, entry, data, link, offset): + def _writeentry( + self, transaction, ifh, dfh, entry, data, link, offset, sidedata + ): # Files opened in a+ mode have inconsistent behavior on various # platforms. Windows requires that a file positioning call be made # when the file handle transitions between reads and writes. See @@ -2407,6 +2443,8 @@ if data[0]: dfh.write(data[0]) dfh.write(data[1]) + if sidedata: + dfh.write(sidedata) ifh.write(entry) else: offset += curr * self._io.size @@ -2414,6 +2452,8 @@ ifh.write(entry) ifh.write(data[0]) ifh.write(data[1]) + if sidedata: + ifh.write(sidedata) self._enforceinlinesize(transaction, ifh) nodemaputil.setup_persistent_nodemap(transaction, self)
--- a/mercurial/revlogutils/flagutil.py Wed Jan 20 18:35:12 2021 +0100 +++ b/mercurial/revlogutils/flagutil.py Mon Jan 18 11:44:51 2021 +0100 @@ -84,7 +84,7 @@ flagprocessors[flag] = processor -def processflagswrite(revlog, text, flags, sidedata): +def processflagswrite(revlog, text, flags): """Inspect revision data flags and applies write transformations defined by registered flag processors. @@ -100,9 +100,12 @@ processed text and ``validatehash`` is a bool indicating whether the returned text should be checked for hash integrity. """ - return _processflagsfunc(revlog, text, flags, b'write', sidedata=sidedata)[ - :2 - ] + return _processflagsfunc( + revlog, + text, + flags, + b'write', + )[:2] def processflagsread(revlog, text, flags): @@ -145,14 +148,14 @@ return _processflagsfunc(revlog, text, flags, b'raw')[1] -def _processflagsfunc(revlog, text, flags, operation, sidedata=None): +def _processflagsfunc(revlog, text, flags, operation): """internal function to process flag on a revlog This function is private to this module, code should never needs to call it directly.""" # fast path: no flag processors will run if flags == 0: - return text, True, {} + return text, True if operation not in (b'read', b'write', b'raw'): raise error.ProgrammingError(_(b"invalid '%s' operation") % operation) # Check all flags are known. @@ -168,7 +171,6 @@ if operation == b'write': orderedflags = reversed(orderedflags) - outsidedata = {} for flag in orderedflags: # If a flagprocessor has been registered for a known flag, apply the # related operation transform and update result tuple. @@ -186,10 +188,9 @@ if operation == b'raw': vhash = rawtransform(revlog, text) elif operation == b'read': - text, vhash, s = readtransform(revlog, text) - outsidedata.update(s) + text, vhash = readtransform(revlog, text) else: # write operation - text, vhash = writetransform(revlog, text, sidedata) + text, vhash = writetransform(revlog, text) validatehash = validatehash and vhash - return text, validatehash, outsidedata + return text, validatehash
--- a/mercurial/revlogutils/sidedata.py Wed Jan 20 18:35:12 2021 +0100 +++ b/mercurial/revlogutils/sidedata.py Mon Jan 18 11:44:51 2021 +0100 @@ -13,9 +13,8 @@ The current implementation is experimental and subject to changes. Do not rely on it in production. -Sidedata are stored in the revlog itself, within the revision rawtext. They -are inserted and removed from it using the flagprocessors mechanism. The following -format is currently used:: +Sidedata are stored in the revlog itself, thanks to a new version of the +revlog. The following format is currently used:: initial header: <number of sidedata; 2 bytes> @@ -60,48 +59,35 @@ SIDEDATA_ENTRY = struct.Struct('>HL20s') -def sidedatawriteprocessor(rl, text, sidedata): +def serialize_sidedata(sidedata): sidedata = list(sidedata.items()) sidedata.sort() - rawtext = [SIDEDATA_HEADER.pack(len(sidedata))] + buf = [SIDEDATA_HEADER.pack(len(sidedata))] for key, value in sidedata: digest = hashutil.sha1(value).digest() - rawtext.append(SIDEDATA_ENTRY.pack(key, len(value), digest)) + buf.append(SIDEDATA_ENTRY.pack(key, len(value), digest)) for key, value in sidedata: - rawtext.append(value) - rawtext.append(bytes(text)) - return b''.join(rawtext), False + buf.append(value) + buf = b''.join(buf) + return buf -def sidedatareadprocessor(rl, text): +def deserialize_sidedata(blob): sidedata = {} offset = 0 - (nbentry,) = SIDEDATA_HEADER.unpack(text[: SIDEDATA_HEADER.size]) + (nbentry,) = SIDEDATA_HEADER.unpack(blob[: SIDEDATA_HEADER.size]) offset += SIDEDATA_HEADER.size dataoffset = SIDEDATA_HEADER.size + (SIDEDATA_ENTRY.size * nbentry) for i in range(nbentry): nextoffset = offset + SIDEDATA_ENTRY.size - key, size, storeddigest = SIDEDATA_ENTRY.unpack(text[offset:nextoffset]) + key, size, storeddigest = SIDEDATA_ENTRY.unpack(blob[offset:nextoffset]) offset = nextoffset # read the data associated with that entry nextdataoffset = dataoffset + size - entrytext = text[dataoffset:nextdataoffset] + entrytext = bytes(blob[dataoffset:nextdataoffset]) readdigest = hashutil.sha1(entrytext).digest() if storeddigest != readdigest: raise error.SidedataHashError(key, storeddigest, readdigest) sidedata[key] = entrytext dataoffset = nextdataoffset - text = text[dataoffset:] - return text, True, sidedata - - -def sidedatarawprocessor(rl, text): - # side data modifies rawtext and prevent rawtext hash validation - return False - - -processors = ( - sidedatareadprocessor, - sidedatawriteprocessor, - sidedatarawprocessor, -) + return sidedata
--- a/tests/flagprocessorext.py Wed Jan 20 18:35:12 2021 +0100 +++ b/tests/flagprocessorext.py Mon Jan 18 11:44:51 2021 +0100 @@ -31,28 +31,28 @@ return False -def noopdonothing(self, text, sidedata): +def noopdonothing(self, text): return (text, True) def noopdonothingread(self, text): - return (text, True, {}) + return (text, True) -def b64encode(self, text, sidedata): +def b64encode(self, text): return (base64.b64encode(text), False) def b64decode(self, text): - return (base64.b64decode(text), True, {}) + return (base64.b64decode(text), True) -def gzipcompress(self, text, sidedata): +def gzipcompress(self, text): return (zlib.compress(text), False) def gzipdecompress(self, text): - return (zlib.decompress(text), True, {}) + return (zlib.decompress(text), True) def supportedoutgoingversions(orig, repo):
--- a/tests/simplestorerepo.py Wed Jan 20 18:35:12 2021 +0100 +++ b/tests/simplestorerepo.py Mon Jan 18 11:44:51 2021 +0100 @@ -300,7 +300,7 @@ text = rawtext else: r = flagutil.processflagsread(self, rawtext, flags) - text, validatehash, sidedata = r + text, validatehash = r if validatehash: self.checkhash(text, node, rev=rev)
--- a/tests/test-copies-in-changeset.t Wed Jan 20 18:35:12 2021 +0100 +++ b/tests/test-copies-in-changeset.t Mon Jan 18 11:44:51 2021 +0100 @@ -271,12 +271,13 @@ $ hg ci --amend -m 'copy a to j, v2' saved backup bundle to $TESTTMP/repo/.hg/strip-backup/*-*-amend.hg (glob) $ hg debugsidedata -c -v -- -1 - 1 sidedata entries - entry-0014 size 24 - '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj' + 1 sidedata entries (missing-correct-output !) + entry-0014 size 24 (missing-correct-output !) + '\x00\x00\x00\x02\x00\x00\x00\x00\x01\x00\x00\x00\x00\x06\x00\x00\x00\x02\x00\x00\x00\x00aj' (missing-correct-output !) #endif $ hg showcopies --config experimental.copies.read-from=filelog-only - a -> j + a -> j (sidedata missing-correct-output !) + a -> j (no-sidedata !) The entries should be written to extras even if they're empty (so the client won't have to fall back to reading from filelogs) $ echo x >> j @@ -354,7 +355,8 @@ saved backup bundle to $TESTTMP/rebase-rename/.hg/strip-backup/*-*-rebase.hg (glob) $ hg st --change . --copies A b - a + a (sidedata missing-correct-output !) + a (no-sidedata !) R a $ cd ..
--- a/tests/test-revlog-raw.py Wed Jan 20 18:35:12 2021 +0100 +++ b/tests/test-revlog-raw.py Mon Jan 18 11:44:51 2021 +0100 @@ -51,10 +51,10 @@ def readprocessor(self, rawtext): # True: the returned text could be used to verify hash text = rawtext[len(_extheader) :].replace(b'i', b'1') - return text, True, {} + return text, True -def writeprocessor(self, text, sidedata): +def writeprocessor(self, text): # False: the returned rawtext shouldn't be used to verify hash rawtext = _extheader + text.replace(b'1', b'i') return rawtext, False @@ -293,7 +293,7 @@ # Verify text, rawtext, and rawsize if isext: - rawtext = writeprocessor(None, text, {})[0] + rawtext = writeprocessor(None, text)[0] else: rawtext = text if rlog.rawsize(rev) != len(rawtext):
--- a/tests/testlib/ext-sidedata.py Wed Jan 20 18:35:12 2021 +0100 +++ b/tests/testlib/ext-sidedata.py Mon Jan 18 11:44:51 2021 +0100 @@ -40,19 +40,20 @@ return orig(self, text, transaction, link, p1, p2, *args, **kwargs) -def wraprevision(orig, self, nodeorrev, *args, **kwargs): - text = orig(self, nodeorrev, *args, **kwargs) +def wrap_revisiondata(orig, self, nodeorrev, *args, **kwargs): + text, sd = orig(self, nodeorrev, *args, **kwargs) if getattr(self, 'sidedatanocheck', False): - return text + return text, sd + if self.version & 0xFFFF != 2: + return text, sd if nodeorrev != nullrev and nodeorrev != nullid: - sd = self.sidedata(nodeorrev) if len(text) != struct.unpack('>I', sd[sidedata.SD_TEST1])[0]: raise RuntimeError('text size mismatch') expected = sd[sidedata.SD_TEST2] got = hashlib.sha256(text).digest() if got != expected: raise RuntimeError('sha256 mismatch') - return text + return text, sd def wrapgetsidedatacompanion(orig, srcrepo, dstrepo): @@ -81,7 +82,7 @@ def extsetup(ui): extensions.wrapfunction(revlog.revlog, 'addrevision', wrapaddrevision) - extensions.wrapfunction(revlog.revlog, 'revision', wraprevision) + extensions.wrapfunction(revlog.revlog, '_revisiondata', wrap_revisiondata) extensions.wrapfunction( upgrade_engine, 'getsidedatacompanion', wrapgetsidedatacompanion )