# HG changeset patch # User Pierre-Yves David # Date 1698192798 -7200 # Node ID a82704902db88245d19043f7944bace0fffdc4bc # Parent de6a8cc24de398e17e9d9088b697619520cfc14a revlog: move the compression/decompression logic on the inner object This is a necessary step before we can move more of the logic for restoring a revision's content onto the inner object. For now, we apply a simple patch to the perf extension logic; when the implementation of the inner object changes, the API will likely need to evolve. However, this is true of many things in the perf extension, so we will deal with it later. diff -r de6a8cc24de3 -r a82704902db8 contrib/perf.py --- a/contrib/perf.py Mon Oct 23 14:27:07 2023 +0200 +++ b/contrib/perf.py Wed Oct 25 02:13:18 2023 +0200 @@ -3833,14 +3833,16 @@ def docompress(compressor): rl.clearcaches() + compressor_holder = getattr(rl, '_inner', rl) + try: # Swap in the requested compression engine. - oldcompressor = rl._compressor - rl._compressor = compressor + oldcompressor = compressor_holder._compressor + compressor_holder._compressor = compressor for chunk in chunks[0]: rl.compress(chunk) finally: - rl._compressor = oldcompressor + compressor_holder._compressor = oldcompressor benches = [ (lambda: doread(), b'read'), diff -r de6a8cc24de3 -r a82704902db8 mercurial/revlog.py --- a/mercurial/revlog.py Mon Oct 23 14:27:07 2023 +0200 +++ b/mercurial/revlog.py Wed Oct 25 02:13:18 2023 +0200 @@ -353,7 +353,9 @@ sidedata_file, inline, data_config, + feature_config, chunk_cache, + default_compression_header, ): self.opener = opener self.index = index @@ -363,6 +365,9 @@ self.sidedata_file = sidedata_file self.inline = inline self.data_config = data_config + self.feature_config = feature_config + + self._default_compression_header = default_compression_header # index @@ -381,6 +386,9 @@ self.data_config.chunk_cache_size, ) + # revlog header -> revlog compressor + self._decompressors = {} + @property def index_file(self): return self.__index_file @@ -405,6 +413,103 
@@ """the end of the data chunk for this revision""" return self.start(rev) + self.length(rev) + @util.propertycache + def _compressor(self): + engine = util.compengines[self.feature_config.compression_engine] + return engine.revlogcompressor( + self.feature_config.compression_engine_options + ) + + @util.propertycache + def _decompressor(self): + """the default decompressor""" + if self._default_compression_header is None: + return None + t = self._default_compression_header + c = self._get_decompressor(t) + return c.decompress + + def _get_decompressor(self, t): + try: + compressor = self._decompressors[t] + except KeyError: + try: + engine = util.compengines.forrevlogheader(t) + compressor = engine.revlogcompressor( + self.feature_config.compression_engine_options + ) + self._decompressors[t] = compressor + except KeyError: + raise error.RevlogError( + _(b'unknown compression type %s') % binascii.hexlify(t) + ) + return compressor + + def compress(self, data): + """Generate a possibly-compressed representation of data.""" + if not data: + return b'', data + + compressed = self._compressor.compress(data) + + if compressed: + # The revlog compressor added the header in the returned data. + return b'', compressed + + if data[0:1] == b'\0': + return b'', data + return b'u', data + + def decompress(self, data): + """Decompress a revlog chunk. + + The chunk is expected to begin with a header identifying the + format type so it can be routed to an appropriate decompressor. + """ + if not data: + return data + + # Revlogs are read much more frequently than they are written and many + # chunks only take microseconds to decompress, so performance is + # important here. + # + # We can make a few assumptions about revlogs: + # + # 1) the majority of chunks will be compressed (as opposed to inline + # raw data). + # 2) decompressing *any* data will likely by at least 10x slower than + # returning raw inline data. 
+ # 3) we want to prioritize common and officially supported compression + # engines + # + # It follows that we want to optimize for "decompress compressed data + # when encoded with common and officially supported compression engines" + # case over "raw data" and "data encoded by less common or non-official + # compression engines." That is why we have the inline lookup first + # followed by the compengines lookup. + # + # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib + # compressed chunks. And this matters for changelog and manifest reads. + t = data[0:1] + + if t == b'x': + try: + return _zlibdecompress(data) + except zlib.error as e: + raise error.RevlogError( + _(b'revlog decompress error: %s') + % stringutil.forcebytestr(e) + ) + # '\0' is more common than 'u' so it goes first. + elif t == b'\0': + return data + elif t == b'u': + return util.buffer(data, 1) + + compressor = self._get_decompressor(t) + + return compressor.decompress(data) + @contextlib.contextmanager def reading(self): """Context manager that keeps data and sidedata files open for reading""" @@ -1284,12 +1389,15 @@ self.index = index # revnum -> (chain-length, sum-delta-length) self._chaininfocache = util.lrucachedict(500) - # revlog header -> revlog compressor - self._decompressors = {} return chunkcache def _load_inner(self, chunk_cache): + if self._docket is None: + default_compression_header = None + else: + default_compression_header = self._docket.default_compression_header + self._inner = _InnerRevlog( opener=self.opener, index=self.index, @@ -1298,7 +1406,9 @@ sidedata_file=self._sidedatafile, inline=self._inline, data_config=self.data_config, + feature_config=self.feature_config, chunk_cache=chunk_cache, + default_compression_header=default_compression_header, ) def get_revlog(self): @@ -1319,38 +1429,6 @@ else: return self.radix - def _get_decompressor(self, t): - try: - compressor = self._decompressors[t] - except KeyError: - try: - engine = 
util.compengines.forrevlogheader(t) - compressor = engine.revlogcompressor( - self.feature_config.compression_engine_options - ) - self._decompressors[t] = compressor - except KeyError: - raise error.RevlogError( - _(b'unknown compression type %s') % binascii.hexlify(t) - ) - return compressor - - @util.propertycache - def _compressor(self): - engine = util.compengines[self.feature_config.compression_engine] - return engine.revlogcompressor( - self.feature_config.compression_engine_options - ) - - @util.propertycache - def _decompressor(self): - """the default decompressor""" - if self._docket is None: - return None - t = self._docket.default_compression_header - c = self._get_decompressor(t) - return c.decompress - def _datafp(self, mode=b'r'): """file object for the revlog's data file""" return self.opener(self._datafile, mode=mode) @@ -2272,9 +2350,9 @@ if compression_mode == COMP_MODE_PLAIN: return data elif compression_mode == COMP_MODE_DEFAULT: - return self._decompressor(data) + return self._inner._decompressor(data) elif compression_mode == COMP_MODE_INLINE: - return self.decompress(data) + return self._inner.decompress(data) else: msg = b'unknown compression mode %d' msg %= compression_mode @@ -2328,9 +2406,9 @@ # 2G on Windows return [self._chunk(rev) for rev in revschunk] - decomp = self.decompress + decomp = self._inner.decompress # self._decompressor might be None, but will not be used in that case - def_decomp = self._decompressor + def_decomp = self._inner._decompressor for rev in revschunk: chunkstart = start(rev) if inline: @@ -2544,9 +2622,9 @@ if comp == COMP_MODE_PLAIN: segment = comp_segment elif comp == COMP_MODE_DEFAULT: - segment = self._decompressor(comp_segment) + segment = self._inner._decompressor(comp_segment) elif comp == COMP_MODE_INLINE: - segment = self.decompress(comp_segment) + segment = self._inner.decompress(comp_segment) else: msg = b'unknown compression mode %d' msg %= comp @@ -2842,69 +2920,10 @@ ) def compress(self, data): - 
"""Generate a possibly-compressed representation of data.""" - if not data: - return b'', data - - compressed = self._compressor.compress(data) - - if compressed: - # The revlog compressor added the header in the returned data. - return b'', compressed - - if data[0:1] == b'\0': - return b'', data - return b'u', data + return self._inner.compress(data) def decompress(self, data): - """Decompress a revlog chunk. - - The chunk is expected to begin with a header identifying the - format type so it can be routed to an appropriate decompressor. - """ - if not data: - return data - - # Revlogs are read much more frequently than they are written and many - # chunks only take microseconds to decompress, so performance is - # important here. - # - # We can make a few assumptions about revlogs: - # - # 1) the majority of chunks will be compressed (as opposed to inline - # raw data). - # 2) decompressing *any* data will likely by at least 10x slower than - # returning raw inline data. - # 3) we want to prioritize common and officially supported compression - # engines - # - # It follows that we want to optimize for "decompress compressed data - # when encoded with common and officially supported compression engines" - # case over "raw data" and "data encoded by less common or non-official - # compression engines." That is why we have the inline lookup first - # followed by the compengines lookup. - # - # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib - # compressed chunks. And this matters for changelog and manifest reads. - t = data[0:1] - - if t == b'x': - try: - return _zlibdecompress(data) - except zlib.error as e: - raise error.RevlogError( - _(b'revlog decompress error: %s') - % stringutil.forcebytestr(e) - ) - # '\0' is more common than 'u' so it goes first. 
- elif t == b'\0': - return data - elif t == b'u': - return util.buffer(data, 1) - - compressor = self._get_decompressor(t) - - return compressor.decompress(data) + return self._inner.decompress(data) def _addrevision( self, @@ -3029,7 +3048,7 @@ sidedata_compression_mode = COMP_MODE_PLAIN serialized_sidedata = sidedatautil.serialize_sidedata(sidedata) sidedata_offset = self._docket.sidedata_end - h, comp_sidedata = self.compress(serialized_sidedata) + h, comp_sidedata = self._inner.compress(serialized_sidedata) if ( h != b'u' and comp_sidedata[0:1] != b'\0' @@ -3876,7 +3895,7 @@ sidedata_compression_mode = COMP_MODE_INLINE if serialized_sidedata and self.feature_config.has_side_data: sidedata_compression_mode = COMP_MODE_PLAIN - h, comp_sidedata = self.compress(serialized_sidedata) + h, comp_sidedata = self._inner.compress(serialized_sidedata) if ( h != b'u' and comp_sidedata[0] != b'\0' diff -r de6a8cc24de3 -r a82704902db8 mercurial/revlogutils/deltas.py --- a/mercurial/revlogutils/deltas.py Mon Oct 23 14:27:07 2023 +0200 +++ b/mercurial/revlogutils/deltas.py Wed Oct 25 02:13:18 2023 +0200 @@ -1205,7 +1205,7 @@ msg = b"DBG-DELTAS-SEARCH: DISCARDED (prev size)\n" self._write_debug(msg) return None - header, data = revlog.compress(delta) + header, data = revlog._inner.compress(delta) deltalen = len(header) + len(data) offset = revlog.end(len(revlog) - 1) dist = deltalen + offset - revlog.start(chainbase) @@ -1226,7 +1226,7 @@ def _fullsnapshotinfo(self, revinfo, curr): rawtext = self.buildtext(revinfo) - data = self.revlog.compress(rawtext) + data = self.revlog._inner.compress(rawtext) compresseddeltalen = deltalen = dist = len(data[1]) + len(data[0]) deltabase = chainbase = curr snapshotdepth = 0