revlog: add a small cache of unfiltered chunk
This can provide a massive boost to the reading of multiple revisions and the
computation of a valid delta chain.
This greatly helps operations like `hg log --patch`, delta computation (helping
pull/unbundle), and linkrev adjustment (helping copy tracing).
A first round of benchmarks for `hg log --patch --limit 1000` shows improvements
in the 10-20% range on "small" repositories like pypy or mercurial, and large
improvements (about 33%) for more complex ones like netbeans and mozilla's.
These speed-ups are consistent with the improvement to `hg pull` (from a server
sending poor deltas) that I saw when benchmarking this last year. Further
benchmarks will be run during the freeze.
I added some configuration in the experimental space to be able to further test
the effect of various tuning for now. This feature should fit well in the
"usage/resource profile" configuration that we should land next cycle.
When it does not provide a benefit, the overhead of the cache seems to be around
2%, a small price for the big improvement. In addition, I believe we could shave
off most of this overhead with a more efficient LRU implementation.
--- a/mercurial/configitems.toml Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/configitems.toml Fri Oct 27 08:54:41 2023 +0200
@@ -1111,6 +1111,28 @@
[[items]]
section = "experimental"
+name = "revlog.uncompressed-cache.enabled"
+default = true
+experimental = true
+documentation = """Enable some caching of uncompressed chunks, greatly boosting
+performance at the cost of memory usage."""
+
+[[items]]
+section = "experimental"
+name = "revlog.uncompressed-cache.factor"
+default = 4
+experimental = true
+documentation = """The size of the cache compared to the largest revision seen."""
+
+[[items]]
+section = "experimental"
+name = "revlog.uncompressed-cache.count"
+default = 10000
+experimental = true
+documentation = """The number of chunks cached."""
+
+[[items]]
+section = "experimental"
name = "stream-v3"
default = false
--- a/mercurial/localrepo.py Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/localrepo.py Fri Oct 27 08:54:41 2023 +0200
@@ -1089,6 +1089,16 @@
if chunkcachesize is not None:
data_config.chunk_cache_size = chunkcachesize
+ if ui.configbool(b'experimental', b'revlog.uncompressed-cache.enabled'):
+ factor = ui.configint(
+ b'experimental', b'revlog.uncompressed-cache.factor'
+ )
+ count = ui.configint(
+ b'experimental', b'revlog.uncompressed-cache.count'
+ )
+ data_config.uncompressed_cache_factor = factor
+ data_config.uncompressed_cache_count = count
+
delta_config.delta_both_parents = ui.configbool(
b'storage', b'revlog.optimize-delta-parent-choice'
)
--- a/mercurial/revlog.py Fri Oct 27 02:57:09 2023 +0200
+++ b/mercurial/revlog.py Fri Oct 27 08:54:41 2023 +0200
@@ -295,6 +295,12 @@
# How much data to read and cache into the raw revlog data cache.
chunk_cache_size = attr.ib(default=65536)
+ # The size of the uncompressed cache compared to the largest revision seen.
+ uncompressed_cache_factor = attr.ib(default=None)
+
+    # The number of chunks cached
+ uncompressed_cache_count = attr.ib(default=None)
+
# Allow sparse reading of the revlog data
with_sparse_read = attr.ib(default=False)
# minimal density of a sparse read chunk
@@ -396,6 +402,18 @@
# 3-tuple of (node, rev, text) for a raw revision.
self._revisioncache = None
+ # cache some uncompressed chunks
+ # rev → uncompressed_chunk
+ #
+    # the max cost is dynamically updated to be proportional to the
+ # size of revision we actually encounter.
+ self._uncompressed_chunk_cache = None
+ if self.data_config.uncompressed_cache_factor is not None:
+ self._uncompressed_chunk_cache = util.lrucachedict(
+ self.data_config.uncompressed_cache_count,
+ maxcost=65536, # some arbitrary initial value
+ )
+
self._delay_buffer = None
@property
@@ -414,6 +432,8 @@
def clear_cache(self):
assert not self.is_delaying
self._revisioncache = None
+ if self._uncompressed_chunk_cache is not None:
+ self._uncompressed_chunk_cache.clear()
self._segmentfile.clear_cache()
self._segmentfile_sidedata.clear_cache()
@@ -865,18 +885,26 @@
Returns a str holding uncompressed data for the requested revision.
"""
+ if self._uncompressed_chunk_cache is not None:
+ uncomp = self._uncompressed_chunk_cache.get(rev)
+ if uncomp is not None:
+ return uncomp
+
compression_mode = self.index[rev][10]
data = self.get_segment_for_revs(rev, rev)[1]
if compression_mode == COMP_MODE_PLAIN:
- return data
+ uncomp = data
elif compression_mode == COMP_MODE_DEFAULT:
- return self._decompressor(data)
+ uncomp = self._decompressor(data)
elif compression_mode == COMP_MODE_INLINE:
- return self.decompress(data)
+ uncomp = self.decompress(data)
else:
msg = b'unknown compression mode %d'
msg %= compression_mode
raise error.RevlogError(msg)
+ if self._uncompressed_chunk_cache is not None:
+ self._uncompressed_chunk_cache.insert(rev, uncomp, cost=len(uncomp))
+ return uncomp
def _chunks(self, revs, targetsize=None):
"""Obtain decompressed chunks for the specified revisions.
@@ -899,17 +927,30 @@
iosize = self.index.entry_size
buffer = util.buffer
- l = []
- ladd = l.append
+ fetched_revs = []
+ fadd = fetched_revs.append
+
chunks = []
ladd = chunks.append
- if not self.data_config.with_sparse_read:
- slicedchunks = (revs,)
+ if self._uncompressed_chunk_cache is None:
+ fetched_revs = revs
+ else:
+ for rev in revs:
+ cached_value = self._uncompressed_chunk_cache.get(rev)
+ if cached_value is None:
+ fadd(rev)
+ else:
+ ladd((rev, cached_value))
+
+ if not fetched_revs:
+ slicedchunks = ()
+ elif not self.data_config.with_sparse_read:
+ slicedchunks = (fetched_revs,)
else:
slicedchunks = deltautil.slicechunk(
self,
- revs,
+ fetched_revs,
targetsize=targetsize,
)
@@ -949,7 +990,10 @@
msg %= comp_mode
raise error.RevlogError(msg)
ladd((rev, c))
-
+ if self._uncompressed_chunk_cache is not None:
+ self._uncompressed_chunk_cache.insert(rev, c, len(c))
+
+ chunks.sort()
return [x[1] for x in chunks]
def raw_text(self, node, rev):
@@ -981,6 +1025,14 @@
if 0 <= rawsize:
targetsize = 4 * rawsize
+ if self._uncompressed_chunk_cache is not None:
+ # dynamically update the uncompressed_chunk_cache size to the
+ # largest revision we saw in this revlog.
+ factor = self.data_config.uncompressed_cache_factor
+ candidate_size = rawsize * factor
+ if candidate_size > self._uncompressed_chunk_cache.maxcost:
+ self._uncompressed_chunk_cache.maxcost = candidate_size
+
bins = self._chunks(chain, targetsize=targetsize)
if basetext is None:
basetext = bytes(bins[0])