Mercurial > hg

--- a/mercurial/revlog.py	Fri Jan 13 10:22:25 2017 +0100
+++ b/mercurial/revlog.py	Fri Jan 13 19:58:00 2017 -0800
@@ -39,7 +39,8 @@

 _pack = struct.pack
 _unpack = struct.unpack
-_decompress = zlib.decompress
+# Aliased for performance.
+_zlibdecompress = zlib.decompress

 # revlog header flags
 REVLOGV0 = 0
@@ -339,6 +340,8 @@
             self._chunkclear()
         # revnum -> (chain-length, sum-delta-length)
         self._chaininfocache = {}
+        # revlog header -> revlog compressor
+        self._decompressors = {}

     @util.propertycache
     def _compressor(self):
@@ -1491,17 +1494,52 @@
         """
         if not data:
             return data
+
+        # Revlogs are read much more frequently than they are written and many
+        # chunks only take microseconds to decompress, so performance is
+        # important here.
+        #
+        # We can make a few assumptions about revlogs:
+        #
+        # 1) the majority of chunks will be compressed (as opposed to inline
+        #    raw data).
+        # 2) decompressing *any* data will likely by at least 10x slower than
+        #    returning raw inline data.
+        # 3) we want to prioritize common and officially supported compression
+        #    engines
+        #
+        # It follows that we want to optimize for "decompress compressed data
+        # when encoded with common and officially supported compression engines"
+        # case over "raw data" and "data encoded by less common or non-official
+        # compression engines." That is why we have the inline lookup first
+        # followed by the compengines lookup.
+        #
+        # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
+        # compressed chunks. And this matters for changelog and manifest reads.
         t = data[0]
-        if t == '\0':
-            return data
+
         if t == 'x':
             try:
-                return _decompress(data)
+                return _zlibdecompress(data)
             except zlib.error as e:
                 raise RevlogError(_('revlog decompress error: %s') % str(e))
-        if t == 'u':
+        # '\0' is more common than 'u' so it goes first.
+        elif t == '\0':
+            return data
+        elif t == 'u':
             return util.buffer(data, 1)
-        raise RevlogError(_('unknown compression type %r') % t)
+
+        try:
+            compressor = self._decompressors[t]
+        except KeyError:
+            try:
+                engine = util.compengines.forrevlogheader(t)
+                compressor = engine.revlogcompressor()
+                self._decompressors[t] = compressor
+            except KeyError:
+                raise RevlogError(_('unknown compression type %r') % t)
+
+        return compressor.decompress(data)

     def _isgooddelta(self, d, textlen):
         """Returns True if the given delta is good. Good means that it is within