comparison mercurial/revlog.py @ 30795:78ac56aebab6

revlog: use compression engine API for compression

This commit swaps in the just-added revlog compressor API into the revlog class.

Instead of implementing zlib compression inline in compress(), we now store a cached-on-first-use revlog compressor on each revlog instance and invoke its "compress()" method.

As part of this, revlog.compress() has been refactored a bit to use a cleaner code flow and modern formatting (e.g. avoiding parentheses around returned tuples).

On a mozilla-unified repo, here are the "compress" times for a few commands:

$ hg perfrevlogchunks -c
! wall 5.772450 comb 5.780000 user 5.780000 sys 0.000000 (best of 3)
! wall 5.795158 comb 5.790000 user 5.790000 sys 0.000000 (best of 3)

$ hg perfrevlogchunks -m
! wall 9.975789 comb 9.970000 user 9.970000 sys 0.000000 (best of 3)
! wall 10.019505 comb 10.010000 user 10.010000 sys 0.000000 (best of 3)

Compression times did seem to slow down just a little. There are 360,210 changelog revisions and 359,342 manifest revisions. For the changelog, mean time to compress a revision increased from ~16.025us to ~16.088us. That's basically a function call or an attribute lookup. I suppose this is the price you pay for abstraction. It's so low that I'm not concerned.
author Gregory Szorc <gregory.szorc@gmail.com>
date Mon, 02 Jan 2017 11:22:52 -0800
parents b6f455a6e4d6
children 2b279126b8f5
comparison
equal deleted inserted replaced
30794:31e1f0d4ab44 30795:78ac56aebab6
37 util, 37 util,
38 ) 38 )
39 39
40 _pack = struct.pack 40 _pack = struct.pack
41 _unpack = struct.unpack 41 _unpack = struct.unpack
42 _compress = zlib.compress
43 _decompress = zlib.decompress 42 _decompress = zlib.decompress
44 43
45 # revlog header flags 44 # revlog header flags
46 REVLOGV0 = 0 45 REVLOGV0 = 0
47 REVLOGNG = 1 46 REVLOGNG = 1
338 self.nodemap = self._nodecache = nodemap 337 self.nodemap = self._nodecache = nodemap
339 if not self._chunkcache: 338 if not self._chunkcache:
340 self._chunkclear() 339 self._chunkclear()
341 # revnum -> (chain-length, sum-delta-length) 340 # revnum -> (chain-length, sum-delta-length)
342 self._chaininfocache = {} 341 self._chaininfocache = {}
342
343 @util.propertycache
344 def _compressor(self):
345 return util.compengines['zlib'].revlogcompressor()
343 346
344 def tip(self): 347 def tip(self):
345 return self.node(len(self.index) - 2) 348 return self.node(len(self.index) - 2)
346 def __contains__(self, rev): 349 def __contains__(self, rev):
347 return 0 <= rev < len(self) 350 return 0 <= rev < len(self)
1463 finally: 1466 finally:
1464 if dfh: 1467 if dfh:
1465 dfh.close() 1468 dfh.close()
1466 ifh.close() 1469 ifh.close()
1467 1470
1468 def compress(self, text): 1471 def compress(self, data):
1469 """ generate a possibly-compressed representation of text """ 1472 """Generate a possibly-compressed representation of data."""
1470 if not text: 1473 if not data:
1471 return ("", text) 1474 return '', data
1472 l = len(text) 1475
1473 bin = None 1476 compressed = self._compressor.compress(data)
1474 if l < 44: 1477
1475 pass 1478 if compressed:
1476 elif l > 1000000: 1479 # The revlog compressor added the header in the returned data.
1477 # zlib makes an internal copy, thus doubling memory usage for 1480 return '', compressed
1478 # large files, so lets do this in pieces 1481
1479 z = zlib.compressobj() 1482 if data[0] == '\0':
1480 p = [] 1483 return '', data
1481 pos = 0 1484 return 'u', data
1482 while pos < l:
1483 pos2 = pos + 2**20
1484 p.append(z.compress(text[pos:pos2]))
1485 pos = pos2
1486 p.append(z.flush())
1487 if sum(map(len, p)) < l:
1488 bin = "".join(p)
1489 else:
1490 bin = _compress(text)
1491 if bin is None or len(bin) >= l:
1492 if text[0] == '\0':
1493 return ("", text)
1494 return ('u', text)
1495 return ("", bin)
1496 1485
1497 def decompress(self, data): 1486 def decompress(self, data):
1498 """Decompress a revlog chunk. 1487 """Decompress a revlog chunk.
1499 1488
1500 The chunk is expected to begin with a header identifying the 1489 The chunk is expected to begin with a header identifying the