mercurial/revlog.py
changeset 51095 a82704902db8
parent 51094 de6a8cc24de3
child 51096 9c8df10ea6e0
equal deleted inserted replaced
51094:de6a8cc24de3 51095:a82704902db8
   351         index_file,
   351         index_file,
   352         data_file,
   352         data_file,
   353         sidedata_file,
   353         sidedata_file,
   354         inline,
   354         inline,
   355         data_config,
   355         data_config,
       
   356         feature_config,
   356         chunk_cache,
   357         chunk_cache,
       
   358         default_compression_header,
   357     ):
   359     ):
   358         self.opener = opener
   360         self.opener = opener
   359         self.index = index
   361         self.index = index
   360 
   362 
   361         self.__index_file = index_file
   363         self.__index_file = index_file
   362         self.data_file = data_file
   364         self.data_file = data_file
   363         self.sidedata_file = sidedata_file
   365         self.sidedata_file = sidedata_file
   364         self.inline = inline
   366         self.inline = inline
   365         self.data_config = data_config
   367         self.data_config = data_config
       
   368         self.feature_config = feature_config
       
   369 
       
   370         self._default_compression_header = default_compression_header
   366 
   371 
   367         # index
   372         # index
   368 
   373 
   369         # 3-tuple of file handles being used for active writing.
   374         # 3-tuple of file handles being used for active writing.
   370         self._writinghandles = None
   375         self._writinghandles = None
   379             self.opener,
   384             self.opener,
   380             self.sidedata_file,
   385             self.sidedata_file,
   381             self.data_config.chunk_cache_size,
   386             self.data_config.chunk_cache_size,
   382         )
   387         )
   383 
   388 
       
   389         # revlog header -> revlog compressor
       
   390         self._decompressors = {}
       
   391 
   384     @property
   392     @property
   385     def index_file(self):
   393     def index_file(self):
   386         return self.__index_file
   394         return self.__index_file
   387 
   395 
   388     @index_file.setter
   396     @index_file.setter
   402         return self.index[rev][1]
   410         return self.index[rev][1]
   403 
   411 
   404     def end(self, rev):
   412     def end(self, rev):
   405         """the end of the data chunk for this revision"""
   413         """the end of the data chunk for this revision"""
   406         return self.start(rev) + self.length(rev)
   414         return self.start(rev) + self.length(rev)
       
   415 
       
   416     @util.propertycache
       
   417     def _compressor(self):
       
   418         engine = util.compengines[self.feature_config.compression_engine]
       
   419         return engine.revlogcompressor(
       
   420             self.feature_config.compression_engine_options
       
   421         )
       
   422 
       
   423     @util.propertycache
       
   424     def _decompressor(self):
       
   425         """the default decompressor"""
       
   426         if self._default_compression_header is None:
       
   427             return None
       
   428         t = self._default_compression_header
       
   429         c = self._get_decompressor(t)
       
   430         return c.decompress
       
   431 
       
   432     def _get_decompressor(self, t):
       
   433         try:
       
   434             compressor = self._decompressors[t]
       
   435         except KeyError:
       
   436             try:
       
   437                 engine = util.compengines.forrevlogheader(t)
       
   438                 compressor = engine.revlogcompressor(
       
   439                     self.feature_config.compression_engine_options
       
   440                 )
       
   441                 self._decompressors[t] = compressor
       
   442             except KeyError:
       
   443                 raise error.RevlogError(
       
   444                     _(b'unknown compression type %s') % binascii.hexlify(t)
       
   445                 )
       
   446         return compressor
       
   447 
       
   448     def compress(self, data):
       
   449         """Generate a possibly-compressed representation of data."""
       
   450         if not data:
       
   451             return b'', data
       
   452 
       
   453         compressed = self._compressor.compress(data)
       
   454 
       
   455         if compressed:
       
   456             # The revlog compressor added the header in the returned data.
       
   457             return b'', compressed
       
   458 
       
   459         if data[0:1] == b'\0':
       
   460             return b'', data
       
   461         return b'u', data
       
   462 
       
   463     def decompress(self, data):
       
   464         """Decompress a revlog chunk.
       
   465 
       
   466         The chunk is expected to begin with a header identifying the
       
   467         format type so it can be routed to an appropriate decompressor.
       
   468         """
       
   469         if not data:
       
   470             return data
       
   471 
       
   472         # Revlogs are read much more frequently than they are written and many
       
   473         # chunks only take microseconds to decompress, so performance is
       
   474         # important here.
       
   475         #
       
   476         # We can make a few assumptions about revlogs:
       
   477         #
       
   478         # 1) the majority of chunks will be compressed (as opposed to inline
       
   479         #    raw data).
       
   480         # 2) decompressing *any* data will likely be at least 10x slower than
       
   481         #    returning raw inline data.
       
   482         # 3) we want to prioritize common and officially supported compression
       
   483         #    engines
       
   484         #
       
   485         # It follows that we want to optimize for "decompress compressed data
       
   486         # when encoded with common and officially supported compression engines"
       
   487         # case over "raw data" and "data encoded by less common or non-official
       
   488         # compression engines." That is why we have the inline lookup first
       
   489         # followed by the compengines lookup.
       
   490         #
       
   491         # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
       
   492         # compressed chunks. And this matters for changelog and manifest reads.
       
   493         t = data[0:1]
       
   494 
       
   495         if t == b'x':
       
   496             try:
       
   497                 return _zlibdecompress(data)
       
   498             except zlib.error as e:
       
   499                 raise error.RevlogError(
       
   500                     _(b'revlog decompress error: %s')
       
   501                     % stringutil.forcebytestr(e)
       
   502                 )
       
   503         # '\0' is more common than 'u' so it goes first.
       
   504         elif t == b'\0':
       
   505             return data
       
   506         elif t == b'u':
       
   507             return util.buffer(data, 1)
       
   508 
       
   509         compressor = self._get_decompressor(t)
       
   510 
       
   511         return compressor.decompress(data)
   407 
   512 
   408     @contextlib.contextmanager
   513     @contextlib.contextmanager
   409     def reading(self):
   514     def reading(self):
   410         """Context manager that keeps data and sidedata files open for reading"""
   515         """Context manager that keeps data and sidedata files open for reading"""
   411         if len(self.index) == 0:
   516         if len(self.index) == 0:
  1282                 _(b"index %s is corrupted") % self.display_id
  1387                 _(b"index %s is corrupted") % self.display_id
  1283             )
  1388             )
  1284         self.index = index
  1389         self.index = index
  1285         # revnum -> (chain-length, sum-delta-length)
  1390         # revnum -> (chain-length, sum-delta-length)
  1286         self._chaininfocache = util.lrucachedict(500)
  1391         self._chaininfocache = util.lrucachedict(500)
  1287         # revlog header -> revlog compressor
       
  1288         self._decompressors = {}
       
  1289 
  1392 
  1290         return chunkcache
  1393         return chunkcache
  1291 
  1394 
  1292     def _load_inner(self, chunk_cache):
  1395     def _load_inner(self, chunk_cache):
       
  1396         if self._docket is None:
       
  1397             default_compression_header = None
       
  1398         else:
       
  1399             default_compression_header = self._docket.default_compression_header
       
  1400 
  1293         self._inner = _InnerRevlog(
  1401         self._inner = _InnerRevlog(
  1294             opener=self.opener,
  1402             opener=self.opener,
  1295             index=self.index,
  1403             index=self.index,
  1296             index_file=self._indexfile,
  1404             index_file=self._indexfile,
  1297             data_file=self._datafile,
  1405             data_file=self._datafile,
  1298             sidedata_file=self._sidedatafile,
  1406             sidedata_file=self._sidedatafile,
  1299             inline=self._inline,
  1407             inline=self._inline,
  1300             data_config=self.data_config,
  1408             data_config=self.data_config,
       
  1409             feature_config=self.feature_config,
  1301             chunk_cache=chunk_cache,
  1410             chunk_cache=chunk_cache,
       
  1411             default_compression_header=default_compression_header,
  1302         )
  1412         )
  1303 
  1413 
  1304     def get_revlog(self):
  1414     def get_revlog(self):
  1305         """simple function to mirror API of other not-really-revlog API"""
  1415         """simple function to mirror API of other not-really-revlog API"""
  1306         return self
  1416         return self
  1316             # Reference the file without the "data/" prefix, so it is familiar
  1426             # Reference the file without the "data/" prefix, so it is familiar
  1317             # to the user.
  1427             # to the user.
  1318             return self.target[1]
  1428             return self.target[1]
  1319         else:
  1429         else:
  1320             return self.radix
  1430             return self.radix
  1321 
       
  1322     def _get_decompressor(self, t):
       
  1323         try:
       
  1324             compressor = self._decompressors[t]
       
  1325         except KeyError:
       
  1326             try:
       
  1327                 engine = util.compengines.forrevlogheader(t)
       
  1328                 compressor = engine.revlogcompressor(
       
  1329                     self.feature_config.compression_engine_options
       
  1330                 )
       
  1331                 self._decompressors[t] = compressor
       
  1332             except KeyError:
       
  1333                 raise error.RevlogError(
       
  1334                     _(b'unknown compression type %s') % binascii.hexlify(t)
       
  1335                 )
       
  1336         return compressor
       
  1337 
       
  1338     @util.propertycache
       
  1339     def _compressor(self):
       
  1340         engine = util.compengines[self.feature_config.compression_engine]
       
  1341         return engine.revlogcompressor(
       
  1342             self.feature_config.compression_engine_options
       
  1343         )
       
  1344 
       
  1345     @util.propertycache
       
  1346     def _decompressor(self):
       
  1347         """the default decompressor"""
       
  1348         if self._docket is None:
       
  1349             return None
       
  1350         t = self._docket.default_compression_header
       
  1351         c = self._get_decompressor(t)
       
  1352         return c.decompress
       
  1353 
  1431 
  1354     def _datafp(self, mode=b'r'):
  1432     def _datafp(self, mode=b'r'):
  1355         """file object for the revlog's data file"""
  1433         """file object for the revlog's data file"""
  1356         return self.opener(self._datafile, mode=mode)
  1434         return self.opener(self._datafile, mode=mode)
  1357 
  1435 
  2270         compression_mode = self.index[rev][10]
  2348         compression_mode = self.index[rev][10]
  2271         data = self._inner.get_segment_for_revs(rev, rev)[1]
  2349         data = self._inner.get_segment_for_revs(rev, rev)[1]
  2272         if compression_mode == COMP_MODE_PLAIN:
  2350         if compression_mode == COMP_MODE_PLAIN:
  2273             return data
  2351             return data
  2274         elif compression_mode == COMP_MODE_DEFAULT:
  2352         elif compression_mode == COMP_MODE_DEFAULT:
  2275             return self._decompressor(data)
  2353             return self._inner._decompressor(data)
  2276         elif compression_mode == COMP_MODE_INLINE:
  2354         elif compression_mode == COMP_MODE_INLINE:
  2277             return self.decompress(data)
  2355             return self._inner.decompress(data)
  2278         else:
  2356         else:
  2279             msg = b'unknown compression mode %d'
  2357             msg = b'unknown compression mode %d'
  2280             msg %= compression_mode
  2358             msg %= compression_mode
  2281             raise error.RevlogError(msg)
  2359             raise error.RevlogError(msg)
  2282 
  2360 
  2326             except OverflowError:
  2404             except OverflowError:
  2327                 # issue4215 - we can't cache a run of chunks greater than
  2405                 # issue4215 - we can't cache a run of chunks greater than
  2328                 # 2G on Windows
  2406                 # 2G on Windows
  2329                 return [self._chunk(rev) for rev in revschunk]
  2407                 return [self._chunk(rev) for rev in revschunk]
  2330 
  2408 
  2331             decomp = self.decompress
  2409             decomp = self._inner.decompress
  2332             # self._decompressor might be None, but will not be used in that case
  2410             # self._inner._decompressor might be None, but will not be used in that case
  2333             def_decomp = self._decompressor
  2411             def_decomp = self._inner._decompressor
  2334             for rev in revschunk:
  2412             for rev in revschunk:
  2335                 chunkstart = start(rev)
  2413                 chunkstart = start(rev)
  2336                 if inline:
  2414                 if inline:
  2337                     chunkstart += (rev + 1) * iosize
  2415                     chunkstart += (rev + 1) * iosize
  2338                 chunklength = length(rev)
  2416                 chunklength = length(rev)
  2542 
  2620 
  2543         comp = self.index[rev][11]
  2621         comp = self.index[rev][11]
  2544         if comp == COMP_MODE_PLAIN:
  2622         if comp == COMP_MODE_PLAIN:
  2545             segment = comp_segment
  2623             segment = comp_segment
  2546         elif comp == COMP_MODE_DEFAULT:
  2624         elif comp == COMP_MODE_DEFAULT:
  2547             segment = self._decompressor(comp_segment)
  2625             segment = self._inner._decompressor(comp_segment)
  2548         elif comp == COMP_MODE_INLINE:
  2626         elif comp == COMP_MODE_INLINE:
  2549             segment = self.decompress(comp_segment)
  2627             segment = self._inner.decompress(comp_segment)
  2550         else:
  2628         else:
  2551             msg = b'unknown compression mode %d'
  2629             msg = b'unknown compression mode %d'
  2552             msg %= comp
  2630             msg %= comp
  2553             raise error.RevlogError(msg)
  2631             raise error.RevlogError(msg)
  2554 
  2632 
  2840                 deltacomputer=deltacomputer,
  2918                 deltacomputer=deltacomputer,
  2841                 sidedata=sidedata,
  2919                 sidedata=sidedata,
  2842             )
  2920             )
  2843 
  2921 
  2844     def compress(self, data):
  2922     def compress(self, data):
  2845         """Generate a possibly-compressed representation of data."""
  2923         return self._inner.compress(data)
  2846         if not data:
       
  2847             return b'', data
       
  2848 
       
  2849         compressed = self._compressor.compress(data)
       
  2850 
       
  2851         if compressed:
       
  2852             # The revlog compressor added the header in the returned data.
       
  2853             return b'', compressed
       
  2854 
       
  2855         if data[0:1] == b'\0':
       
  2856             return b'', data
       
  2857         return b'u', data
       
  2858 
  2924 
  2859     def decompress(self, data):
  2925     def decompress(self, data):
  2860         """Decompress a revlog chunk.
  2926         return self._inner.decompress(data)
  2861 
       
  2862         The chunk is expected to begin with a header identifying the
       
  2863         format type so it can be routed to an appropriate decompressor.
       
  2864         """
       
  2865         if not data:
       
  2866             return data
       
  2867 
       
  2868         # Revlogs are read much more frequently than they are written and many
       
  2869         # chunks only take microseconds to decompress, so performance is
       
  2870         # important here.
       
  2871         #
       
  2872         # We can make a few assumptions about revlogs:
       
  2873         #
       
  2874         # 1) the majority of chunks will be compressed (as opposed to inline
       
  2875         #    raw data).
       
  2876         # 2) decompressing *any* data will likely be at least 10x slower than
       
  2877         #    returning raw inline data.
       
  2878         # 3) we want to prioritize common and officially supported compression
       
  2879         #    engines
       
  2880         #
       
  2881         # It follows that we want to optimize for "decompress compressed data
       
  2882         # when encoded with common and officially supported compression engines"
       
  2883         # case over "raw data" and "data encoded by less common or non-official
       
  2884         # compression engines." That is why we have the inline lookup first
       
  2885         # followed by the compengines lookup.
       
  2886         #
       
  2887         # According to `hg perfrevlogchunks`, this is ~0.5% faster for zlib
       
  2888         # compressed chunks. And this matters for changelog and manifest reads.
       
  2889         t = data[0:1]
       
  2890 
       
  2891         if t == b'x':
       
  2892             try:
       
  2893                 return _zlibdecompress(data)
       
  2894             except zlib.error as e:
       
  2895                 raise error.RevlogError(
       
  2896                     _(b'revlog decompress error: %s')
       
  2897                     % stringutil.forcebytestr(e)
       
  2898                 )
       
  2899         # '\0' is more common than 'u' so it goes first.
       
  2900         elif t == b'\0':
       
  2901             return data
       
  2902         elif t == b'u':
       
  2903             return util.buffer(data, 1)
       
  2904 
       
  2905         compressor = self._get_decompressor(t)
       
  2906 
       
  2907         return compressor.decompress(data)
       
  2908 
  2927 
  2909     def _addrevision(
  2928     def _addrevision(
  2910         self,
  2929         self,
  2911         node,
  2930         node,
  2912         rawtext,
  2931         rawtext,
  3027         sidedata_compression_mode = COMP_MODE_INLINE
  3046         sidedata_compression_mode = COMP_MODE_INLINE
  3028         if sidedata and self.feature_config.has_side_data:
  3047         if sidedata and self.feature_config.has_side_data:
  3029             sidedata_compression_mode = COMP_MODE_PLAIN
  3048             sidedata_compression_mode = COMP_MODE_PLAIN
  3030             serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
  3049             serialized_sidedata = sidedatautil.serialize_sidedata(sidedata)
  3031             sidedata_offset = self._docket.sidedata_end
  3050             sidedata_offset = self._docket.sidedata_end
  3032             h, comp_sidedata = self.compress(serialized_sidedata)
  3051             h, comp_sidedata = self._inner.compress(serialized_sidedata)
  3033             if (
  3052             if (
  3034                 h != b'u'
  3053                 h != b'u'
  3035                 and comp_sidedata[0:1] != b'\0'
  3054                 and comp_sidedata[0:1] != b'\0'
  3036                 and len(comp_sidedata) < len(serialized_sidedata)
  3055                 and len(comp_sidedata) < len(serialized_sidedata)
  3037             ):
  3056             ):
  3874                 )
  3893                 )
  3875 
  3894 
  3876                 sidedata_compression_mode = COMP_MODE_INLINE
  3895                 sidedata_compression_mode = COMP_MODE_INLINE
  3877                 if serialized_sidedata and self.feature_config.has_side_data:
  3896                 if serialized_sidedata and self.feature_config.has_side_data:
  3878                     sidedata_compression_mode = COMP_MODE_PLAIN
  3897                     sidedata_compression_mode = COMP_MODE_PLAIN
  3879                     h, comp_sidedata = self.compress(serialized_sidedata)
  3898                     h, comp_sidedata = self._inner.compress(serialized_sidedata)
  3880                     if (
  3899                     if (
  3881                         h != b'u'
  3900                         h != b'u'
  3882                         and comp_sidedata[0] != b'\0'
  3901                         and comp_sidedata[0] != b'\0'
  3883                         and len(comp_sidedata) < len(serialized_sidedata)
  3902                         and len(comp_sidedata) < len(serialized_sidedata)
  3884                     ):
  3903                     ):