mercurial/revlogutils/censor.py
changeset 47465 bc8536e09a20
parent 47463 5fa083a5ff04
child 47466 f7a94e2d4470
# censor code related to censoring revision
# coding: utf8
#
# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
# Copyright 2015 Google, Inc <martinvonz@google.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import contextlib
import os

from ..node import (
    nullrev,
)
from .constants import (
    COMP_MODE_PLAIN,
    ENTRY_DATA_COMPRESSED_LENGTH,
    ENTRY_DATA_COMPRESSION_MODE,
    ENTRY_DATA_OFFSET,
    ENTRY_DATA_UNCOMPRESSED_LENGTH,
    ENTRY_DELTA_BASE,
    ENTRY_LINK_REV,
    ENTRY_NODE_ID,
    ENTRY_PARENT_1,
    ENTRY_PARENT_2,
    ENTRY_SIDEDATA_COMPRESSED_LENGTH,
    ENTRY_SIDEDATA_COMPRESSION_MODE,
    ENTRY_SIDEDATA_OFFSET,
    REVLOGV0,
    REVLOGV1,
)
from ..i18n import _

from .. import (
    error,
    pycompat,
    revlogutils,
    util,
)
from ..utils import (
    storageutil,
)
from . import (
    constants,
    deltas,
)


def v1_censor(rl, tr, censornode, tombstone=b''):
    """censors a revision in a "version 1" revlog"""
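    # Note: this helper is not meant to be called directly; censoring is
    # normally reached through the revlog's ``censorrevision`` API (for
    # example from the censor extension). A rough, illustrative sketch of a
    # call site (the names below are assumptions, not defined in this
    # module) could look like:
    #
    #     with repo.transaction(b'censor') as tr:
    #         filelog = repo.file(some_path)
    #         filelog.censorrevision(tr, censored_filenode, tombstone=msg)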
       
    assert rl._format_version == constants.REVLOGV1, rl._format_version

    # avoid cycle
    from .. import revlog

    censorrev = rl.rev(censornode)
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # Rewriting the revlog in place is hard. Our strategy for censoring is
    # to create a new revlog, copy all revisions to it, then replace the
    # revlogs on transaction close.
    #
    # This is a bit dangerous. We could easily have a mismatch of state.
    newrl = revlog.revlog(
        rl.opener,
        target=rl.target,
        radix=rl.radix,
        postfix=b'tmpcensored',
        censorable=True,
    )
    newrl._format_version = rl._format_version
    newrl._format_flags = rl._format_flags
    newrl._generaldelta = rl._generaldelta
    newrl._parse_index = rl._parse_index

    for rev in rl.revs():
        node = rl.node(rev)
        p1, p2 = rl.parents(node)

        if rev == censorrev:
            newrl.addrawrevision(
                tombstone,
                tr,
                rl.linkrev(censorrev),
                p1,
                p2,
                censornode,
                constants.REVIDX_ISCENSORED,
            )

            if newrl.deltaparent(rev) != nullrev:
                m = _(b'censored revision stored as delta; cannot censor')
                h = _(
                    b'censoring of revlogs is not fully implemented;'
                    b' please report this bug'
                )
                raise error.Abort(m, hint=h)
            continue

        if rl.iscensored(rev):
            if rl.deltaparent(rev) != nullrev:
                m = _(
                    b'cannot censor due to censored '
                    b'revision having delta stored'
                )
                raise error.Abort(m)
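            # already-censored revision: the delta-base check above
            # guarantees the stored chunk is a full text (the tombstone),
            # so it can be copied over verbatim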
       
            rawtext = rl._chunk(rev)
        else:
            rawtext = rl.rawdata(rev)

        newrl.addrawrevision(
            rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev)
        )
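    # register the original files as transaction backups before renaming the
    # rewritten revlog over them, so a rollback restores the pre-censor state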
       
    tr.addbackup(rl._indexfile, location=b'store')
    if not rl._inline:
        tr.addbackup(rl._datafile, location=b'store')

    rl.opener.rename(newrl._indexfile, rl._indexfile)
    if not rl._inline:
        rl.opener.rename(newrl._datafile, rl._datafile)

    rl.clearcaches()
    rl._loadindex()


def v2_censor(rl, tr, censornode, tombstone=b''):
    """censors a revision in a "version 2" revlog"""
       
    # General principle
    #
    # We create new revlog files (index/data/sidedata) to copy the content
    # of the existing files, minus the censored data.
    #
    # We need to recompute a new delta for any revision that used the
    # censored revision as its delta base. As the cumulative size of the
    # new deltas may be large, we store them in a temporary file until they
    # are written to their final destination.
    #
    # All data before the censored data can be blindly copied. The rest
    # needs to be copied as we go and the associated index entry needs
    # adjustment.

    assert rl._format_version != REVLOGV0, rl._format_version
    assert rl._format_version != REVLOGV1, rl._format_version

    old_index = rl.index
    docket = rl._docket

    censor_rev = rl.rev(censornode)
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    censored_entry = rl.index[censor_rev]
    index_cutoff = rl.index.entry_size * censor_rev
    data_cutoff = censored_entry[ENTRY_DATA_OFFSET] >> 16
    sidedata_cutoff = rl.sidedata_cut_off(censor_rev)
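    # everything stored before these three offsets is unaffected by the
    # censored revision and will be copied over verbatim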
       
    # rev → (new_base, data_start, data_end, compression_mode)
    rewritten_entries = {}

    dc = deltas.deltacomputer(rl)
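    # the censored revision itself must never be picked as a delta base for
    # the recomputed deltas, so it is passed as an excluded base below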
       
    excl = [censor_rev]

    with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
        with rl._segmentfile._open_read() as dfh:
            for rev in range(censor_rev + 1, len(old_index)):
                entry = old_index[rev]
                if censor_rev != entry[ENTRY_DELTA_BASE]:
                    continue
                # This is a revision that uses the censored revision as the
                # base for its delta, so we need to compute a new delta for
                # it.
                if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
                    # this revision is empty, we can delta against nullrev
                    rewritten_entries[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
                else:
                    text = rl.rawdata(rev, _df=dfh)
                    info = revlogutils.revisioninfo(
                        node=entry[ENTRY_NODE_ID],
                        p1=rl.node(entry[ENTRY_PARENT_1]),
                        p2=rl.node(entry[ENTRY_PARENT_2]),
                        btext=[text],
                        textlen=len(text),
                        cachedelta=None,
                        flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
                    )
                    d = dc.finddeltainfo(
                        info, dfh, excluded_bases=excl, target_rev=rev
                    )
                    default_comp = rl._docket.default_compression_header
                    comp_mode, d = deltas.delta_compression(default_comp, d)
                    # using `tell` is a bit lazy, but we are not here for speed
                    start = tmp_storage.tell()
                    tmp_storage.write(d.data[1])
                    end = tmp_storage.tell()
                    rewritten_entries[rev] = (d.base, start, end, comp_mode)

        old_index_filepath = rl.opener.join(docket.index_filepath())
        old_data_filepath = rl.opener.join(docket.data_filepath())
        old_sidedata_filepath = rl.opener.join(docket.sidedata_filepath())

        new_index_filepath = rl.opener.join(docket.new_index_file())
        new_data_filepath = rl.opener.join(docket.new_data_file())
        new_sidedata_filepath = rl.opener.join(docket.new_sidedata_file())
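        # copy the untouched prefix of the index, data and sidedata files
        # into their new counterparts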
       
        util.copyfile(
            old_index_filepath, new_index_filepath, nb_bytes=index_cutoff
        )
        util.copyfile(
            old_data_filepath, new_data_filepath, nb_bytes=data_cutoff
        )
        util.copyfile(
            old_sidedata_filepath,
            new_sidedata_filepath,
            nb_bytes=sidedata_cutoff,
        )
        rl.opener.register_file(docket.index_filepath())
        rl.opener.register_file(docket.data_filepath())
        rl.opener.register_file(docket.sidedata_filepath())
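        # for now the docket only covers the copied prefix; the end offsets
        # are pushed forward below as the rewritten entries are appended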
       
        docket.index_end = index_cutoff
        docket.data_end = data_cutoff
        docket.sidedata_end = sidedata_cutoff

        # reload the revlog internal information
        rl.clearcaches()
        rl._loadindex(docket=docket)

        @contextlib.contextmanager
        def all_files():
            # hide the file opening in a helper function to please check-code,
            # black and various python versions at the same time
            with open(old_data_filepath, 'rb') as old_data_file:
                with open(old_sidedata_filepath, 'rb') as old_sidedata_file:
                    with open(new_index_filepath, 'r+b') as new_index_file:
                        with open(new_data_filepath, 'r+b') as new_data_file:
                            with open(
                                new_sidedata_filepath, 'r+b'
                            ) as new_sidedata_file:
                                yield (
                                    old_data_file,
                                    old_sidedata_file,
                                    new_index_file,
                                    new_data_file,
                                    new_sidedata_file,
                                )

        # we don't need to open the old index file since its content already
        # exists in a usable form in `old_index`.
        with all_files() as (
            old_data_file,
            old_sidedata_file,
            new_index_file,
            new_data_file,
            new_sidedata_file,
        ):
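            # the freshly copied prefixes must line up exactly with the
            # recorded cutoffs before we start appending rewritten entries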
       
            new_index_file.seek(0, os.SEEK_END)
            assert new_index_file.tell() == index_cutoff
            new_data_file.seek(0, os.SEEK_END)
            assert new_data_file.tell() == data_cutoff
            new_sidedata_file.seek(0, os.SEEK_END)
            assert new_sidedata_file.tell() == sidedata_cutoff

            ### writing the censored revision
            entry = old_index[censor_rev]

            # XXX consider trying the default compression too
            new_data_size = len(tombstone)
            new_data_offset = new_data_file.tell()
            new_data_file.write(tombstone)

            # we are not adding any sidedata as they might leak info about
            # the censored version
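            # the tombstone is stored as a full snapshot: its delta base is
            # the censored revision itself, so no other revision is needed
            # to read it back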
       
            new_entry = revlogutils.entry(
                flags=constants.REVIDX_ISCENSORED,
                data_offset=new_data_offset,
                data_compressed_length=new_data_size,
                data_uncompressed_length=new_data_size,
                data_delta_base=censor_rev,
                link_rev=entry[ENTRY_LINK_REV],
                parent_rev_1=entry[ENTRY_PARENT_1],
                parent_rev_2=entry[ENTRY_PARENT_2],
                node_id=entry[ENTRY_NODE_ID],
                sidedata_offset=0,
                sidedata_compressed_length=0,
                data_compression_mode=COMP_MODE_PLAIN,
                sidedata_compression_mode=COMP_MODE_PLAIN,
            )
            rl.index.append(new_entry)
            entry_bin = rl.index.entry_binary(censor_rev)
            new_index_file.write(entry_bin)
            docket.index_end = new_index_file.tell()
            docket.data_end = new_data_file.tell()

            #### Writing all subsequent revisions
            for rev in range(censor_rev + 1, len(old_index)):
                entry = old_index[rev]
                flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF
                old_data_offset = entry[ENTRY_DATA_OFFSET] >> 16
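                # revisions that did not delta against the censored revision
                # keep their stored data unchanged; the others use the deltas
                # recomputed earlier into tmp_storage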
       
                if rev not in rewritten_entries:
                    old_data_file.seek(old_data_offset)
                    new_data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
                    new_data = old_data_file.read(new_data_size)
                    data_delta_base = entry[ENTRY_DELTA_BASE]
                    d_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]
                else:
                    (
                        data_delta_base,
                        start,
                        end,
                        d_comp_mode,
                    ) = rewritten_entries[rev]
                    new_data_size = end - start
                    tmp_storage.seek(start)
                    new_data = tmp_storage.read(new_data_size)

                # It might be faster to group consecutive read/write
                # operations; however censoring is not a performance-critical
                # operation, so this optimisation has not been written yet.
                new_data_offset = new_data_file.tell()
                new_data_file.write(new_data)

                sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
                new_sidedata_offset = new_sidedata_file.tell()
                if 0 < sidedata_size:
                    old_sidedata_offset = entry[ENTRY_SIDEDATA_OFFSET]
                    old_sidedata_file.seek(old_sidedata_offset)
                    new_sidedata = old_sidedata_file.read(sidedata_size)
                    new_sidedata_file.write(new_sidedata)

                data_uncompressed_length = entry[ENTRY_DATA_UNCOMPRESSED_LENGTH]
                sd_com_mode = entry[ENTRY_SIDEDATA_COMPRESSION_MODE]
                assert data_delta_base <= rev, (data_delta_base, rev)

                new_entry = revlogutils.entry(
                    flags=flags,
                    data_offset=new_data_offset,
                    data_compressed_length=new_data_size,
                    data_uncompressed_length=data_uncompressed_length,
                    data_delta_base=data_delta_base,
                    link_rev=entry[ENTRY_LINK_REV],
                    parent_rev_1=entry[ENTRY_PARENT_1],
                    parent_rev_2=entry[ENTRY_PARENT_2],
                    node_id=entry[ENTRY_NODE_ID],
                    sidedata_offset=new_sidedata_offset,
                    sidedata_compressed_length=sidedata_size,
                    data_compression_mode=d_comp_mode,
                    sidedata_compression_mode=sd_com_mode,
                )
                rl.index.append(new_entry)
                entry_bin = rl.index.entry_binary(rev)
                new_index_file.write(entry_bin)

                docket.index_end = new_index_file.tell()
                docket.data_end = new_data_file.tell()
                docket.sidedata_end = new_sidedata_file.tell()
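    # finally persist the updated docket, which now describes the rewritten
    # index, data and sidedata files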
       
    docket.write(transaction=None, stripping=True)