view mercurial/revlogutils/rewrite.py @ 47472:c81a5297f185

censor: migrate the logic to a set of `censor_revs`. Instead of considering a special unique censored revision within the code, we consider a set of revisions (currently always of size 1). This makes the main code less censor-centric and prepares for the usage of a similar approach for stripping changesets. Differential Revision: https://phab.mercurial-scm.org/D10903
author Pierre-Yves David <pierre-yves.david@octobus.net>
date Tue, 22 Jun 2021 23:20:32 +0200
parents aab064416f0c
children 5045ba2a3afd
line wrap: on
line source

# censor code related to censoring revision
# coding: utf8
#
# Copyright 2021 Pierre-Yves David <pierre-yves.david@octobus.net>
# Copyright 2015 Google, Inc <martinvonz@google.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

import contextlib
import os

from ..node import (
    nullrev,
)
from .constants import (
    COMP_MODE_PLAIN,
    ENTRY_DATA_COMPRESSED_LENGTH,
    ENTRY_DATA_COMPRESSION_MODE,
    ENTRY_DATA_OFFSET,
    ENTRY_DATA_UNCOMPRESSED_LENGTH,
    ENTRY_DELTA_BASE,
    ENTRY_LINK_REV,
    ENTRY_NODE_ID,
    ENTRY_PARENT_1,
    ENTRY_PARENT_2,
    ENTRY_SIDEDATA_COMPRESSED_LENGTH,
    ENTRY_SIDEDATA_COMPRESSION_MODE,
    ENTRY_SIDEDATA_OFFSET,
    REVLOGV0,
    REVLOGV1,
)
from ..i18n import _

from .. import (
    error,
    pycompat,
    revlogutils,
    util,
)
from ..utils import (
    storageutil,
)
from . import (
    constants,
    deltas,
)


def v1_censor(rl, tr, censornode, tombstone=b''):
    """Censor `censornode` in a "version 1" revlog.

    A whole new revlog is built next to the existing one (using the
    `tmpcensored` postfix). Every revision is copied into it, except the
    censored one whose content is replaced by `tombstone`, packed as
    `censored` metadata and flagged with `REVIDX_ISCENSORED`. The existing
    files are then registered with `tr.addbackup` and the new files are
    renamed over them, before the caches and index of `rl` are reloaded.

    Raises error.Abort if the tombstone is stored as a delta in the new
    revlog, or if an already censored revision is stored as a delta in the
    existing one.
    """
    assert rl._format_version == constants.REVLOGV1, rl._format_version

    # imported locally to avoid an import cycle
    from .. import revlog

    censorrev = rl.rev(censornode)
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # Rewriting a revlog in place is hard, so the strategy is to build a
    # brand new revlog holding a copy of every revision, and to move it over
    # the existing one afterward.
    #
    # This is a bit dangerous. We could easily have a mismatch of state.
    newrl = revlog.revlog(
        rl.opener,
        target=rl.target,
        radix=rl.radix,
        postfix=b'tmpcensored',
        censorable=True,
    )
    # the copy has to use the same format settings as the original
    newrl._format_version = rl._format_version
    newrl._format_flags = rl._format_flags
    newrl._generaldelta = rl._generaldelta
    newrl._parse_index = rl._parse_index

    for rev in rl.revs():
        node = rl.node(rev)
        p1, p2 = rl.parents(node)

        if rev != censorrev:
            # Not the target: copy the raw content over unchanged.
            if not rl.iscensored(rev):
                rawtext = rl.rawdata(rev)
            elif rl.deltaparent(rev) == nullrev:
                # revisions that are already censored are read through
                # `_chunk` instead of `rawdata`
                rawtext = rl._chunk(rev)
            else:
                m = _(
                    b'cannot censor due to censored '
                    b'revision having delta stored'
                )
                raise error.Abort(m)

            newrl.addrawrevision(
                rawtext, tr, rl.linkrev(rev), p1, p2, node, rl.flags(rev)
            )
            continue

        # The target: store the tombstone in place of the actual content.
        newrl.addrawrevision(
            tombstone,
            tr,
            rl.linkrev(censorrev),
            p1,
            p2,
            censornode,
            constants.REVIDX_ISCENSORED,
        )

        if newrl.deltaparent(rev) != nullrev:
            m = _(b'censored revision stored as delta; cannot censor')
            h = _(
                b'censoring of revlogs is not fully implemented;'
                b' please report this bug'
            )
            raise error.Abort(m, hint=h)

    # a separate data file only exists for non-inline revlogs
    has_datafile = not rl._inline

    tr.addbackup(rl._indexfile, location=b'store')
    if has_datafile:
        tr.addbackup(rl._datafile, location=b'store')

    rl.opener.rename(newrl._indexfile, rl._indexfile)
    if has_datafile:
        rl.opener.rename(newrl._datafile, rl._datafile)

    # drop anything cached from the replaced files
    rl.clearcaches()
    rl._loadindex()


def v2_censor(revlog, tr, censornode, tombstone=b''):
    """Censor `censornode` in a "version 2" revlog.

    General principle:

    New revlog files (index/data/sidedata) are created and receive a copy
    of the existing content, minus the censored data.

    Any revision stored as a delta against a censored revision needs a new
    delta. Since the cumulative size of these new deltas may be large, they
    are kept in a temporary file until they reach their final destination.

    Everything located before the first censored revision is copied
    blindly. The rest is copied revision by revision, adjusting the
    associated index entries as we go.

    Note: `tr` is not used by this function; the docket is written with
    `transaction=None`.
    """
    assert revlog._format_version != REVLOGV0, revlog._format_version
    assert revlog._format_version != REVLOGV1, revlog._format_version

    old_index = revlog.index
    docket = revlog._docket

    # a set is used even though it currently always holds a single revision
    censor_revs = {revlog.rev(censornode)}
    tombstone = storageutil.packmeta({b'censored': tombstone}, b'')

    # everything before the first censored revision is kept untouched, so
    # its position gives the amount of bytes to blindly copy in each file
    first_excl_rev = min(censor_revs)
    first_excl_entry = old_index[first_excl_rev]
    index_cutoff = old_index.entry_size * first_excl_rev
    data_cutoff = first_excl_entry[ENTRY_DATA_OFFSET] >> 16
    sidedata_cutoff = revlog.sidedata_cut_off(first_excl_rev)

    with pycompat.unnamedtempfile(mode=b"w+b") as tmp_storage:
        # rev → (new_base, data_start, data_end, compression_mode)
        rewritten_entries = _precompute_rewritten_delta(
            revlog,
            old_index,
            censor_revs,
            tmp_storage,
        )

        all_files = _setup_new_files(
            revlog,
            index_cutoff,
            data_cutoff,
            sidedata_cutoff,
        )

        # The old index file does not need to be opened: its content is
        # already available in a usable form through `old_index`.
        with all_files() as open_files:
            # rewrite every revision from the first censored one onward
            for rev in range(first_excl_rev, len(old_index)):
                if rev not in censor_revs:
                    _rewrite_simple(
                        revlog,
                        old_index,
                        open_files,
                        rev,
                        rewritten_entries,
                        tmp_storage,
                    )
                    continue
                _rewrite_censor(
                    revlog,
                    old_index,
                    open_files,
                    rev,
                    tombstone,
                )
    docket.write(transaction=None, stripping=True)


def _precompute_rewritten_delta(
    revlog,
    old_index,
    excluded_revs,
    tmp_storage,
):
    """Recompute the delta of each revision based on an excluded revision

    Revisions from the first excluded one up to the end of `old_index` are
    inspected. Those stored as a delta against a revision of
    `excluded_revs` get a new delta, whose data is written to `tmp_storage`.

    Return a mapping: {rev → (new_base, data_start, data_end, compression_mode)}
    where `data_start` and `data_end` are positions within `tmp_storage`.
    """
    delta_computer = deltas.deltacomputer(revlog)
    new_deltas = {}
    start_rev = min(excluded_revs)
    with revlog._segmentfile._open_read() as dfh:
        for rev in range(start_rev, len(old_index)):
            if rev in excluded_revs:
                # excluded revisions are handled separately by the caller,
                # there is no delta to recompute for them.
                continue

            entry = old_index[rev]
            if entry[ENTRY_DELTA_BASE] not in excluded_revs:
                # the existing delta does not rely on an excluded revision
                continue

            # From here on, the revision uses an excluded revision as the
            # base of its delta, so a new delta is needed.
            if entry[ENTRY_DATA_UNCOMPRESSED_LENGTH] == 0:
                # this revision is empty, we can delta against nullrev
                new_deltas[rev] = (nullrev, 0, 0, COMP_MODE_PLAIN)
                continue

            text = revlog.rawdata(rev, _df=dfh)
            # the flags are stored in the low 16 bits of the offset field
            flags = entry[ENTRY_DATA_OFFSET] & 0xFFFF
            info = revlogutils.revisioninfo(
                node=entry[ENTRY_NODE_ID],
                p1=revlog.node(entry[ENTRY_PARENT_1]),
                p2=revlog.node(entry[ENTRY_PARENT_2]),
                btext=[text],
                textlen=len(text),
                cachedelta=None,
                flags=flags,
            )
            delta = delta_computer.finddeltainfo(
                info, dfh, excluded_bases=excluded_revs, target_rev=rev
            )
            default_comp = revlog._docket.default_compression_header
            comp_mode, delta = deltas.delta_compression(default_comp, delta)

            # relying on `tell` is a bit lazy, but speed is not a concern
            # for this operation
            start = tmp_storage.tell()
            tmp_storage.write(delta.data[1])
            end = tmp_storage.tell()
            new_deltas[rev] = (delta.base, start, end, comp_mode)
    return new_deltas


def _setup_new_files(
    revlog,
    index_cutoff,
    data_cutoff,
    sidedata_cutoff,
):
    """Create the new revlog files and pre-fill them up to the cutoffs

    Each new file (index, data, sidedata) is created by copying its old
    counterpart with `nb_bytes` set to the matching cutoff. The docket end
    offsets are then set to these cutoffs and the revlog caches and index
    are reloaded from the docket.

    Return a context manager factory that opens all the relevant files and
    yields them as a tuple:
    - old_data_file,
    - old_sidedata_file,
    - new_index_file,
    - new_data_file,
    - new_sidedata_file,

    The new files are positioned at their end, which is asserted to match
    the cutoffs.

    The old_index_file is not part of it because the caller accesses its
    content through the `old_index` object.
    """
    docket = revlog._docket
    opener = revlog.opener

    # NOTE(review): presumably `docket.new_*_file()` changes what
    # `docket.*_filepath()` returns, hence the old paths being resolved
    # first — confirm against the docket implementation.
    old_index_path = opener.join(docket.index_filepath())
    old_data_path = opener.join(docket.data_filepath())
    old_sidedata_path = opener.join(docket.sidedata_filepath())

    new_index_path = opener.join(docket.new_index_file())
    new_data_path = opener.join(docket.new_data_file())
    new_sidedata_path = opener.join(docket.new_sidedata_file())

    copies = (
        (old_index_path, new_index_path, index_cutoff),
        (old_data_path, new_data_path, data_cutoff),
        (old_sidedata_path, new_sidedata_path, sidedata_cutoff),
    )
    for src, dst, cutoff in copies:
        util.copyfile(src, dst, nb_bytes=cutoff)

    path_getters = (
        docket.index_filepath,
        docket.data_filepath,
        docket.sidedata_filepath,
    )
    for get_path in path_getters:
        opener.register_file(get_path())

    docket.index_end = index_cutoff
    docket.data_end = data_cutoff
    docket.sidedata_end = sidedata_cutoff

    # reload the revlog internal information
    revlog.clearcaches()
    revlog._loadindex(docket=docket)

    @contextlib.contextmanager
    def all_files_opener():
        # the opening is hidden in a helper function, with explicit nesting,
        # to please check-code, black and various python versions at the
        # same time
        with open(old_data_path, 'rb') as old_data:
            with open(old_sidedata_path, 'rb') as old_sidedata:
                with open(new_index_path, 'r+b') as new_index:
                    with open(new_data_path, 'r+b') as new_data:
                        with open(new_sidedata_path, 'r+b') as new_sidedata:
                            expected_ends = (
                                (new_index, index_cutoff),
                                (new_data, data_cutoff),
                                (new_sidedata, sidedata_cutoff),
                            )
                            # move to the end of each new file, so that
                            # further writes are appended
                            for fh, cutoff in expected_ends:
                                fh.seek(0, os.SEEK_END)
                                assert fh.tell() == cutoff
                            yield (
                                old_data,
                                old_sidedata,
                                new_index,
                                new_data,
                                new_sidedata,
                            )

    return all_files_opener


def _rewrite_simple(
    revlog,
    old_index,
    all_files,
    rev,
    rewritten_entries,
    tmp_storage,
):
    """Copy a normal revision to the new files and append its index entry

    The revision data is read from `tmp_storage` when `rev` has a
    recomputed delta in `rewritten_entries`, and from the old data file
    otherwise. Sidedata, if any, is copied from the old sidedata file.

    The new entry is appended to `revlog.index` and written to the new
    index file, then the docket end offsets are updated to the current
    position of the new files.
    """
    (
        old_data_fh,
        old_sidedata_fh,
        new_index_fh,
        new_data_fh,
        new_sidedata_fh,
    ) = all_files
    entry = old_index[rev]

    if rev in rewritten_entries:
        # use the delta that was recomputed and parked in `tmp_storage`
        delta_base, start, end, data_comp_mode = rewritten_entries[rev]
        data_size = end - start
        tmp_storage.seek(start)
        data = tmp_storage.read(data_size)
    else:
        # the existing data can be reused as is; the offset is stored in
        # the high bits of the field, above the 16 bits used for the flags
        delta_base = entry[ENTRY_DELTA_BASE]
        data_comp_mode = entry[ENTRY_DATA_COMPRESSION_MODE]
        data_size = entry[ENTRY_DATA_COMPRESSED_LENGTH]
        old_data_fh.seek(entry[ENTRY_DATA_OFFSET] >> 16)
        data = old_data_fh.read(data_size)

    # Grouping continuous read/write operations might be faster. However,
    # censoring is not an operation focussed around stellar performance, so
    # this optimisation has not been written yet.
    data_offset = new_data_fh.tell()
    new_data_fh.write(data)

    sidedata_size = entry[ENTRY_SIDEDATA_COMPRESSED_LENGTH]
    sidedata_offset = new_sidedata_fh.tell()
    if sidedata_size > 0:
        old_sidedata_fh.seek(entry[ENTRY_SIDEDATA_OFFSET])
        new_sidedata_fh.write(old_sidedata_fh.read(sidedata_size))

    assert delta_base <= rev, (delta_base, rev)

    new_entry = revlogutils.entry(
        flags=entry[ENTRY_DATA_OFFSET] & 0xFFFF,
        data_offset=data_offset,
        data_compressed_length=data_size,
        data_uncompressed_length=entry[ENTRY_DATA_UNCOMPRESSED_LENGTH],
        data_delta_base=delta_base,
        link_rev=entry[ENTRY_LINK_REV],
        parent_rev_1=entry[ENTRY_PARENT_1],
        parent_rev_2=entry[ENTRY_PARENT_2],
        node_id=entry[ENTRY_NODE_ID],
        sidedata_offset=sidedata_offset,
        sidedata_compressed_length=sidedata_size,
        data_compression_mode=data_comp_mode,
        sidedata_compression_mode=entry[ENTRY_SIDEDATA_COMPRESSION_MODE],
    )
    revlog.index.append(new_entry)
    new_index_fh.write(revlog.index.entry_binary(rev))

    # keep the docket in sync with what has been written so far
    docket = revlog._docket
    docket.index_end = new_index_fh.tell()
    docket.data_end = new_data_fh.tell()
    docket.sidedata_end = new_sidedata_fh.tell()


def _rewrite_censor(
    revlog,
    old_index,
    all_files,
    rev,
    tombstone,
):
    """Write the tombstone of a censored revision and append its index entry

    `tombstone` is written uncompressed to the new data file. The new entry
    reuses the link revision, parents and node id of the old entry, is
    flagged `REVIDX_ISCENSORED`, uses `rev` itself as delta base and carries
    no sidedata.

    The docket index and data end offsets are updated afterward. The
    sidedata end offset is left untouched since no sidedata is written.
    """
    (
        _old_data_fh,
        _old_sidedata_fh,
        new_index_fh,
        new_data_fh,
        _new_sidedata_fh,
    ) = all_files
    old_entry = old_index[rev]

    # XXX consider trying the default compression too
    tombstone_size = len(tombstone)
    tombstone_offset = new_data_fh.tell()
    new_data_fh.write(tombstone)

    # No sidedata is added, as it might leak information about the censored
    # version.
    new_entry = revlogutils.entry(
        flags=constants.REVIDX_ISCENSORED,
        data_offset=tombstone_offset,
        data_compressed_length=tombstone_size,
        data_uncompressed_length=tombstone_size,
        data_delta_base=rev,
        link_rev=old_entry[ENTRY_LINK_REV],
        parent_rev_1=old_entry[ENTRY_PARENT_1],
        parent_rev_2=old_entry[ENTRY_PARENT_2],
        node_id=old_entry[ENTRY_NODE_ID],
        sidedata_offset=0,
        sidedata_compressed_length=0,
        data_compression_mode=COMP_MODE_PLAIN,
        sidedata_compression_mode=COMP_MODE_PLAIN,
    )
    revlog.index.append(new_entry)
    new_index_fh.write(revlog.index.entry_binary(rev))

    docket = revlog._docket
    docket.index_end = new_index_fh.tell()
    docket.data_end = new_data_fh.tell()