diff hgext/censor.py @ 24347:1bcfecbbf569

censor: add censor command to hgext with basic client-side tests The censor command is a core extension which can replace the contents of a historical file revision with a censor "tombstone" which can be exchanged with older clients in place of the real revision data. The command rewrites the filelog by copying revision-by-revision. Care must be taken to expand the fulltext of the children of the censored revision before copying them to the new filelog; they might be stored as deltas against the uncensored revision, and those deltas will be invalidated. For more background on the censorship feature design, see: http://mercurial.selenic.com/wiki/CensorPlan
author Mike Edgar <adgar@google.com>
date Sun, 15 Mar 2015 21:52:17 -0400
parents
children 5e111acc1170
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/censor.py	Sun Mar 15 21:52:17 2015 -0400
@@ -0,0 +1,168 @@
+# Copyright (C) 2015 - Mike Edgar <adgar@google.com>
+#
+# This extension enables removal of file content at a given revision,
+# rewriting the data/metadata of successive revisions to preserve revision log
+# integrity.
+
+"""erase file content at a given revision
+
+The censor command instructs Mercurial to erase all content of a file at a given
+revision *without updating the changeset hash.* This allows existing history to
+remain valid while preventing future clones/pulls from receiving the erased
+data.
+
+Typical uses for censor are due to security or legal requirements, including::
+
+ * Passwords, private keys, cryptographic material
+ * Licensed data/code/libraries for which the license has expired
+ * Personally Identifiable Information or other private data
+
+Censored file revisions are listed in a tracked file called .hgcensored stored
+in the repository root. The censor command adds an entry to the .hgcensored file
+in the working directory and commits it (much like ``hg tag`` and .hgtags). The
+censored file data is then replaced with a pointer to the new commit, enabling
+verification.
+
+Censored nodes can interrupt Mercurial's typical operation whenever the excised
+data needs to be materialized. Some commands, like ``hg cat``/``hg revert``,
+simply fail when asked to produce censored data. Others, like ``hg verify`` and
+``hg update``, must be capable of tolerating censored data to continue to
+function in a meaningful way. Such commands only tolerate censored file
+revisions if they are allowed by the policy specified by the "censor.allow"
+config option.
+"""
+
+from mercurial.node import short
+from mercurial import cmdutil, error, filelog, revlog, scmutil, util
+from mercurial.i18n import _
+
+cmdtable = {}
+command = cmdutil.command(cmdtable)
+testedwith = 'internal'
+
+@command('censor',
+    [('r', 'rev', '', _('censor file from specified revision'), _('REV')),
+     ('t', 'tombstone', '', _('replacement tombstone data'), _('TEXT'))],
+    _('-r REV [-t TEXT] [FILE]'))
+def censor(ui, repo, path, rev='', tombstone='', **opts):
+    if not path:
+        raise util.Abort(_('must specify file path to censor'))
+    if not rev:
+        raise util.Abort(_('must specify revision to censor'))
+
+    flog = repo.file(path)
+    if not len(flog):
+        raise util.Abort(_('cannot censor file with no history'))
+
+    rev = scmutil.revsingle(repo, rev, rev).rev()
+    try:
+        ctx = repo[rev]
+    except KeyError:
+        raise util.Abort(_('invalid revision identifier %s') % rev)
+
+    try:
+        fctx = ctx.filectx(path)
+    except error.LookupError:
+        raise util.Abort(_('file does not exist at revision %s') % rev)
+
+    fnode = fctx.filenode()
+    headctxs = [repo[c] for c in repo.heads()]
+    heads = [c for c in headctxs if path in c and c.filenode(path) == fnode]
+    if heads:
+        headlist = ', '.join([short(c.node()) for c in heads])
+        raise util.Abort(_('cannot censor file in heads (%s)') % headlist,
+            hint=_('clean/delete and commit first'))
+
+    wctx = repo[None]
+    wp = wctx.parents()
+    if ctx.node() in [p.node() for p in wp]:
+        raise util.Abort(_('cannot censor working directory'),
+            hint=_('clean/delete/update first'))
+
+    flogv = flog.version & 0xFFFF
+    if flogv != revlog.REVLOGNG:
+        raise util.Abort(
+            _('censor does not support revlog version %d') % (flogv,))
+
+    tombstone = filelog.packmeta({"censored": tombstone}, "")
+
+    crev = fctx.filerev()
+
+    if len(tombstone) > flog.rawsize(crev):
+        raise util.Abort(_(
+            'censor tombstone must be no longer than censored data'))
+
+    # Using two files instead of one makes it easy to rewrite entry-by-entry
+    idxread = repo.svfs(flog.indexfile, 'r')
+    idxwrite = repo.svfs(flog.indexfile, 'wb', atomictemp=True)
+    if flog.version & revlog.REVLOGNGINLINEDATA:
+        dataread, datawrite = idxread, idxwrite
+    else:
+        dataread = repo.svfs(flog.datafile, 'r')
+        datawrite = repo.svfs(flog.datafile, 'wb', atomictemp=True)
+
+    # Copy all revlog data up to the entry to be censored.
+    rio = revlog.revlogio()
+    offset = flog.start(crev)
+
+    for chunk in util.filechunkiter(idxread, limit=crev * rio.size):
+        idxwrite.write(chunk)
+    for chunk in util.filechunkiter(dataread, limit=offset):
+        datawrite.write(chunk)
+
+    def rewriteindex(r, newoffs, newdata=None):
+        """Rewrite the index entry with a new data offset and optional new data.
+
+        The newdata argument, if given, is a tuple of three positive integers:
+        (new compressed, new uncompressed, added flag bits).
+        """
+        offlags, comp, uncomp, base, link, p1, p2, nodeid = flog.index[r]
+        flags = revlog.gettype(offlags)
+        if newdata:
+            comp, uncomp, nflags = newdata
+            flags |= nflags
+        offlags = revlog.offset_type(newoffs, flags)
+        e = (offlags, comp, uncomp, r, link, p1, p2, nodeid)
+        idxwrite.write(rio.packentry(e, None, flog.version, r))
+        idxread.seek(rio.size, 1)
+
+    def rewrite(r, offs, data, nflags=revlog.REVIDX_DEFAULT_FLAGS):
+        """Write the given full text to the filelog with the given data offset.
+
+        Returns:
+            The integer number of data bytes written, for tracking data offsets.
+        """
+        flag, compdata = flog.compress(data)
+        newcomp = len(flag) + len(compdata)
+        rewriteindex(r, offs, (newcomp, len(data), nflags))
+        datawrite.write(flag)
+        datawrite.write(compdata)
+        dataread.seek(flog.length(r), 1)
+        return newcomp
+
+    # Rewrite censored revlog entry with (padded) tombstone data.
+    pad = ' ' * (flog.rawsize(crev) - len(tombstone))
+    offset += rewrite(crev, offset, tombstone + pad, revlog.REVIDX_ISCENSORED)
+
+    # Rewrite all following filelog revisions fixing up offsets and deltas.
+    for srev in xrange(crev + 1, len(flog)):
+        if crev in flog.parentrevs(srev):
+            # Immediate children of censored node must be re-added as fulltext.
+            try:
+                revdata = flog.revision(srev)
+            except error.CensoredNodeError, e:
+                revdata = e.tombstone
+            dlen = rewrite(srev, offset, revdata)
+        else:
+            # Copy any other revision data verbatim after fixing up the offset.
+            rewriteindex(srev, offset)
+            dlen = flog.length(srev)
+            for chunk in util.filechunkiter(dataread, limit=dlen):
+                datawrite.write(chunk)
+        offset += dlen
+
+    idxread.close()
+    idxwrite.close()
+    if dataread is not idxread:
+        dataread.close()
+        datawrite.close()