--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/censor.py Sun Mar 15 21:52:17 2015 -0400
@@ -0,0 +1,168 @@
+# Copyright (C) 2015 - Mike Edgar <adgar@google.com>
+#
+# This extension enables removal of file content at a given revision,
+# rewriting the data/metadata of successive revisions to preserve revision log
+# integrity.
+
+"""erase file content at a given revision
+
+The censor command instructs Mercurial to erase all content of a file at a given
+revision *without updating the changeset hash.* This allows existing history to
+remain valid while preventing future clones/pulls from receiving the erased
+data.
+
+Typical uses for censor are due to security or legal requirements, including::
+
+ * Passwords, private keys, cryptographic material
+ * Licensed data/code/libraries for which the license has expired
+ * Personally Identifiable Information or other private data
+
+Censored file revisions are listed in a tracked file called .hgcensored stored
+in the repository root. The censor command adds an entry to the .hgcensored file
+in the working directory and commits it (much like ``hg tag`` and .hgtags). The
+censored file data is then replaced with a pointer to the new commit, enabling
+verification.
+
+Censored nodes can interrupt Mercurial's typical operation whenever the excised
+data needs to be materialized. Some commands, like ``hg cat``/``hg revert``,
+simply fail when asked to produce censored data. Others, like ``hg verify`` and
+``hg update``, must be capable of tolerating censored data to continue to
+function in a meaningful way. Such commands only tolerate censored file
+revisions if they are allowed by the policy specified by the "censor.allow"
+config option.
+"""
+
+from mercurial.node import short
+from mercurial import cmdutil, error, filelog, revlog, scmutil, util
+from mercurial.i18n import _
+
+cmdtable = {}  # command table handed to cmdutil.command() on the next line
+command = cmdutil.command(cmdtable)  # used as the @command decorator below
+testedwith = 'internal'  # presumably read by Mercurial's extension loader - confirm
+
+@command('censor',
+ [('r', 'rev', '', _('censor file from specified revision'), _('REV')),
+ ('t', 'tombstone', '', _('replacement tombstone data'), _('TEXT'))],
+ _('-r REV [-t TEXT] [FILE]'))
+def censor(ui, repo, path, rev='', tombstone='', **opts):  # replace path's data at rev with a tombstone
+ if not path:
+ raise util.Abort(_('must specify file path to censor'))
+ if not rev:
+ raise util.Abort(_('must specify revision to censor'))
+ # Find the file's revlog, then the changeset and file revision to censor.
+ flog = repo.file(path)
+ if not len(flog):
+ raise util.Abort(_('cannot censor file with no history'))
+
+ rev = scmutil.revsingle(repo, rev, rev).rev()
+ try:
+ ctx = repo[rev]
+ except KeyError:
+ raise util.Abort(_('invalid revision identifier %s') % rev)
+
+ try:
+ fctx = ctx.filectx(path)
+ except error.LookupError:
+ raise util.Abort(_('file does not exist at revision %s') % rev)
+ # Refuse while any repository head still contains this exact file revision.
+ fnode = fctx.filenode()
+ headctxs = [repo[c] for c in repo.heads()]
+ heads = [c for c in headctxs if path in c and c.filenode(path) == fnode]
+ if heads:
+ headlist = ', '.join([short(c.node()) for c in heads])
+ raise util.Abort(_('cannot censor file in heads (%s)') % headlist,
+ hint=_('clean/delete and commit first'))
+ # Refuse when the target changeset is a parent of the working directory.
+ wctx = repo[None]
+ wp = wctx.parents()
+ if ctx.node() in [p.node() for p in wp]:
+ raise util.Abort(_('cannot censor working directory'),
+ hint=_('clean/delete/update first'))
+ # Only REVLOGNG is handled; the low 16 bits of version are compared to it.
+ flogv = flog.version & 0xFFFF
+ if flogv != revlog.REVLOGNG:
+ raise util.Abort(
+ _('censor does not support revlog version %d') % (flogv,))
+ # Pack the tombstone under the "censored" key (second argument is empty).
+ tombstone = filelog.packmeta({"censored": tombstone}, "")
+
+ crev = fctx.filerev()  # filelog revision number to censor
+ # The tombstone is space-padded to the old raw size below, so it must fit.
+ if len(tombstone) > flog.rawsize(crev):
+ raise util.Abort(_(
+ 'censor tombstone must be no longer than censored data'))
+ # NOTE(review): no repository lock is acquired in this function - confirm.
+ # Read the current files and write atomictemp copies, entry by entry.
+ idxread = repo.svfs(flog.indexfile, 'r')
+ idxwrite = repo.svfs(flog.indexfile, 'wb', atomictemp=True)
+ if flog.version & revlog.REVLOGNGINLINEDATA:  # index and data share one file
+ dataread, datawrite = idxread, idxwrite
+ else:
+ dataread = repo.svfs(flog.datafile, 'r')
+ datawrite = repo.svfs(flog.datafile, 'wb', atomictemp=True)
+
+ # Copy everything that precedes the censored entry without changes.
+ rio = revlog.revlogio()  # provides the entry size and packentry()
+ offset = flog.start(crev)  # running data offset for rewritten entries
+
+ for chunk in util.filechunkiter(idxread, limit=crev * rio.size):
+ idxwrite.write(chunk)
+ for chunk in util.filechunkiter(dataread, limit=offset):
+ datawrite.write(chunk)
+
+ def rewriteindex(r, newoffs, newdata=None):
+ """Rewrite the index entry with a new data offset and optional new data.
+
+ The newdata argument, if given, is a tuple of three integers: (compressed
+ length, uncompressed length, flag bits to OR in). Base is written as r.
+ """
+ offlags, comp, uncomp, base, link, p1, p2, nodeid = flog.index[r]
+ flags = revlog.gettype(offlags)
+ if newdata:
+ comp, uncomp, nflags = newdata
+ flags |= nflags
+ offlags = revlog.offset_type(newoffs, flags)
+ e = (offlags, comp, uncomp, r, link, p1, p2, nodeid)  # base := r - TODO confirm
+ idxwrite.write(rio.packentry(e, None, flog.version, r))
+ idxread.seek(rio.size, 1)  # step past the old entry in the source
+
+ def rewrite(r, offs, data, nflags=revlog.REVIDX_DEFAULT_FLAGS):
+ """Compress data and write it as revision r's chunk at data offset offs.
+
+ Returns:
+ The integer number of data bytes written, for tracking data offsets.
+ """
+ flag, compdata = flog.compress(data)
+ newcomp = len(flag) + len(compdata)
+ rewriteindex(r, offs, (newcomp, len(data), nflags))
+ datawrite.write(flag)
+ datawrite.write(compdata)
+ dataread.seek(flog.length(r), 1)  # skip the replaced chunk in the source
+ return newcomp
+
+ # Replace the censored entry with the tombstone, space-padded to rawsize.
+ pad = ' ' * (flog.rawsize(crev) - len(tombstone))
+ offset += rewrite(crev, offset, tombstone + pad, revlog.REVIDX_ISCENSORED)
+
+ # Re-emit every later revision, tracking the shifted data offset.
+ for srev in xrange(crev + 1, len(flog)):
+ if crev in flog.parentrevs(srev):
+ # Direct children of the censored revision are stored again as full text.
+ try:
+ revdata = flog.revision(srev)
+ except error.CensoredNodeError, e:  # child is censored too: reuse its tombstone
+ revdata = e.tombstone
+ dlen = rewrite(srev, offset, revdata)
+ else:
+ # Other revisions keep their stored bytes; only the index entry changes.
+ rewriteindex(srev, offset)
+ dlen = flog.length(srev)
+ for chunk in util.filechunkiter(dataread, limit=dlen):
+ datawrite.write(chunk)
+ offset += dlen
+ # NOTE(review): writers are not closed if an error occurs above - confirm.
+ idxread.close()
+ idxwrite.close()
+ if dataread is not idxread:
+ dataread.close()
+ datawrite.close()