censor: add censor command to hgext with basic client-side tests
The censor command is a core extension which can replace the contents of a
historical file revision with a censor "tombstone" which can be exchanged
with older clients in place of the real revision data. The command rewrites
the filelog by copying revision-by-revision.
Care must be taken to expand the fulltext of the children of the censored
revision before copying them to the new filelog; they might be stored as
deltas against the uncensored revision, and those deltas will be invalidated.
For more background on the censorship feature design, see:
http://mercurial.selenic.com/wiki/CensorPlan
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/censor.py Sun Mar 15 21:52:17 2015 -0400
@@ -0,0 +1,168 @@
+# Copyright (C) 2015 - Mike Edgar <adgar@google.com>
+#
+# This extension enables removal of file content at a given revision,
+# rewriting the data/metadata of successive revisions to preserve revision log
+# integrity.
+
+"""erase file content at a given revision
+
+The censor command instructs Mercurial to erase all content of a file at a given
+revision *without updating the changeset hash.* This allows existing history to
+remain valid while preventing future clones/pulls from receiving the erased
+data.
+
+Typical uses for censor are due to security or legal requirements, including::
+
+ * Passwords, private keys, cryptographic material
+ * Licensed data/code/libraries for which the license has expired
+ * Personally Identifiable Information or other private data
+
+Censored file revisions are listed in a tracked file called .hgcensored stored
+in the repository root. The censor command adds an entry to the .hgcensored file
+in the working directory and commits it (much like ``hg tag`` and .hgtags). The
+censored file data is then replaced with a pointer to the new commit, enabling
+verification.
+
+Censored nodes can interrupt mercurial's typical operation whenever the excised
+data needs to be materialized. Some commands, like ``hg cat``/``hg revert``,
+simply fail when asked to produce censored data. Others, like ``hg verify`` and
+``hg update``, must be capable of tolerating censored data to continue to
+function in a meaningful way. Such commands only tolerate censored file
+revisions if they are allowed by the policy specified by the "censor.policy"
+config option.
+"""
+
+from mercurial.node import short
+from mercurial import cmdutil, error, filelog, revlog, scmutil, util
+from mercurial.i18n import _
+
+cmdtable = {}  # handed to cmdutil.command() on the next line
+command = cmdutil.command(cmdtable)  # used below as the @command decorator
+testedwith = 'internal'
+
+@command('censor',
+ [('r', 'rev', '', _('censor file from specified revision'), _('REV')),
+ ('t', 'tombstone', '', _('replacement tombstone data'), _('TEXT'))],
+ _('-r REV [-t TEXT] [FILE]'))  # NOTE(review): FILE is required below
+def censor(ui, repo, path, rev='', tombstone='', **opts):
+ if not path:
+ raise util.Abort(_('must specify file path to censor'))
+ if not rev:
+ raise util.Abort(_('must specify revision to censor'))
+ # Look up the path's filelog; an empty one means there is nothing to censor.
+ flog = repo.file(path)
+ if not len(flog):
+ raise util.Abort(_('cannot censor file with no history'))
+ # Resolve REV to a changeset, then to the file revision in that changeset.
+ rev = scmutil.revsingle(repo, rev, rev).rev()
+ try:
+ ctx = repo[rev]
+ except KeyError:
+ raise util.Abort(_('invalid revision identifier %s') % rev)
+
+ try:
+ fctx = ctx.filectx(path)
+ except error.LookupError:
+ raise util.Abort(_('file does not exist at revision %s') % rev)
+ # Refuse when any repository head still contains this exact file node.
+ fnode = fctx.filenode()
+ headctxs = [repo[c] for c in repo.heads()]
+ heads = [c for c in headctxs if path in c and c.filenode(path) == fnode]
+ if heads:
+ headlist = ', '.join([short(c.node()) for c in heads])
+ raise util.Abort(_('cannot censor file in heads (%s)') % headlist,
+ hint=_('clean/delete and commit first'))
+ # Refuse when the target changeset is a working directory parent.
+ wctx = repo[None]
+ wp = wctx.parents()
+ if ctx.node() in [p.node() for p in wp]:
+ raise util.Abort(_('cannot censor working directory'),
+ hint=_('clean/delete/update first'))
+ # The low 16 bits of flog.version must equal revlog.REVLOGNG to proceed.
+ flogv = flog.version & 0xFFFF
+ if flogv != revlog.REVLOGNG:
+ raise util.Abort(
+ _('censor does not support revlog version %d') % (flogv,))
+ # Wrap the tombstone text as filelog metadata under the "censored" key.
+ tombstone = filelog.packmeta({"censored": tombstone}, "")
+
+ crev = fctx.filerev()
+ # The tombstone is padded up to the old raw size below, so it must fit.
+ if len(tombstone) > flog.rawsize(crev):
+ raise util.Abort(_(
+ 'censor tombstone must be no longer than censored data'))
+ # NOTE(review): no repository lock is acquired in this function -- confirm.
+ # Using two files instead of one makes it easy to rewrite entry-by-entry
+ idxread = repo.svfs(flog.indexfile, 'r')
+ idxwrite = repo.svfs(flog.indexfile, 'wb', atomictemp=True)
+ if flog.version & revlog.REVLOGNGINLINEDATA:
+ dataread, datawrite = idxread, idxwrite
+ else:
+ dataread = repo.svfs(flog.datafile, 'r')
+ datawrite = repo.svfs(flog.datafile, 'wb', atomictemp=True)
+
+ # Copy all revlog data up to the entry to be censored.
+ rio = revlog.revlogio()
+ offset = flog.start(crev)
+ # Everything before crev: crev * rio.size index bytes, `offset` data bytes.
+ for chunk in util.filechunkiter(idxread, limit=crev * rio.size):
+ idxwrite.write(chunk)
+ for chunk in util.filechunkiter(dataread, limit=offset):
+ datawrite.write(chunk)
+
+ def rewriteindex(r, newoffs, newdata=None):
+ """Rewrite the index entry with a new data offset and optional new data.
+
+ newdata, if given, is (compressed len, uncompressed len, added flags).
+ NOTE(review): r is written in place of ``base`` in every case -- confirm.
+ """
+ offlags, comp, uncomp, base, link, p1, p2, nodeid = flog.index[r]
+ flags = revlog.gettype(offlags)
+ if newdata:
+ comp, uncomp, nflags = newdata
+ flags |= nflags
+ offlags = revlog.offset_type(newoffs, flags)
+ e = (offlags, comp, uncomp, r, link, p1, p2, nodeid)
+ idxwrite.write(rio.packentry(e, None, flog.version, r))
+ idxread.seek(rio.size, 1)  # step the reader past the old entry
+
+ def rewrite(r, offs, data, nflags=revlog.REVIDX_DEFAULT_FLAGS):
+ """Compress data and write it, with its index entry, as rev r at offs.
+
+ nflags is OR-ed into the entry's flags; dataread skips r's old chunk.
+ Returns the number of data bytes written, for tracking data offsets.
+ """
+ flag, compdata = flog.compress(data)
+ newcomp = len(flag) + len(compdata)
+ rewriteindex(r, offs, (newcomp, len(data), nflags))
+ datawrite.write(flag)
+ datawrite.write(compdata)
+ dataread.seek(flog.length(r), 1)  # skip r's old data chunk
+ return newcomp
+
+ # Rewrite the censored entry with the tombstone, space-padded to raw size.
+ pad = ' ' * (flog.rawsize(crev) - len(tombstone))
+ offset += rewrite(crev, offset, tombstone + pad, revlog.REVIDX_ISCENSORED)
+
+ # Rewrite all following filelog revisions fixing up offsets and deltas.
+ for srev in xrange(crev + 1, len(flog)):
+ if crev in flog.parentrevs(srev):
+ # Immediate children of censored node must be re-added as fulltext.
+ try:
+ revdata = flog.revision(srev)
+ except error.CensoredNodeError, e:
+ revdata = e.tombstone  # already-censored child: reuse its tombstone
+ dlen = rewrite(srev, offset, revdata)
+ else:
+ # Copy any other revision data verbatim after fixing up the offset.
+ rewriteindex(srev, offset)
+ dlen = flog.length(srev)
+ for chunk in util.filechunkiter(dataread, limit=dlen):
+ datawrite.write(chunk)
+ offset += dlen
+ # NOTE(review): presumably close() moves the atomictemp files into place.
+ idxread.close()
+ idxwrite.close()
+ if dataread is not idxread:
+ dataread.close()
+ datawrite.close()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/test-censor.t Sun Mar 15 21:52:17 2015 -0400
@@ -0,0 +1,315 @@
+ $ cat >> $HGRCPATH <<EOF
+ > [extensions]
+ > censor=
+ > EOF
+ $ cp $HGRCPATH $HGRCPATH.orig
+
+Create repo with unimpeachable content
+
+ $ hg init r
+ $ cd r
+ $ echo 'Initially untainted file' > target
+ $ echo 'Normal file here' > bystander
+ $ hg add target bystander
+ $ hg ci -m init
+
+Clone repo so we can test pull later
+
+ $ cd ..
+ $ hg clone r rpull
+ updating to branch default
+ 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cd r
+
+Introduce content which will ultimately require censorship. Name the first
+censored node C1, second C2, and so on
+
+ $ echo 'Tainted file' > target
+ $ echo 'Passwords: hunter2' >> target
+ $ hg ci -m taint target
+ $ C1=`hg id --debug -i`
+
+ $ echo 'hunter3' >> target
+ $ echo 'Normal file v2' > bystander
+ $ hg ci -m moretaint target bystander
+ $ C2=`hg id --debug -i`
+
+Add new sanitized versions to correct our mistake. Name the first head H1,
+the second head H2, and so on
+
+ $ echo 'Tainted file is now sanitized' > target
+ $ hg ci -m sanitized target
+ $ H1=`hg id --debug -i`
+
+ $ hg update -r $C2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ echo 'Tainted file now super sanitized' > target
+ $ hg ci -m 'super sanitized' target
+ created new head
+ $ H2=`hg id --debug -i`
+
+Verify target contents before censorship at each revision
+
+ $ hg cat -r 3 target
+ Tainted file is now sanitized
+ $ hg cat -r $H2 target
+ Tainted file now super sanitized
+ $ hg cat -r $C2 target
+ Tainted file
+ Passwords: hunter2
+ hunter3
+ $ hg cat -r $C1 target
+ Tainted file
+ Passwords: hunter2
+ $ hg cat -r 0 target
+ Initially untainted file
+
+Try to censor revision with too large of a tombstone message
+
+ $ hg censor -r $C1 -t 'blah blah blah blah blah blah blah blah bla' target
+ abort: censor tombstone must be no longer than censored data
+ [255]
+
+Censor revision with 2 offenses
+
+ $ hg censor -r $C2 -t "remove password" target
+ $ hg cat -r 3 target
+ Tainted file is now sanitized
+ $ hg cat -r $H2 target
+ Tainted file now super sanitized
+ $ hg cat -r $C2 target
+ abort: censored node: 1e0247a9a4b7
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg cat -r $C1 target
+ Tainted file
+ Passwords: hunter2
+ $ hg cat -r 0 target
+ Initially untainted file
+
+Censor revision with 1 offense
+
+ $ hg censor -r $C1 target
+ $ hg cat -r 3 target
+ Tainted file is now sanitized
+ $ hg cat -r $H2 target
+ Tainted file now super sanitized
+ $ hg cat -r $C2 target
+ abort: censored node: 1e0247a9a4b7
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg cat -r $C1 target
+ abort: censored node: 613bc869fceb
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg cat -r 0 target
+ Initially untainted file
+
+Can only checkout target at uncensored revisions, -X is workaround for --all
+
+ $ hg revert -r $C2 target
+ abort: censored node: 1e0247a9a4b7
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg revert -r $C1 target
+ abort: censored node: 613bc869fceb
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg revert -r $C1 --all
+ reverting bystander
+ reverting target
+ abort: censored node: 613bc869fceb
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg revert -r $C1 --all -X target
+ $ cat target
+ Tainted file now super sanitized
+ $ hg revert -r 0 --all
+ reverting target
+ $ cat target
+ Initially untainted file
+ $ hg revert -r $H2 --all
+ reverting bystander
+ reverting target
+ $ cat target
+ Tainted file now super sanitized
+
+Uncensored file can be viewed at any revision
+
+ $ hg cat -r 3 bystander
+ Normal file v2
+ $ hg cat -r $C2 bystander
+ Normal file v2
+ $ hg cat -r $C1 bystander
+ Normal file here
+ $ hg cat -r 0 bystander
+ Normal file here
+
+Can update to children of censored revision
+
+ $ hg update -r 3
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ Tainted file is now sanitized
+ $ hg update -r $H2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ Tainted file now super sanitized
+
+Set censor policy to abort in trusted $HGRC so hg verify fails
+
+ $ cp $HGRCPATH.orig $HGRCPATH
+ $ cat >> $HGRCPATH <<EOF
+ > [censor]
+ > policy = abort
+ > EOF
+
+Repo fails verification due to censorship
+
+ $ hg verify
+ checking changesets
+ checking manifests
+ crosschecking files in changesets and manifests
+ checking files
+ target@1: censored file data
+ target@2: censored file data
+ 2 files, 5 changesets, 7 total revisions
+ 2 integrity errors encountered!
+ (first damaged changeset appears to be 1)
+ [1]
+
+Cannot update to revision with censored data
+
+ $ hg update -r $C2
+ abort: censored node: 1e0247a9a4b7
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg update -r $C1
+ abort: censored node: 613bc869fceb
+ (set censor.policy to ignore errors)
+ [255]
+ $ hg update -r 0
+ 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ hg update -r $H2
+ 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+
+Set censor policy to ignore in trusted $HGRC so hg verify passes
+
+ $ cp $HGRCPATH.orig $HGRCPATH
+ $ cat >> $HGRCPATH <<EOF
+ > [censor]
+ > policy = ignore
+ > EOF
+
+Repo passes verification with warnings with explicit config
+
+ $ hg verify
+ checking changesets
+ checking manifests
+ crosschecking files in changesets and manifests
+ checking files
+ 2 files, 5 changesets, 7 total revisions
+
+May update to revision with censored data with explicit config
+
+ $ hg update -r $C2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ $ hg update -r $C1
+ 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ $ hg update -r 0
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ Initially untainted file
+ $ hg update -r $H2
+ 2 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ cat target
+ Tainted file now super sanitized
+
+Can merge in revision with censored data. Test requires one branch of history
+with the file censored, but we can't censor at a head, so advance H1.
+
+ $ hg update -r $H1
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ C3=$H1
+ $ echo 'advanced head H1' > target
+ $ hg ci -m 'advance head H1' target
+ $ H1=`hg id --debug -i`
+ $ hg censor -r $C3 target
+ $ hg update -r $H2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ hg merge -r $C3
+ merging target
+ 0 files updated, 1 files merged, 0 files removed, 0 files unresolved
+ (branch merge, don't forget to commit)
+
+Revisions present in repository heads may not be censored
+
+ $ hg update -C -r $H2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ hg censor -r $H2 target
+ abort: cannot censor file in heads (78a8fc215e79)
+ (clean/delete and commit first)
+ [255]
+ $ echo 'twiddling thumbs' > bystander
+ $ hg ci -m 'bystander commit'
+ $ H2=`hg id --debug -i`
+ $ hg censor -r "$H2^" target
+ abort: cannot censor file in heads (efbe78065929)
+ (clean/delete and commit first)
+ [255]
+
+Cannot censor working directory
+
+ $ echo 'seriously no passwords' > target
+ $ hg ci -m 'extend second head arbitrarily' target
+ $ H2=`hg id --debug -i`
+ $ hg update -r "$H2^"
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+ $ hg censor -r . target
+ abort: cannot censor working directory
+ (clean/delete/update first)
+ [255]
+ $ hg update -r $H2
+ 1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+
+Can re-add file after being deleted + censored
+
+ $ C4=$H2
+ $ hg rm target
+ $ hg ci -m 'delete target so it may be censored'
+ $ H2=`hg id --debug -i`
+ $ hg censor -r $C4 target
+ $ hg cat -r $C4 target
+ $ hg cat -r "$H2^^" target
+ Tainted file now super sanitized
+ $ echo 'fresh start' > target
+ $ hg add target
+ $ hg ci -m reincarnated target
+ $ H2=`hg id --debug -i`
+ $ hg cat -r $H2 target
+ fresh start
+ $ hg cat -r "$H2^" target
+ target: no such file in rev 452ec1762369
+ [1]
+ $ hg cat -r $C4 target
+ $ hg cat -r "$H2^^^" target
+ Tainted file now super sanitized
+
+Can censor after revlog has expanded to no longer permit inline storage
+
+ $ for x in `seq 0 50000`
+ > do
+ > echo "Password: hunter$x" >> target
+ > done
+ $ hg ci -m 'add 100k passwords'
+ $ H2=`hg id --debug -i`
+ $ C5=$H2
+ $ hg revert -r "$H2^" target
+ $ hg ci -m 'cleaned 100k passwords'
+ $ H2=`hg id --debug -i`
+ $ hg censor -r $C5 target
+ $ hg cat -r $C5 target
+ $ hg cat -r $H2 target
+ fresh start
--- a/tests/test-help.t Fri Mar 13 14:08:30 2015 -0700
+++ b/tests/test-help.t Sun Mar 15 21:52:17 2015 -0400
@@ -245,6 +245,7 @@
acl hooks for controlling repository access
blackbox log repository events to a blackbox for debugging
bugzilla hooks for integrating with the Bugzilla bug tracker
+ censor erase file content at a given revision
churn command to display statistics about repository history
color colorize output from some commands
convert import revisions from foreign VCS repositories into