# HG changeset patch # User Mike Edgar # Date 1426470737 14400 # Node ID 1bcfecbbf5691071d3a5a211f2ea9411286359da # Parent 31edcea517c11a53c608113592523244d6317841 censor: add censor command to hgext with basic client-side tests The censor command is a core extension which can replace the contents of a historical file revision with a censor "tombstone" which can be exchanged with older clients in place of the real revision data. The command rewrites the filelog by copying revision-by-revision. Care must be taken to expand the fulltext of the children of the censored revision before copying them to the new filelog; they might be stored as deltas against the uncensored revision, and those deltas will be invalidated. For more background on the censorship feature design, see: http://mercurial.selenic.com/wiki/CensorPlan diff -r 31edcea517c1 -r 1bcfecbbf569 hgext/censor.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgext/censor.py Sun Mar 15 21:52:17 2015 -0400 @@ -0,0 +1,168 @@ +# Copyright (C) 2015 - Mike Edgar +# +# This extension enables removal of file content at a given revision, +# rewriting the data/metadata of successive revisions to preserve revision log +# integrity. + +"""erase file content at a given revision + +The censor command instructs Mercurial to erase all content of a file at a given +revision *without updating the changeset hash.* This allows existing history to +remain valid while preventing future clones/pulls from receiving the erased +data. + +Typical uses for censor are due to security or legal requirements, including:: + + * Passwords, private keys, cryptographic material + * Licensed data/code/libraries for which the license has expired + * Personally Identifiable Information or other private data + +Censored file revisions are listed in a tracked file called .hgcensored stored +in the repository root. The censor command adds an entry to the .hgcensored file +in the working directory and commits it (much like ``hg tag`` and .hgtags). 
The +censored file data is then replaced with a pointer to the new commit, enabling +verification. + +Censored nodes can interrupt mercurial's typical operation whenever the excised +data needs to be materialized. Some commands, like ``hg cat``/``hg revert``, +simply fail when asked to produce censored data. Others, like ``hg verify`` and +``hg update``, must be capable of tolerating censored data to continue to +function in a meaningful way. Such commands only tolerate censored file +revisions if they are allowed by the policy specified by the "censor.policy" +config option. +""" + +from mercurial.node import short +from mercurial import cmdutil, error, filelog, revlog, scmutil, util +from mercurial.i18n import _ + +cmdtable = {} +command = cmdutil.command(cmdtable) +testedwith = 'internal' + +@command('censor', + [('r', 'rev', '', _('censor file from specified revision'), _('REV')), + ('t', 'tombstone', '', _('replacement tombstone data'), _('TEXT'))], + _('-r REV [-t TEXT] [FILE]')) +def censor(ui, repo, path, rev='', tombstone='', **opts): + if not path: + raise util.Abort(_('must specify file path to censor')) + if not rev: + raise util.Abort(_('must specify revision to censor')) + + flog = repo.file(path) + if not len(flog): + raise util.Abort(_('cannot censor file with no history')) + + rev = scmutil.revsingle(repo, rev, rev).rev() + try: + ctx = repo[rev] + except KeyError: + raise util.Abort(_('invalid revision identifier %s') % rev) + + try: + fctx = ctx.filectx(path) + except error.LookupError: + raise util.Abort(_('file does not exist at revision %s') % rev) + + fnode = fctx.filenode() + headctxs = [repo[c] for c in repo.heads()] + heads = [c for c in headctxs if path in c and c.filenode(path) == fnode] + if heads: + headlist = ', '.join([short(c.node()) for c in heads]) + raise util.Abort(_('cannot censor file in heads (%s)') % headlist, + hint=_('clean/delete and commit first')) + + wctx = repo[None] + wp = wctx.parents() + if ctx.node() in [p.node() 
for p in wp]: + raise util.Abort(_('cannot censor working directory'), + hint=_('clean/delete/update first')) + + flogv = flog.version & 0xFFFF + if flogv != revlog.REVLOGNG: + raise util.Abort( + _('censor does not support revlog version %d') % (flogv,)) + + tombstone = filelog.packmeta({"censored": tombstone}, "") + + crev = fctx.filerev() + + if len(tombstone) > flog.rawsize(crev): + raise util.Abort(_( + 'censor tombstone must be no longer than censored data')) + + # Using two files instead of one makes it easy to rewrite entry-by-entry + idxread = repo.svfs(flog.indexfile, 'r') + idxwrite = repo.svfs(flog.indexfile, 'wb', atomictemp=True) + if flog.version & revlog.REVLOGNGINLINEDATA: + dataread, datawrite = idxread, idxwrite + else: + dataread = repo.svfs(flog.datafile, 'r') + datawrite = repo.svfs(flog.datafile, 'wb', atomictemp=True) + + # Copy all revlog data up to the entry to be censored. + rio = revlog.revlogio() + offset = flog.start(crev) + + for chunk in util.filechunkiter(idxread, limit=crev * rio.size): + idxwrite.write(chunk) + for chunk in util.filechunkiter(dataread, limit=offset): + datawrite.write(chunk) + + def rewriteindex(r, newoffs, newdata=None): + """Rewrite the index entry with a new data offset and optional new data. + + The newdata argument, if given, is a tuple of three positive integers: + (new compressed, new uncompressed, added flag bits). + """ + offlags, comp, uncomp, base, link, p1, p2, nodeid = flog.index[r] + flags = revlog.gettype(offlags) + if newdata: + comp, uncomp, nflags = newdata + flags |= nflags + offlags = revlog.offset_type(newoffs, flags) + e = (offlags, comp, uncomp, r, link, p1, p2, nodeid) + idxwrite.write(rio.packentry(e, None, flog.version, r)) + idxread.seek(rio.size, 1) + + def rewrite(r, offs, data, nflags=revlog.REVIDX_DEFAULT_FLAGS): + """Write the given full text to the filelog with the given data offset. + + Returns: + The integer number of data bytes written, for tracking data offsets. 
+ """ + flag, compdata = flog.compress(data) + newcomp = len(flag) + len(compdata) + rewriteindex(r, offs, (newcomp, len(data), nflags)) + datawrite.write(flag) + datawrite.write(compdata) + dataread.seek(flog.length(r), 1) + return newcomp + + # Rewrite censored revlog entry with (padded) tombstone data. + pad = ' ' * (flog.rawsize(crev) - len(tombstone)) + offset += rewrite(crev, offset, tombstone + pad, revlog.REVIDX_ISCENSORED) + + # Rewrite all following filelog revisions fixing up offsets and deltas. + for srev in xrange(crev + 1, len(flog)): + if crev in flog.parentrevs(srev): + # Immediate children of censored node must be re-added as fulltext. + try: + revdata = flog.revision(srev) + except error.CensoredNodeError, e: + revdata = e.tombstone + dlen = rewrite(srev, offset, revdata) + else: + # Copy any other revision data verbatim after fixing up the offset. + rewriteindex(srev, offset) + dlen = flog.length(srev) + for chunk in util.filechunkiter(dataread, limit=dlen): + datawrite.write(chunk) + offset += dlen + + idxread.close() + idxwrite.close() + if dataread is not idxread: + dataread.close() + datawrite.close() diff -r 31edcea517c1 -r 1bcfecbbf569 tests/test-censor.t --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tests/test-censor.t Sun Mar 15 21:52:17 2015 -0400 @@ -0,0 +1,315 @@ + $ cat >> $HGRCPATH < [extensions] + > censor= + > EOF + $ cp $HGRCPATH $HGRCPATH.orig + +Create repo with unimpeachable content + + $ hg init r + $ cd r + $ echo 'Initially untainted file' > target + $ echo 'Normal file here' > bystander + $ hg add target bystander + $ hg ci -m init + +Clone repo so we can test pull later + + $ cd .. + $ hg clone r rpull + updating to branch default + 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cd r + +Introduce content which will ultimately require censorship. 
Name the first +censored node C1, second C2, and so on + + $ echo 'Tainted file' > target + $ echo 'Passwords: hunter2' >> target + $ hg ci -m taint target + $ C1=`hg id --debug -i` + + $ echo 'hunter3' >> target + $ echo 'Normal file v2' > bystander + $ hg ci -m moretaint target bystander + $ C2=`hg id --debug -i` + +Add new sanitized versions to correct our mistake. Name the first head H1, +the second head H2, and so on + + $ echo 'Tainted file is now sanitized' > target + $ hg ci -m sanitized target + $ H1=`hg id --debug -i` + + $ hg update -r $C2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ echo 'Tainted file now super sanitized' > target + $ hg ci -m 'super sanitized' target + created new head + $ H2=`hg id --debug -i` + +Verify target contents before censorship at each revision + + $ hg cat -r 3 target + Tainted file is now sanitized + $ hg cat -r $H2 target + Tainted file now super sanitized + $ hg cat -r $C2 target + Tainted file + Passwords: hunter2 + hunter3 + $ hg cat -r $C1 target + Tainted file + Passwords: hunter2 + $ hg cat -r 0 target + Initially untainted file + +Try to censor revision with too large of a tombstone message + + $ hg censor -r $C1 -t 'blah blah blah blah blah blah blah blah bla' target + abort: censor tombstone must be no longer than censored data + [255] + +Censor revision with 2 offenses + + $ hg censor -r $C2 -t "remove password" target + $ hg cat -r 3 target + Tainted file is now sanitized + $ hg cat -r $H2 target + Tainted file now super sanitized + $ hg cat -r $C2 target + abort: censored node: 1e0247a9a4b7 + (set censor.policy to ignore errors) + [255] + $ hg cat -r $C1 target + Tainted file + Passwords: hunter2 + $ hg cat -r 0 target + Initially untainted file + +Censor revision with 1 offense + + $ hg censor -r $C1 target + $ hg cat -r 3 target + Tainted file is now sanitized + $ hg cat -r $H2 target + Tainted file now super sanitized + $ hg cat -r $C2 target + abort: censored node: 
1e0247a9a4b7 + (set censor.policy to ignore errors) + [255] + $ hg cat -r $C1 target + abort: censored node: 613bc869fceb + (set censor.policy to ignore errors) + [255] + $ hg cat -r 0 target + Initially untainted file + +Can only checkout target at uncensored revisions, -X is workaround for --all + + $ hg revert -r $C2 target + abort: censored node: 1e0247a9a4b7 + (set censor.policy to ignore errors) + [255] + $ hg revert -r $C1 target + abort: censored node: 613bc869fceb + (set censor.policy to ignore errors) + [255] + $ hg revert -r $C1 --all + reverting bystander + reverting target + abort: censored node: 613bc869fceb + (set censor.policy to ignore errors) + [255] + $ hg revert -r $C1 --all -X target + $ cat target + Tainted file now super sanitized + $ hg revert -r 0 --all + reverting target + $ cat target + Initially untainted file + $ hg revert -r $H2 --all + reverting bystander + reverting target + $ cat target + Tainted file now super sanitized + +Uncensored file can be viewed at any revision + + $ hg cat -r 3 bystander + Normal file v2 + $ hg cat -r $C2 bystander + Normal file v2 + $ hg cat -r $C1 bystander + Normal file here + $ hg cat -r 0 bystander + Normal file here + +Can update to children of censored revision + + $ hg update -r 3 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + Tainted file is now sanitized + $ hg update -r $H2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + Tainted file now super sanitized + +Set censor policy to abort in trusted $HGRC so hg verify fails + + $ cp $HGRCPATH.orig $HGRCPATH + $ cat >> $HGRCPATH < [censor] + > policy = abort + > EOF + +Repo fails verification due to censorship + + $ hg verify + checking changesets + checking manifests + crosschecking files in changesets and manifests + checking files + target@1: censored file data + target@2: censored file data + 2 files, 5 changesets, 7 total revisions + 2 integrity errors encountered! 
+ (first damaged changeset appears to be 1) + [1] + +Cannot update to revision with censored data + + $ hg update -r $C2 + abort: censored node: 1e0247a9a4b7 + (set censor.policy to ignore errors) + [255] + $ hg update -r $C1 + abort: censored node: 613bc869fceb + (set censor.policy to ignore errors) + [255] + $ hg update -r 0 + 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ hg update -r $H2 + 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + +Set censor policy to ignore in trusted $HGRC so hg verify passes + + $ cp $HGRCPATH.orig $HGRCPATH + $ cat >> $HGRCPATH < [censor] + > policy = ignore + > EOF + +Repo passes verification with warnings with explicit config + + $ hg verify + checking changesets + checking manifests + crosschecking files in changesets and manifests + checking files + 2 files, 5 changesets, 7 total revisions + +May update to revision with censored data with explicit config + + $ hg update -r $C2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + $ hg update -r $C1 + 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + $ hg update -r 0 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + Initially untainted file + $ hg update -r $H2 + 2 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ cat target + Tainted file now super sanitized + +Can merge in revision with censored data. Test requires one branch of history +with the file censored, but we can't censor at a head, so advance H1. 
+ + $ hg update -r $H1 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ C3=$H1 + $ echo 'advanced head H1' > target + $ hg ci -m 'advance head H1' target + $ H1=`hg id --debug -i` + $ hg censor -r $C3 target + $ hg update -r $H2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ hg merge -r $C3 + merging target + 0 files updated, 1 files merged, 0 files removed, 0 files unresolved + (branch merge, don't forget to commit) + +Revisions present in repository heads may not be censored + + $ hg update -C -r $H2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ hg censor -r $H2 target + abort: cannot censor file in heads (78a8fc215e79) + (clean/delete and commit first) + [255] + $ echo 'twiddling thumbs' > bystander + $ hg ci -m 'bystander commit' + $ H2=`hg id --debug -i` + $ hg censor -r "$H2^" target + abort: cannot censor file in heads (efbe78065929) + (clean/delete and commit first) + [255] + +Cannot censor working directory + + $ echo 'seriously no passwords' > target + $ hg ci -m 'extend second head arbitrarily' target + $ H2=`hg id --debug -i` + $ hg update -r "$H2^" + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + $ hg censor -r . 
target + abort: cannot censor working directory + (clean/delete/update first) + [255] + $ hg update -r $H2 + 1 files updated, 0 files merged, 0 files removed, 0 files unresolved + +Can re-add file after being deleted + censored + + $ C4=$H2 + $ hg rm target + $ hg ci -m 'delete target so it may be censored' + $ H2=`hg id --debug -i` + $ hg censor -r $C4 target + $ hg cat -r $C4 target + $ hg cat -r "$H2^^" target + Tainted file now super sanitized + $ echo 'fresh start' > target + $ hg add target + $ hg ci -m reincarnated target + $ H2=`hg id --debug -i` + $ hg cat -r $H2 target + fresh start + $ hg cat -r "$H2^" target + target: no such file in rev 452ec1762369 + [1] + $ hg cat -r $C4 target + $ hg cat -r "$H2^^^" target + Tainted file now super sanitized + +Can censor after revlog has expanded to no longer permit inline storage + + $ for x in `seq 0 50000` + > do + > echo "Password: hunter$x" >> target + > done + $ hg ci -m 'add 100k passwords' + $ H2=`hg id --debug -i` + $ C5=$H2 + $ hg revert -r "$H2^" target + $ hg ci -m 'cleaned 100k passwords' + $ H2=`hg id --debug -i` + $ hg censor -r $C5 target + $ hg cat -r $C5 target + $ hg cat -r $H2 target + fresh start diff -r 31edcea517c1 -r 1bcfecbbf569 tests/test-help.t --- a/tests/test-help.t Fri Mar 13 14:08:30 2015 -0700 +++ b/tests/test-help.t Sun Mar 15 21:52:17 2015 -0400 @@ -245,6 +245,7 @@ acl hooks for controlling repository access blackbox log repository events to a blackbox for debugging bugzilla hooks for integrating with the Bugzilla bug tracker + censor erase file content at a given revision churn command to display statistics about repository history color colorize output from some commands convert import revisions from foreign VCS repositories into