comparison hgext/censor.py @ 24347:1bcfecbbf569

censor: add censor command to hgext with basic client-side tests The censor command is a core extension which can replace the contents of a historical file revision with a censor "tombstone" which can be exchanged with older clients in place of the real revision data. The command rewrites the filelog by copying revision-by-revision. Care must be taken to expand the fulltext of the children of the censored revision before copying them to the new filelog; they might be stored as deltas against the uncensored revision, and those deltas will be invalidated. For more background on the censorship feature design, see: http://mercurial.selenic.com/wiki/CensorPlan
author Mike Edgar <adgar@google.com>
date Sun, 15 Mar 2015 21:52:17 -0400
parents
children 5e111acc1170
comparison
equal deleted inserted replaced
24346:31edcea517c1 24347:1bcfecbbf569
1 # Copyright (C) 2015 - Mike Edgar <adgar@google.com>
2 #
3 # This extension enables removal of file content at a given revision,
4 # rewriting the data/metadata of successive revisions to preserve revision log
5 # integrity.
6
7 """erase file content at a given revision
8
9 The censor command instructs Mercurial to erase all content of a file at a given
10 revision *without updating the changeset hash.* This allows existing history to
11 remain valid while preventing future clones/pulls from receiving the erased
12 data.
13
14 Typical uses for censor are due to security or legal requirements, including::
15
16 * Passwords, private keys, cryptographic material
17 * Licensed data/code/libraries for which the license has expired
18 * Personally Identifiable Information or other private data
19
20 Censored file revisions are listed in a tracked file called .hgcensored stored
21 in the repository root. The censor command adds an entry to the .hgcensored file
22 in the working directory and commits it (much like ``hg tag`` and .hgtags). The
23 censored file data is then replaced with a pointer to the new commit, enabling
24 verification.
25
26 Censored nodes can interrupt mercurial's typical operation whenever the excised
27 data needs to be materialized. Some commands, like ``hg cat``/``hg revert``,
28 simply fail when asked to produce censored data. Others, like ``hg verify`` and
29 ``hg update``, must be capable of tolerating censored data to continue to
30 function in a meaningful way. Such commands only tolerate censored file
31 revisions if they are allowed by the policy specified by the "censor.allow"
32 config option.
33 """
34
35 from mercurial.node import short
36 from mercurial import cmdutil, error, filelog, revlog, scmutil, util
37 from mercurial.i18n import _
38
# Extension metadata string.
# NOTE(review): presumably 'internal' marks the extension as tested with the
# Mercurial release it ships with -- confirm against the extension docs.
testedwith = 'internal'

# Table of commands provided by this extension, plus the decorator factory
# bound to it; the factory is applied below as @command(...).
# NOTE(review): presumably each decorated function is recorded in cmdtable --
# confirm against cmdutil.command.
cmdtable = {}
command = cmdutil.command(cmdtable)
42
43 @command('censor',
44 [('r', 'rev', '', _('censor file from specified revision'), _('REV')),
45 ('t', 'tombstone', '', _('replacement tombstone data'), _('TEXT'))],
46 _('-r REV [-t TEXT] [FILE]'))
47 def censor(ui, repo, path, rev='', tombstone='', **opts):
48 if not path:
49 raise util.Abort(_('must specify file path to censor'))
50 if not rev:
51 raise util.Abort(_('must specify revision to censor'))
52
53 flog = repo.file(path)
54 if not len(flog):
55 raise util.Abort(_('cannot censor file with no history'))
56
57 rev = scmutil.revsingle(repo, rev, rev).rev()
58 try:
59 ctx = repo[rev]
60 except KeyError:
61 raise util.Abort(_('invalid revision identifier %s') % rev)
62
63 try:
64 fctx = ctx.filectx(path)
65 except error.LookupError:
66 raise util.Abort(_('file does not exist at revision %s') % rev)
67
68 fnode = fctx.filenode()
69 headctxs = [repo[c] for c in repo.heads()]
70 heads = [c for c in headctxs if path in c and c.filenode(path) == fnode]
71 if heads:
72 headlist = ', '.join([short(c.node()) for c in heads])
73 raise util.Abort(_('cannot censor file in heads (%s)') % headlist,
74 hint=_('clean/delete and commit first'))
75
76 wctx = repo[None]
77 wp = wctx.parents()
78 if ctx.node() in [p.node() for p in wp]:
79 raise util.Abort(_('cannot censor working directory'),
80 hint=_('clean/delete/update first'))
81
82 flogv = flog.version & 0xFFFF
83 if flogv != revlog.REVLOGNG:
84 raise util.Abort(
85 _('censor does not support revlog version %d') % (flogv,))
86
87 tombstone = filelog.packmeta({"censored": tombstone}, "")
88
89 crev = fctx.filerev()
90
91 if len(tombstone) > flog.rawsize(crev):
92 raise util.Abort(_(
93 'censor tombstone must be no longer than censored data'))
94
95 # Using two files instead of one makes it easy to rewrite entry-by-entry
96 idxread = repo.svfs(flog.indexfile, 'r')
97 idxwrite = repo.svfs(flog.indexfile, 'wb', atomictemp=True)
98 if flog.version & revlog.REVLOGNGINLINEDATA:
99 dataread, datawrite = idxread, idxwrite
100 else:
101 dataread = repo.svfs(flog.datafile, 'r')
102 datawrite = repo.svfs(flog.datafile, 'wb', atomictemp=True)
103
104 # Copy all revlog data up to the entry to be censored.
105 rio = revlog.revlogio()
106 offset = flog.start(crev)
107
108 for chunk in util.filechunkiter(idxread, limit=crev * rio.size):
109 idxwrite.write(chunk)
110 for chunk in util.filechunkiter(dataread, limit=offset):
111 datawrite.write(chunk)
112
113 def rewriteindex(r, newoffs, newdata=None):
114 """Rewrite the index entry with a new data offset and optional new data.
115
116 The newdata argument, if given, is a tuple of three positive integers:
117 (new compressed, new uncompressed, added flag bits).
118 """
119 offlags, comp, uncomp, base, link, p1, p2, nodeid = flog.index[r]
120 flags = revlog.gettype(offlags)
121 if newdata:
122 comp, uncomp, nflags = newdata
123 flags |= nflags
124 offlags = revlog.offset_type(newoffs, flags)
125 e = (offlags, comp, uncomp, r, link, p1, p2, nodeid)
126 idxwrite.write(rio.packentry(e, None, flog.version, r))
127 idxread.seek(rio.size, 1)
128
129 def rewrite(r, offs, data, nflags=revlog.REVIDX_DEFAULT_FLAGS):
130 """Write the given full text to the filelog with the given data offset.
131
132 Returns:
133 The integer number of data bytes written, for tracking data offsets.
134 """
135 flag, compdata = flog.compress(data)
136 newcomp = len(flag) + len(compdata)
137 rewriteindex(r, offs, (newcomp, len(data), nflags))
138 datawrite.write(flag)
139 datawrite.write(compdata)
140 dataread.seek(flog.length(r), 1)
141 return newcomp
142
143 # Rewrite censored revlog entry with (padded) tombstone data.
144 pad = ' ' * (flog.rawsize(crev) - len(tombstone))
145 offset += rewrite(crev, offset, tombstone + pad, revlog.REVIDX_ISCENSORED)
146
147 # Rewrite all following filelog revisions fixing up offsets and deltas.
148 for srev in xrange(crev + 1, len(flog)):
149 if crev in flog.parentrevs(srev):
150 # Immediate children of censored node must be re-added as fulltext.
151 try:
152 revdata = flog.revision(srev)
153 except error.CensoredNodeError, e:
154 revdata = e.tombstone
155 dlen = rewrite(srev, offset, revdata)
156 else:
157 # Copy any other revision data verbatim after fixing up the offset.
158 rewriteindex(srev, offset)
159 dlen = flog.length(srev)
160 for chunk in util.filechunkiter(dataread, limit=dlen):
161 datawrite.write(chunk)
162 offset += dlen
163
164 idxread.close()
165 idxwrite.close()
166 if dataread is not idxread:
167 dataread.close()
168 datawrite.close()