1 """reorder a revlog (the manifest by default) to save space |
|
2 |
|
3 Specifically, this topologically sorts the revisions in the revlog so that |
|
4 revisions on the same branch are adjacent as much as possible. This is a |
|
5 workaround for the fact that Mercurial computes deltas relative to the |
|
6 previous revision rather than relative to a parent revision. |
|
7 |
|
8 This is *not* safe to run on a changelog. |
|
9 """ |
|
10 |
|
11 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org> |
|
12 # as a patch to rewrite-log. Cleaned up, refactored, documented, and |
|
13 # renamed by Greg Ward <greg at gerg.ca>. |
|
14 |
|
15 # XXX would be nice to have a way to verify the repository after shrinking, |
|
16 # e.g. by comparing "before" and "after" states of random changesets |
|
17 # (maybe: export before, shrink, export after, diff). |
|
18 |
|
19 import os, errno |
|
20 from mercurial import revlog, transaction, node, util, scmutil |
|
21 from mercurial import changegroup |
|
22 from mercurial.i18n import _ |
|
23 |
|
24 |
|
25 def postorder(start, edges): |
|
26 result = [] |
|
27 visit = list(start) |
|
28 finished = set() |
|
29 |
|
30 while visit: |
|
31 cur = visit[-1] |
|
32 for p in edges[cur]: |
|
33 # defend against node.nullrev because it's occasionally |
|
34 # possible for a node to have parents (null, something) |
|
35 # rather than (something, null) |
|
36 if p not in finished and p != node.nullrev: |
|
37 visit.append(p) |
|
38 break |
|
39 else: |
|
40 result.append(cur) |
|
41 finished.add(cur) |
|
42 visit.pop() |
|
43 |
|
44 return result |
|
45 |
|
46 def toposort_reversepostorder(ui, rl): |
|
47 # postorder of the reverse directed graph |
|
48 |
|
49 # map rev to list of parent revs (p2 first) |
|
50 parents = {} |
|
51 heads = set() |
|
52 ui.status(_('reading revs\n')) |
|
53 try: |
|
54 for rev in rl: |
|
55 ui.progress(_('reading'), rev, total=len(rl)) |
|
56 (p1, p2) = rl.parentrevs(rev) |
|
57 if p1 == p2 == node.nullrev: |
|
58 parents[rev] = () # root node |
|
59 elif p1 == p2 or p2 == node.nullrev: |
|
60 parents[rev] = (p1,) # normal node |
|
61 else: |
|
62 parents[rev] = (p2, p1) # merge node |
|
63 heads.add(rev) |
|
64 for p in parents[rev]: |
|
65 heads.discard(p) |
|
66 finally: |
|
67 ui.progress(_('reading'), None) |
|
68 |
|
69 heads = list(heads) |
|
70 heads.sort(reverse=True) |
|
71 |
|
72 ui.status(_('sorting revs\n')) |
|
73 return postorder(heads, parents) |
|
74 |
|
75 def toposort_postorderreverse(ui, rl): |
|
76 # reverse-postorder of the reverse directed graph |
|
77 |
|
78 children = {} |
|
79 roots = set() |
|
80 ui.status(_('reading revs\n')) |
|
81 try: |
|
82 for rev in rl: |
|
83 ui.progress(_('reading'), rev, total=len(rl)) |
|
84 (p1, p2) = rl.parentrevs(rev) |
|
85 if p1 == p2 == node.nullrev: |
|
86 roots.add(rev) |
|
87 children[rev] = [] |
|
88 if p1 != node.nullrev: |
|
89 children[p1].append(rev) |
|
90 if p2 != node.nullrev: |
|
91 children[p2].append(rev) |
|
92 finally: |
|
93 ui.progress(_('reading'), None) |
|
94 |
|
95 roots = list(roots) |
|
96 roots.sort() |
|
97 |
|
98 ui.status(_('sorting revs\n')) |
|
99 result = postorder(roots, children) |
|
100 result.reverse() |
|
101 return result |
|
102 |
|
103 def writerevs(ui, repo, r1, r2, order, tr): |
|
104 |
|
105 ui.status(_('writing revs\n')) |
|
106 |
|
107 |
|
108 order = [r1.node(r) for r in order] |
|
109 |
|
110 # this is a bit ugly, but it works |
|
111 count = [0] |
|
112 def lookup(revl, x): |
|
113 count[0] += 1 |
|
114 ui.progress(_('writing'), count[0], total=len(order)) |
|
115 return "%020d" % revl.linkrev(revl.rev(x)) |
|
116 |
|
117 unlookup = lambda x: int(x, 10) |
|
118 |
|
119 try: |
|
120 bundler = changegroup.bundle10(repo) |
|
121 bundler.start(lookup) |
|
122 group = util.chunkbuffer(bundler.group(order, r1)) |
|
123 group = changegroup.unbundle10(group, "UN") |
|
124 r2.addgroup(group, unlookup, tr) |
|
125 finally: |
|
126 ui.progress(_('writing'), None) |
|
127 |
|
128 def report(ui, r1, r2): |
|
129 def getsize(r): |
|
130 s = 0 |
|
131 for fn in (r.indexfile, r.datafile): |
|
132 try: |
|
133 s += os.stat(fn).st_size |
|
134 except OSError, inst: |
|
135 if inst.errno != errno.ENOENT: |
|
136 raise |
|
137 return s |
|
138 |
|
139 oldsize = float(getsize(r1)) |
|
140 newsize = float(getsize(r2)) |
|
141 |
|
142 # argh: have to pass an int to %d, because a float >= 2^32 |
|
143 # blows up under Python 2.5 or earlier |
|
144 ui.write(_('old file size: %12d bytes (%6.1f MiB)\n') |
|
145 % (int(oldsize), oldsize / 1024 / 1024)) |
|
146 ui.write(_('new file size: %12d bytes (%6.1f MiB)\n') |
|
147 % (int(newsize), newsize / 1024 / 1024)) |
|
148 |
|
149 shrink_percent = (oldsize - newsize) / oldsize * 100 |
|
150 shrink_factor = oldsize / newsize |
|
151 ui.write(_('shrinkage: %.1f%% (%.1fx)\n') |
|
152 % (shrink_percent, shrink_factor)) |
|
153 |
|
154 def shrink(ui, repo, **opts): |
|
155 """shrink a revlog by reordering revisions |
|
156 |
|
157 Rewrites all the entries in some revlog of the current repository |
|
158 (by default, the manifest log) to save space. |
|
159 |
|
160 Different sort algorithms have different performance |
|
161 characteristics. Use ``--sort`` to select a sort algorithm so you |
|
162 can determine which works best for your data. |
|
163 """ |
|
164 |
|
165 if not repo.local(): |
|
166 raise util.Abort(_('not a local repository: %s') % repo.root) |
|
167 |
|
168 fn = opts.get('revlog') |
|
169 if not fn: |
|
170 indexfn = repo.sjoin('00manifest.i') |
|
171 else: |
|
172 if not fn.endswith('.i'): |
|
173 raise util.Abort(_('--revlog option must specify the revlog index ' |
|
174 'file (*.i), not %s') % opts.get('revlog')) |
|
175 |
|
176 indexfn = os.path.realpath(fn) |
|
177 store = repo.sjoin('') |
|
178 if not indexfn.startswith(store): |
|
179 raise util.Abort(_('--revlog option must specify a revlog in %s, ' |
|
180 'not %s') % (store, indexfn)) |
|
181 |
|
182 sortname = opts['sort'] |
|
183 try: |
|
184 toposort = globals()['toposort_' + sortname] |
|
185 except KeyError: |
|
186 raise util.Abort(_('no such toposort algorithm: %s') % sortname) |
|
187 |
|
188 if not os.path.exists(indexfn): |
|
189 raise util.Abort(_('no such file: %s') % indexfn) |
|
190 if '00changelog' in indexfn: |
|
191 raise util.Abort(_('shrinking the changelog ' |
|
192 'will corrupt your repository')) |
|
193 |
|
194 ui.write(_('shrinking %s\n') % indexfn) |
|
195 tmpindexfn = util.mktempcopy(indexfn, emptyok=True) |
|
196 |
|
197 r1 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), indexfn) |
|
198 r2 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), tmpindexfn) |
|
199 |
|
200 datafn, tmpdatafn = r1.datafile, r2.datafile |
|
201 |
|
202 oldindexfn = indexfn + '.old' |
|
203 olddatafn = datafn + '.old' |
|
204 if os.path.exists(oldindexfn) or os.path.exists(olddatafn): |
|
205 raise util.Abort(_('one or both of\n' |
|
206 ' %s\n' |
|
207 ' %s\n' |
|
208 'exists from a previous run; please clean up ' |
|
209 'before running again') % (oldindexfn, olddatafn)) |
|
210 |
|
211 # Don't use repo.transaction(), because then things get hairy with |
|
212 # paths: some need to be relative to .hg, and some need to be |
|
213 # absolute. Doing it this way keeps things simple: everything is an |
|
214 # absolute path. |
|
215 lock = repo.lock(wait=False) |
|
216 tr = transaction.transaction(ui.warn, |
|
217 open, |
|
218 repo.sjoin('journal')) |
|
219 |
|
220 def ignoremissing(func): |
|
221 def f(*args, **kw): |
|
222 try: |
|
223 return func(*args, **kw) |
|
224 except OSError, inst: |
|
225 if inst.errno != errno.ENOENT: |
|
226 raise |
|
227 return f |
|
228 |
|
229 try: |
|
230 try: |
|
231 order = toposort(ui, r1) |
|
232 |
|
233 suboptimal = 0 |
|
234 for i in xrange(1, len(order)): |
|
235 parents = [p for p in r1.parentrevs(order[i]) |
|
236 if p != node.nullrev] |
|
237 if parents and order[i - 1] not in parents: |
|
238 suboptimal += 1 |
|
239 ui.note(_('%d suboptimal nodes\n') % suboptimal) |
|
240 |
|
241 writerevs(ui, repo, r1, r2, order, tr) |
|
242 report(ui, r1, r2) |
|
243 tr.close() |
|
244 except: # re-raises |
|
245 # Abort transaction first, so we truncate the files before |
|
246 # deleting them. |
|
247 tr.abort() |
|
248 for fn in (tmpindexfn, tmpdatafn): |
|
249 ignoremissing(os.unlink)(fn) |
|
250 raise |
|
251 if not opts.get('dry_run'): |
|
252 # racy, both files cannot be renamed atomically |
|
253 # copy files |
|
254 util.oslink(indexfn, oldindexfn) |
|
255 ignoremissing(util.oslink)(datafn, olddatafn) |
|
256 |
|
257 # rename |
|
258 util.rename(tmpindexfn, indexfn) |
|
259 try: |
|
260 os.chmod(tmpdatafn, os.stat(datafn).st_mode) |
|
261 util.rename(tmpdatafn, datafn) |
|
262 except OSError, inst: |
|
263 if inst.errno != errno.ENOENT: |
|
264 raise |
|
265 ignoremissing(os.unlink)(datafn) |
|
266 else: |
|
267 for fn in (tmpindexfn, tmpdatafn): |
|
268 ignoremissing(os.unlink)(fn) |
|
269 finally: |
|
270 lock.release() |
|
271 |
|
272 if not opts.get('dry_run'): |
|
273 ui.write( |
|
274 _('note: old revlog saved in:\n' |
|
275 ' %s\n' |
|
276 ' %s\n' |
|
277 '(You can delete those files when you are satisfied that your\n' |
|
278 'repository is still sane. ' |
|
279 'Running \'hg verify\' is strongly recommended.)\n') |
|
280 % (oldindexfn, olddatafn)) |
|
281 |
|
282 cmdtable = { |
|
283 'shrink': (shrink, |
|
284 [('', 'revlog', '', |
|
285 _('the revlog to shrink (.i)')), |
|
286 ('n', 'dry-run', None, |
|
287 _('do not shrink, simulate only')), |
|
288 ('', 'sort', 'reversepostorder', |
|
289 _('name of sort algorithm to use')), |
|
290 ], |
|
291 _('hg shrink [--revlog PATH]')) |
|
292 } |
|
293 |
|
294 if __name__ == "__main__": |
|
295 print "shrink-revlog.py is now an extension (see hg help extensions)" |
|