contrib/shrink-revlog.py
changeset 19205 94a22595f702
parent 19204 e9c5b1c246dc
child 19206 6308896b1d4a
equal deleted inserted replaced
19204:e9c5b1c246dc 19205:94a22595f702
     1 """reorder a revlog (the manifest by default) to save space
       
     2 
       
     3 Specifically, this topologically sorts the revisions in the revlog so that
       
     4 revisions on the same branch are adjacent as much as possible. This is a
       
     5 workaround for the fact that Mercurial computes deltas relative to the
       
     6 previous revision rather than relative to a parent revision.
       
     7 
       
     8 This is *not* safe to run on a changelog.
       
     9 """
       
    10 
       
    11 # Originally written by Benoit Boissinot <benoit.boissinot at ens-lyon.org>
       
    12 # as a patch to rewrite-log. Cleaned up, refactored, documented, and
       
    13 # renamed by Greg Ward <greg at gerg.ca>.
       
    14 
       
    15 # XXX would be nice to have a way to verify the repository after shrinking,
       
    16 # e.g. by comparing "before" and "after" states of random changesets
       
    17 # (maybe: export before, shrink, export after, diff).
       
    18 
       
    19 import os, errno
       
    20 from mercurial import revlog, transaction, node, util, scmutil
       
    21 from mercurial import changegroup
       
    22 from mercurial.i18n import _
       
    23 
       
    24 
       
    25 def postorder(start, edges):
       
    26     result = []
       
    27     visit = list(start)
       
    28     finished = set()
       
    29 
       
    30     while visit:
       
    31         cur = visit[-1]
       
    32         for p in edges[cur]:
       
    33             # defend against node.nullrev because it's occasionally
       
    34             # possible for a node to have parents (null, something)
       
    35             # rather than (something, null)
       
    36             if p not in finished and p != node.nullrev:
       
    37                 visit.append(p)
       
    38                 break
       
    39         else:
       
    40             result.append(cur)
       
    41             finished.add(cur)
       
    42             visit.pop()
       
    43 
       
    44     return result
       
    45 
       
    46 def toposort_reversepostorder(ui, rl):
       
    47     # postorder of the reverse directed graph
       
    48 
       
    49     # map rev to list of parent revs (p2 first)
       
    50     parents = {}
       
    51     heads = set()
       
    52     ui.status(_('reading revs\n'))
       
    53     try:
       
    54         for rev in rl:
       
    55             ui.progress(_('reading'), rev, total=len(rl))
       
    56             (p1, p2) = rl.parentrevs(rev)
       
    57             if p1 == p2 == node.nullrev:
       
    58                 parents[rev] = ()       # root node
       
    59             elif p1 == p2 or p2 == node.nullrev:
       
    60                 parents[rev] = (p1,)    # normal node
       
    61             else:
       
    62                 parents[rev] = (p2, p1) # merge node
       
    63             heads.add(rev)
       
    64             for p in parents[rev]:
       
    65                 heads.discard(p)
       
    66     finally:
       
    67         ui.progress(_('reading'), None)
       
    68 
       
    69     heads = list(heads)
       
    70     heads.sort(reverse=True)
       
    71 
       
    72     ui.status(_('sorting revs\n'))
       
    73     return postorder(heads, parents)
       
    74 
       
    75 def toposort_postorderreverse(ui, rl):
       
    76     # reverse-postorder of the reverse directed graph
       
    77 
       
    78     children = {}
       
    79     roots = set()
       
    80     ui.status(_('reading revs\n'))
       
    81     try:
       
    82         for rev in rl:
       
    83             ui.progress(_('reading'), rev, total=len(rl))
       
    84             (p1, p2) = rl.parentrevs(rev)
       
    85             if p1 == p2 == node.nullrev:
       
    86                 roots.add(rev)
       
    87             children[rev] = []
       
    88             if p1 != node.nullrev:
       
    89                 children[p1].append(rev)
       
    90             if p2 != node.nullrev:
       
    91                 children[p2].append(rev)
       
    92     finally:
       
    93         ui.progress(_('reading'), None)
       
    94 
       
    95     roots = list(roots)
       
    96     roots.sort()
       
    97 
       
    98     ui.status(_('sorting revs\n'))
       
    99     result = postorder(roots, children)
       
   100     result.reverse()
       
   101     return result
       
   102 
       
   103 def writerevs(ui, repo, r1, r2, order, tr):
       
   104 
       
   105     ui.status(_('writing revs\n'))
       
   106 
       
   107 
       
   108     order = [r1.node(r) for r in order]
       
   109 
       
   110     # this is a bit ugly, but it works
       
   111     count = [0]
       
   112     def lookup(revl, x):
       
   113         count[0] += 1
       
   114         ui.progress(_('writing'), count[0], total=len(order))
       
   115         return "%020d" % revl.linkrev(revl.rev(x))
       
   116 
       
   117     unlookup = lambda x: int(x, 10)
       
   118 
       
   119     try:
       
   120         bundler = changegroup.bundle10(repo)
       
   121         bundler.start(lookup)
       
   122         group = util.chunkbuffer(bundler.group(order, r1))
       
   123         group = changegroup.unbundle10(group, "UN")
       
   124         r2.addgroup(group, unlookup, tr)
       
   125     finally:
       
   126         ui.progress(_('writing'), None)
       
   127 
       
   128 def report(ui, r1, r2):
       
   129     def getsize(r):
       
   130         s = 0
       
   131         for fn in (r.indexfile, r.datafile):
       
   132             try:
       
   133                 s += os.stat(fn).st_size
       
   134             except OSError, inst:
       
   135                 if inst.errno != errno.ENOENT:
       
   136                     raise
       
   137         return s
       
   138 
       
   139     oldsize = float(getsize(r1))
       
   140     newsize = float(getsize(r2))
       
   141 
       
   142     # argh: have to pass an int to %d, because a float >= 2^32
       
   143     # blows up under Python 2.5 or earlier
       
   144     ui.write(_('old file size: %12d bytes (%6.1f MiB)\n')
       
   145              % (int(oldsize), oldsize / 1024 / 1024))
       
   146     ui.write(_('new file size: %12d bytes (%6.1f MiB)\n')
       
   147              % (int(newsize), newsize / 1024 / 1024))
       
   148 
       
   149     shrink_percent = (oldsize - newsize) / oldsize * 100
       
   150     shrink_factor = oldsize / newsize
       
   151     ui.write(_('shrinkage: %.1f%% (%.1fx)\n')
       
   152              % (shrink_percent, shrink_factor))
       
   153 
       
   154 def shrink(ui, repo, **opts):
       
   155     """shrink a revlog by reordering revisions
       
   156 
       
   157     Rewrites all the entries in some revlog of the current repository
       
   158     (by default, the manifest log) to save space.
       
   159 
       
   160     Different sort algorithms have different performance
       
   161     characteristics.  Use ``--sort`` to select a sort algorithm so you
       
   162     can determine which works best for your data.
       
   163     """
       
   164 
       
   165     if not repo.local():
       
   166         raise util.Abort(_('not a local repository: %s') % repo.root)
       
   167 
       
   168     fn = opts.get('revlog')
       
   169     if not fn:
       
   170         indexfn = repo.sjoin('00manifest.i')
       
   171     else:
       
   172         if not fn.endswith('.i'):
       
   173             raise util.Abort(_('--revlog option must specify the revlog index '
       
   174                                'file (*.i), not %s') % opts.get('revlog'))
       
   175 
       
   176         indexfn = os.path.realpath(fn)
       
   177         store = repo.sjoin('')
       
   178         if not indexfn.startswith(store):
       
   179             raise util.Abort(_('--revlog option must specify a revlog in %s, '
       
   180                                'not %s') % (store, indexfn))
       
   181 
       
   182     sortname = opts['sort']
       
   183     try:
       
   184         toposort = globals()['toposort_' + sortname]
       
   185     except KeyError:
       
   186         raise util.Abort(_('no such toposort algorithm: %s') % sortname)
       
   187 
       
   188     if not os.path.exists(indexfn):
       
   189         raise util.Abort(_('no such file: %s') % indexfn)
       
   190     if '00changelog' in indexfn:
       
   191         raise util.Abort(_('shrinking the changelog '
       
   192                            'will corrupt your repository'))
       
   193 
       
   194     ui.write(_('shrinking %s\n') % indexfn)
       
   195     tmpindexfn = util.mktempcopy(indexfn, emptyok=True)
       
   196 
       
   197     r1 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), indexfn)
       
   198     r2 = revlog.revlog(scmutil.opener(os.getcwd(), audit=False), tmpindexfn)
       
   199 
       
   200     datafn, tmpdatafn = r1.datafile, r2.datafile
       
   201 
       
   202     oldindexfn = indexfn + '.old'
       
   203     olddatafn = datafn + '.old'
       
   204     if os.path.exists(oldindexfn) or os.path.exists(olddatafn):
       
   205         raise util.Abort(_('one or both of\n'
       
   206                            '  %s\n'
       
   207                            '  %s\n'
       
   208                            'exists from a previous run; please clean up '
       
   209                            'before running again') % (oldindexfn, olddatafn))
       
   210 
       
   211     # Don't use repo.transaction(), because then things get hairy with
       
   212     # paths: some need to be relative to .hg, and some need to be
       
   213     # absolute. Doing it this way keeps things simple: everything is an
       
   214     # absolute path.
       
   215     lock = repo.lock(wait=False)
       
   216     tr = transaction.transaction(ui.warn,
       
   217                                  open,
       
   218                                  repo.sjoin('journal'))
       
   219 
       
   220     def ignoremissing(func):
       
   221         def f(*args, **kw):
       
   222             try:
       
   223                 return func(*args, **kw)
       
   224             except OSError, inst:
       
   225                 if inst.errno != errno.ENOENT:
       
   226                     raise
       
   227         return f
       
   228 
       
   229     try:
       
   230         try:
       
   231             order = toposort(ui, r1)
       
   232 
       
   233             suboptimal = 0
       
   234             for i in xrange(1, len(order)):
       
   235                 parents = [p for p in r1.parentrevs(order[i])
       
   236                            if p != node.nullrev]
       
   237                 if parents and order[i - 1] not in parents:
       
   238                     suboptimal += 1
       
   239             ui.note(_('%d suboptimal nodes\n') % suboptimal)
       
   240 
       
   241             writerevs(ui, repo, r1, r2, order, tr)
       
   242             report(ui, r1, r2)
       
   243             tr.close()
       
   244         except: # re-raises
       
   245             # Abort transaction first, so we truncate the files before
       
   246             # deleting them.
       
   247             tr.abort()
       
   248             for fn in (tmpindexfn, tmpdatafn):
       
   249                 ignoremissing(os.unlink)(fn)
       
   250             raise
       
   251         if not opts.get('dry_run'):
       
   252             # racy, both files cannot be renamed atomically
       
   253             # copy files
       
   254             util.oslink(indexfn, oldindexfn)
       
   255             ignoremissing(util.oslink)(datafn, olddatafn)
       
   256 
       
   257             # rename
       
   258             util.rename(tmpindexfn, indexfn)
       
   259             try:
       
   260                 os.chmod(tmpdatafn, os.stat(datafn).st_mode)
       
   261                 util.rename(tmpdatafn, datafn)
       
   262             except OSError, inst:
       
   263                 if inst.errno != errno.ENOENT:
       
   264                     raise
       
   265                 ignoremissing(os.unlink)(datafn)
       
   266         else:
       
   267             for fn in (tmpindexfn, tmpdatafn):
       
   268                 ignoremissing(os.unlink)(fn)
       
   269     finally:
       
   270         lock.release()
       
   271 
       
   272     if not opts.get('dry_run'):
       
   273         ui.write(
       
   274             _('note: old revlog saved in:\n'
       
   275               '  %s\n'
       
   276               '  %s\n'
       
   277               '(You can delete those files when you are satisfied that your\n'
       
   278               'repository is still sane.  '
       
   279               'Running \'hg verify\' is strongly recommended.)\n')
       
   280             % (oldindexfn, olddatafn))
       
   281 
       
   282 cmdtable = {
       
   283     'shrink': (shrink,
       
   284                [('', 'revlog', '',
       
   285                  _('the revlog to shrink (.i)')),
       
   286                 ('n', 'dry-run', None,
       
   287                  _('do not shrink, simulate only')),
       
   288                 ('', 'sort', 'reversepostorder',
       
   289                  _('name of sort algorithm to use')),
       
   290                 ],
       
   291                _('hg shrink [--revlog PATH]'))
       
   292 }
       
   293 
       
   294 if __name__ == "__main__":
       
   295     print "shrink-revlog.py is now an extension (see hg help extensions)"