changeset 9729:aa9ccab5af37

Issue919: add a standard extension to recreate hardlinks between repositories. Having to run a standalone Python script from the contrib dir is a nuisance. Also makes various improvements to locking, file discovery, etc. Should also update: http://www.selenic.com/mercurial/wiki/index.cgi/RecreateHardlinksBetweenRepositories
author Jesse Glick <jesse.glick@sun.com>
date Thu, 05 Nov 2009 17:38:03 -0500
parents acb1c59b4514
children 732fc0e9d411
files contrib/hg-relink hgext/relink.py
diffstat 2 files changed, 149 insertions(+), 128 deletions(-) [+]
line wrap: on
line diff
--- a/contrib/hg-relink	Fri Nov 06 08:28:45 2009 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,128 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (C) 2007 Brendan Cully <brendan@kublai.com>
-#
-# This software may be used and distributed according to the terms of the
-# GNU General Public License version 2, incorporated herein by reference.
-
-import os, sys
-
-class ConfigError(Exception): pass
-
-def usage():
-    print """relink <source> <destination>
-    Recreate hard links between source and destination repositories"""
-
-class Config:
-    def __init__(self, args):
-        if len(args) != 3:
-            raise ConfigError("wrong number of arguments")
-        self.src = os.path.abspath(args[1])
-        self.dst = os.path.abspath(args[2])
-        for d in (self.src, self.dst):
-            if not os.path.exists(os.path.join(d, '.hg')):
-                raise ConfigError("%s: not a mercurial repository" % d)
-
-def collect(src):
-    seplen = len(os.path.sep)
-    candidates = []
-    for dirpath, dirnames, filenames in os.walk(src):
-        relpath = dirpath[len(src) + seplen:]
-        for filename in filenames:
-            if not filename.endswith('.i'):
-                continue
-            st = os.stat(os.path.join(dirpath, filename))
-            candidates.append((os.path.join(relpath, filename), st))
-
-    return candidates
-
-def prune(candidates, dst):
-    def getdatafile(path):
-        if not path.endswith('.i'):
-            return None, None
-        df = path[:-1] + 'd'
-        try:
-            st = os.stat(df)
-        except OSError:
-            return None, None
-        return df, st
-
-    def linkfilter(dst, st):
-        try:
-            ts = os.stat(dst)
-        except OSError:
-            # Destination doesn't have this file?
-            return False
-        if st.st_ino == ts.st_ino:
-            return False
-        if st.st_dev != ts.st_dev:
-            # No point in continuing
-            raise Exception('Source and destination are on different devices')
-        if st.st_size != ts.st_size:
-            # TODO: compare revlog heads
-            return False
-        return st
-
-    targets = []
-    for fn, st in candidates:
-        tgt = os.path.join(dst, fn)
-        ts = linkfilter(tgt, st)
-        if not ts:
-            continue
-        targets.append((fn, ts.st_size))
-        df, ts = getdatafile(tgt)
-        if df:
-            targets.append((fn[:-1] + 'd', ts.st_size))
-
-    return targets
-
-def relink(src, dst, files):
-    def relinkfile(src, dst):
-        bak = dst + '.bak'
-        os.rename(dst, bak)
-        try:
-            os.link(src, dst)
-        except OSError:
-            os.rename(bak, dst)
-            raise
-        os.remove(bak)
-
-    CHUNKLEN = 65536
-    relinked = 0
-    savedbytes = 0
-
-    for f, sz in files:
-        source = os.path.join(src, f)
-        tgt = os.path.join(dst, f)
-        sfp = file(source)
-        dfp = file(tgt)
-        sin = sfp.read(CHUNKLEN)
-        while sin:
-            din = dfp.read(CHUNKLEN)
-            if sin != din:
-                break
-            sin = sfp.read(CHUNKLEN)
-        if sin:
-            continue
-        try:
-            relinkfile(source, tgt)
-            print 'Relinked %s' % f
-            relinked += 1
-            savedbytes += sz
-        except OSError, inst:
-            print '%s: %s' % (tgt, str(inst))
-
-    print 'Relinked %d files (%d bytes reclaimed)' % (relinked, savedbytes)
-
-try:
-    cfg = Config(sys.argv)
-except ConfigError, inst:
-    print str(inst)
-    usage()
-    sys.exit(1)
-
-src = os.path.join(cfg.src, '.hg')
-dst = os.path.join(cfg.dst, '.hg')
-candidates = collect(src)
-targets = prune(candidates, dst)
-relink(src, dst, targets)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/relink.py	Thu Nov 05 17:38:03 2009 -0500
@@ -0,0 +1,149 @@
+# Mercurial extension to provide 'hg relink' command
+#
+# Copyright (C) 2007 Brendan Cully <brendan@kublai.com>
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2, incorporated herein by reference.
+
+"""recreates hardlinks between repository clones"""
+
+from mercurial import cmdutil, hg, util
+from mercurial.i18n import _
+import os, stat
+
+def relink(ui, repo, origin=None, **opts):
+    """recreate hardlinks between two repositories
+
+    When repositories are cloned locally, their data files will be hardlinked
+    so that they only use the space of a single repository.
+
+    Unfortunately, subsequent pulls into either repository will break hardlinks
+    for any files touched by the new changesets, even if both repositories end
+    up pulling the same changes.
+
+    Similarly, passing --rev to "hg clone" will fail to use
+    any hardlinks, falling back to a complete copy of the source repository.
+
+    This command lets you recreate those hardlinks and reclaim that wasted
+    space.
+
+    This repository will be relinked to share space with ORIGIN, which must be
+    on the same local disk. If ORIGIN is omitted, looks for "default-relink",
+    then "default", in [paths].
+
+    Do not attempt any read operations on this repository while the command is
+    running. (Both repositories will be locked against writes.)
+    """
+    src = hg.repository(
+        cmdutil.remoteui(repo, opts),
+        ui.expandpath(origin or 'default-relink', origin or 'default'))
+    if not src.local():
+        raise util.Abort(_('must specify local origin repository'))
+    ui.status(_('relinking %s to %s\n') % (src.store.path, repo.store.path))
+    locallock = repo.lock()
+    try:
+        remotelock = src.lock()
+        try:
+            candidates = collect(src.store.path, ui)
+            targets = prune(candidates, repo.store.path, ui)
+            do_relink(src.store.path, repo.store.path, targets, ui)
+        finally:
+            remotelock.release()
+    finally:
+        locallock.release()
+
+def collect(src, ui):
+    """Walk src; return (relative path, stat) for regular '.d'/'.i' files."""
+    prefixlen = len(src) + len(os.path.sep)
+    found = []
+    for root, dirs, names in os.walk(src):
+        reldir = root[prefixlen:]
+        for name in names:
+            if name[-2:] not in ('.d', '.i'):
+                continue
+            info = os.stat(os.path.join(root, name))
+            if stat.S_ISREG(info.st_mode):
+                found.append((os.path.join(reldir, name), info))
+
+    ui.status(_('collected %d candidate storage files\n') % len(found))
+    return found
+
+def prune(candidates, dst, ui):
+    """Return (path, size) of candidates worth comparing against dst."""
+    def linkfilter(dst, st):
+        try:
+            ts = os.stat(dst)
+        except OSError:
+            # Destination doesn't have this file?
+            return False
+        # Inode numbers are only unique per device, so check the device
+        # first; otherwise a cross-device inode collision hides the abort.
+        if st.st_dev != ts.st_dev:
+            # No point in continuing
+            raise util.Abort(
+                _('source and destination are on different devices'))
+        if st.st_ino == ts.st_ino or st.st_size != ts.st_size:
+            return False
+        return st
+
+    targets = []
+    for fn, st in candidates:
+        ts = linkfilter(os.path.join(dst, fn), st)
+        if not ts:
+            ui.debug(_('not linkable: %s\n') % fn)
+            continue
+        targets.append((fn, ts.st_size))
+
+    ui.status(_('pruned down to %d probably relinkable files\n') % len(targets))
+    return targets
+
+def do_relink(src, dst, files, ui):
+    """Hardlink each file in dst whose content equals its twin in src."""
+    def relinkfile(src, dst):
+        bak = dst + '.bak'
+        os.rename(dst, bak)
+        try:
+            os.link(src, dst)
+        except OSError:
+            os.rename(bak, dst)
+            raise
+        os.remove(bak)
+
+    CHUNKLEN = 65536
+    relinked = 0
+    savedbytes = 0
+    for pos, (f, sz) in enumerate(files):
+        source = os.path.join(src, f)
+        tgt = os.path.join(dst, f)
+        # Binary mode: the data is not text. Close both handles before
+        # relinking, since an open file cannot be renamed on Windows.
+        sfp = file(source, 'rb')
+        dfp = file(tgt, 'rb')
+        try:
+            sin = sfp.read(CHUNKLEN)
+            while sin and sin == dfp.read(CHUNKLEN):
+                sin = sfp.read(CHUNKLEN)
+        finally:
+            sfp.close()
+            dfp.close()
+        if sin:
+            ui.debug(_('not linkable: %s\n') % f)
+            continue
+        try:
+            relinkfile(source, tgt)
+            ui.progress(_('relink'), pos + 1, f, _(' files'), len(files))
+            relinked += 1
+            savedbytes += sz
+        except OSError, inst:
+            ui.warn(_('%s: %s\n') % (tgt, str(inst)))
+
+    ui.status(_('relinked %d files (%d bytes reclaimed)\n') %
+              (relinked, savedbytes))
+
+# Command table for this extension: maps the command name to a tuple of
+# (function, option list, synopsis). 'relink' declares no options.
+cmdtable = {
+    'relink': (
+        relink, [], _('[ORIGIN]')
+    )
+}