diff hgext/convert/git.py @ 22470:8e0c4df28eec

convert: add support to detect git renames and copies Git is fairly unique among VCSes in that it doesn't record copies and renames, instead choosing to detect them on the fly. Since Mercurial expects copies and renames to be recorded, it can be valuable to preserve this history while converting a Git repository to Mercurial. This patch adds a new convert option, called 'convert.git.similarity', which determines how similar files must be to be treated as renames or copies.
author Siddharth Agarwal <sid0@fb.com>
date Fri, 12 Sep 2014 11:23:26 -0700
parents 15bc0431476b
children cc5f94db672b
line wrap: on
line diff
--- a/hgext/convert/git.py	Thu Sep 11 23:57:49 2014 -0700
+++ b/hgext/convert/git.py	Fri Sep 12 11:23:26 2014 -0700
@@ -94,6 +94,17 @@
         if not os.path.exists(path + "/objects"):
             raise NoRepo(_("%s does not look like a Git repository") % path)
 
+        try:
+            similarity = int(ui.config('convert', 'git.similarity') or 0)
+        except ValueError:
+            raise util.Abort('convert.git.similarity must be a number')
+        if similarity < 0 or similarity > 100:
+            raise util.Abort(_('similarity must be between 0 and 100'))
+        if similarity > 0:
+            self.simopt = '--find-copies=%d%%' % similarity
+        else:
+            self.simopt = ''
+
         checktool('git', 'git')
 
         self.path = path
@@ -184,8 +195,10 @@
         if full:
             raise util.Abort(_("convert from git do not support --full"))
         self.modecache = {}
-        fh = self.gitopen("git diff-tree -z --root -m -r %s" % version)
+        fh = self.gitopen("git diff-tree -z --root -m -r %s %s" % (
+            self.simopt, version))
         changes = []
+        copies = {}
         seen = set()
         entry = None
         subexists = [False]
@@ -194,15 +207,16 @@
         lcount = len(difftree)
         i = 0
 
-        def add(entry, f):
+        def add(entry, f, isdest):
             seen.add(f)
             h = entry[3]
             p = (entry[1] == "100755")
             s = (entry[1] == "120000")
+            renamesource = (not isdest and entry[4][0] == 'R')
 
             if f == '.gitmodules':
                 subexists[0] = True
-                if entry[4] == 'D':
+                if entry[4] == 'D' or renamesource:
                     subdeleted[0] = True
                     changes.append(('.hgsub', hex(nullid)))
                 else:
@@ -210,6 +224,8 @@
             elif entry[1] == '160000' or entry[0] == ':160000':
                 subexists[0] = True
             else:
+                if renamesource:
+                    h = hex(nullid)
                 self.modecache[(f, h)] = (p and "x") or (s and "l") or ""
                 changes.append((f, h))
 
@@ -223,7 +239,19 @@
                 continue
             f = l
             if f not in seen:
-                add(entry, f)
+                add(entry, f, False)
+            # A file can be copied multiple times, or modified and copied
+            # simultaneously. So f can be repeated even if fdest isn't.
+            if entry[4][0] in 'RC':
+                # rename or copy: next line is the destination
+                fdest = difftree[i]
+                i += 1
+                if fdest not in seen:
+                    add(entry, fdest, True)
+                    # .gitmodules isn't imported at all, so it being copied to
+                    # and fro doesn't really make sense
+                    if f != '.gitmodules' and fdest != '.gitmodules':
+                        copies[fdest] = f
             entry = None
         if fh.close():
             raise util.Abort(_('cannot read changes in %s') % version)
@@ -234,7 +262,7 @@
             else:
                 self.retrievegitmodules(version)
                 changes.append(('.hgsubstate', ''))
-        return (changes, {})
+        return (changes, copies)
 
     def getcommit(self, version):
         c = self.catfile(version, "commit") # read the commit hash