convert: drastically speed up git conversions
author David Schleimer <dschleimer@fb.com>
Tue, 27 May 2014 21:12:24 -0700
changeset 21630 a204fd9b5ba9
parent 21629 664c49420d48
child 21631 9bafe09285f2
convert: drastically speed up git conversions We would formerly exec git cat-file once for every commit, plus once for every tree and file we wanted to read. This switches to using git cat-file's batch mode, which is much, much, much faster. Using this new code, converting the git git repo to hg ran in 106 minutes on my machine. Using the stock mercurial, it required 1239 minutes. I believe this to be typical of the speedups we will see from this patch.
hgext/convert/git.py
--- a/hgext/convert/git.py	Sat May 03 19:11:51 2014 +0430
+++ b/hgext/convert/git.py	Tue May 27 21:12:24 2014 -0700
@@ -46,6 +46,18 @@
                     del os.environ['GIT_DIR']
                 else:
                     os.environ['GIT_DIR'] = prevgitdir
+
+        def gitpipe(self, s):
+            prevgitdir = os.environ.get('GIT_DIR')
+            os.environ['GIT_DIR'] = self.path
+            try:
+                return util.popen3(s)
+            finally:
+                if prevgitdir is None:
+                    del os.environ['GIT_DIR']
+                else:
+                    os.environ['GIT_DIR'] = prevgitdir
+
     else:
         def gitopen(self, s, err=None):
             if err == subprocess.PIPE:
@@ -56,6 +68,9 @@
             else:
                 return util.popen('GIT_DIR=%s %s' % (self.path, s), 'rb')
 
+        def gitpipe(self, s):
+            return util.popen3('GIT_DIR=%s %s' % (self.path, s))
+
     def popen_with_stderr(self, s):
         p = subprocess.Popen(s, shell=True, bufsize=-1,
                              close_fds=util.closefds,
@@ -84,6 +99,12 @@
         self.path = path
         self.submodules = []
 
+        self.catfilepipe = self.gitpipe('git cat-file --batch')
+
+    def after(self):
+        for f in self.catfilepipe:
+            f.close()
+
     def getheads(self):
         if not self.rev:
             heads, ret = self.gitread('git rev-parse --branches --remotes')
@@ -98,9 +119,17 @@
     def catfile(self, rev, type):
         if rev == hex(nullid):
             raise IOError
-        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
-        if ret:
+        self.catfilepipe[0].write(rev+'\n')
+        self.catfilepipe[0].flush()
+        info = self.catfilepipe[1].readline().split()
+        if info[1] != type:
             raise util.Abort(_('cannot read %r object at %s') % (type, rev))
+        size = int(info[2])
+        data = self.catfilepipe[1].read(size)
+        if len(data) < size:
+            raise util.Abort(_('cannot read %r object at %s: %s') % (type, rev))
+        # read the trailing newline
+        self.catfilepipe[1].read(1)
         return data
 
     def getfile(self, name, rev):