changeset 21630:a204fd9b5ba9

convert: drastically speed up git conversions We would formerly exec git cat-file once for every commit, plus once for every tree and file we wanted to read. This switches to using git cat-file's batch mode, which is much, much, much faster. Using this new code, converting the git git repo to hg ran in 106 minutes on my machine. Using the stock mercurial, it required 1239 minutes. I believe this to be typical of the speedups we will see from this patch.
author David Schleimer <dschleimer@fb.com>
date Tue, 27 May 2014 21:12:24 -0700
parents 664c49420d48
children 9bafe09285f2
files hgext/convert/git.py
diffstat 1 files changed, 31 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/hgext/convert/git.py	Sat May 03 19:11:51 2014 +0430
+++ b/hgext/convert/git.py	Tue May 27 21:12:24 2014 -0700
@@ -46,6 +46,18 @@
                     del os.environ['GIT_DIR']
                 else:
                     os.environ['GIT_DIR'] = prevgitdir
+
+        def gitpipe(self, s):
+            prevgitdir = os.environ.get('GIT_DIR')
+            os.environ['GIT_DIR'] = self.path
+            try:
+                return util.popen3(s)
+            finally:
+                if prevgitdir is None:
+                    del os.environ['GIT_DIR']
+                else:
+                    os.environ['GIT_DIR'] = prevgitdir
+
     else:
         def gitopen(self, s, err=None):
             if err == subprocess.PIPE:
@@ -56,6 +68,9 @@
             else:
                 return util.popen('GIT_DIR=%s %s' % (self.path, s), 'rb')
 
+        def gitpipe(self, s):
+            return util.popen3('GIT_DIR=%s %s' % (self.path, s))
+
     def popen_with_stderr(self, s):
         p = subprocess.Popen(s, shell=True, bufsize=-1,
                              close_fds=util.closefds,
@@ -84,6 +99,12 @@
         self.path = path
         self.submodules = []
 
+        self.catfilepipe = self.gitpipe('git cat-file --batch')
+
+    def after(self):
+        for f in self.catfilepipe:
+            f.close()
+
     def getheads(self):
         if not self.rev:
             heads, ret = self.gitread('git rev-parse --branches --remotes')
@@ -98,9 +119,17 @@
     def catfile(self, rev, type):
         if rev == hex(nullid):
             raise IOError
-        data, ret = self.gitread("git cat-file %s %s" % (type, rev))
-        if ret:
+        self.catfilepipe[0].write(rev+'\n')
+        self.catfilepipe[0].flush()
+        info = self.catfilepipe[1].readline().split()
+        if info[1] != type:
             raise util.Abort(_('cannot read %r object at %s') % (type, rev))
+        size = int(info[2])
+        data = self.catfilepipe[1].read(size)
+        if len(data) < size:
+            raise util.Abort(_('cannot read %r object at %s: unexpected size') % (type, rev))
+        # read the trailing newline
+        self.catfilepipe[1].read(1)
         return data
 
     def getfile(self, name, rev):