# HG changeset patch # User Mads Kiilerich # Date 1426783219 -3600 # Node ID 216fa1ba999333e86200d2608e3b78670168d516 # Parent 03163826b4e636bbbcaf60ebb23427b7f9543e19 convert: optimize convert of files that are unmodified from p2 in merges Conversion of a merge starts with p1 and re-adds the files that were changed in the merge or came unmodified from p2. Files that are unmodified from p1 will thus not be touched and take no time. Files that are unmodified from p2 would be retrieved and rehashed. They would end up getting the same hash as in p2 and end up reusing the filelog entry and look like the p1 case ... but it was slow. Instead, make getchanges also return 'files that are unmodified from p2' so the sink can reuse the existing p2 entry instead of calling getfile. Reuse of filelog entries can make a big difference when files are big and with long revlog chains so they take time to retrieve and hash, or when using an expensive custom getfile function (think http://mercurial.selenic.com/wiki/ConvertExtension#Customization with a code reformatter). This in combination with changes to reuse filectx entries in localrepo._filecommit makes 'unchanged from p2' almost as fast as 'unchanged from p1'. This is so far only implemented for the combination of hg source and hg sink. This is a refactoring/optimization. It is covered by existing tests and shows no changes - which is a good thing. 
diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/bzr.py --- a/hgext/convert/bzr.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/bzr.py Thu Mar 19 17:40:19 2015 +0100 @@ -143,7 +143,8 @@ parentids = self._parentids.pop(version) # only diff against first parent id prevtree = self.sourcerepo.revision_tree(parentids[0]) - return self._gettreechanges(self._revtree, prevtree) + files, changes = self._gettreechanges(self._revtree, prevtree) + return files, changes, set() def getcommit(self, version): rev = self.sourcerepo.get_revision(version) diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/common.py --- a/hgext/convert/common.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/common.py Thu Mar 19 17:40:19 2015 +0100 @@ -97,7 +97,7 @@ raise NotImplementedError def getchanges(self, version, full): - """Returns a tuple of (files, copies). + """Returns a tuple of (files, copies, cleanp2). files is a sorted list of (filename, id) tuples for all files changed between version and its first parent returned by @@ -105,6 +105,10 @@ id is the source revision id of the file. copies is a dictionary of dest: source + + cleanp2 is the set of files filenames that are clean against p2. + (Files that are clean against p1 are already not in files (unless + full). This makes it possible to handle p2 clean files similarly.) """ raise NotImplementedError @@ -215,7 +219,8 @@ mapping equivalent authors identifiers for each system.""" return None - def putcommit(self, files, copies, parents, commit, source, revmap, full): + def putcommit(self, files, copies, parents, commit, source, revmap, full, + cleanp2): """Create a revision with all changed files listed in 'files' and having listed parents. 'commit' is a commit object containing at a minimum the author, date, and message for this @@ -225,6 +230,8 @@ of source revisions to converted revisions. Only getfile() and lookuprev() should be called on 'source'. 
'full' means that 'files' is complete and all other files should be removed. + 'cleanp2' is a set of the filenames that are unchanged from p2 + (only in the common merge case where there two parents). Note that the sink repository is not told to update itself to a particular revision (or even what that revision would be) diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/convcmd.py --- a/hgext/convert/convcmd.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/convcmd.py Thu Mar 19 17:40:19 2015 +0100 @@ -397,7 +397,7 @@ dest = self.map[changes] self.map[rev] = dest return - files, copies = changes + files, copies, cleanp2 = changes pbranches = [] if commit.parents: for prev in commit.parents: @@ -413,6 +413,8 @@ parents = [self.map.get(p, p) for p in parents] except KeyError: parents = [b[0] for b in pbranches] + if len(pbranches) != 2: + cleanp2 = set() if len(parents) < 3: source = progresssource(self.ui, self.source, len(files)) else: @@ -423,7 +425,7 @@ source = progresssource(self.ui, self.source, len(files) * (len(parents) - 1)) newnode = self.dest.putcommit(files, copies, parents, commit, - source, self.map, full) + source, self.map, full, cleanp2) source.close() self.source.converted(rev, newnode) self.map[rev] = newnode diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/cvs.py --- a/hgext/convert/cvs.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/cvs.py Thu Mar 19 17:40:19 2015 +0100 @@ -262,7 +262,7 @@ if full: raise util.Abort(_("convert from cvs do not support --full")) self._parse() - return sorted(self.files[rev].iteritems()), {} + return sorted(self.files[rev].iteritems()), {}, set() def getcommit(self, rev): self._parse() diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/darcs.py --- a/hgext/convert/darcs.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/darcs.py Thu Mar 19 17:40:19 2015 +0100 @@ -188,7 +188,7 @@ changes.append((elt.text.strip(), rev)) self.pull(rev) self.lastrev = rev - return sorted(changes), copies + return 
sorted(changes), copies, set() def getfile(self, name, rev): if rev != self.lastrev: diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/filemap.py --- a/hgext/convert/filemap.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/filemap.py Thu Mar 19 17:40:19 2015 +0100 @@ -384,12 +384,15 @@ # Get the real changes and do the filtering/mapping. To be # able to get the files later on in getfile, we hide the # original filename in the rev part of the return value. - changes, copies = self.base.getchanges(rev, full) + changes, copies, cleanp2 = self.base.getchanges(rev, full) files = {} + ncleanp2 = set(cleanp2) for f, r in changes: newf = self.filemapper(f) if newf and (newf != f or newf not in files): files[newf] = (f, r) + if newf != f: + ncleanp2.discard(f) files = sorted(files.items()) ncopies = {} @@ -400,7 +403,7 @@ if newsource: ncopies[newc] = newsource - return files, ncopies + return files, ncopies, ncleanp2 def getfile(self, name, rev): realname, realrev = rev diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/git.py --- a/hgext/convert/git.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/git.py Thu Mar 19 17:40:19 2015 +0100 @@ -264,7 +264,7 @@ else: self.retrievegitmodules(version) changes.append(('.hgsubstate', '')) - return (changes, copies) + return (changes, copies, set()) def getcommit(self, version): c = self.catfile(version, "commit") # read the commit hash diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/gnuarch.py --- a/hgext/convert/gnuarch.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/gnuarch.py Thu Mar 19 17:40:19 2015 +0100 @@ -171,7 +171,7 @@ copies.update(cps) self.lastrev = rev - return sorted(set(changes)), copies + return sorted(set(changes)), copies, set() def getcommit(self, rev): changes = self.changes[rev] diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/hg.py --- a/hgext/convert/hg.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/hg.py Thu Mar 19 17:40:19 2015 +0100 @@ -132,9 +132,14 @@ fp.write('%s %s\n' % 
(revid, s[1])) return fp.getvalue() - def putcommit(self, files, copies, parents, commit, source, revmap, full): + def putcommit(self, files, copies, parents, commit, source, revmap, full, + cleanp2): files = dict(files) + def getfilectx(repo, memctx, f): + if p2ctx and f in cleanp2 and f not in copies: + self.ui.debug('reusing %s from p2\n' % f) + return p2ctx[f] try: v = files[f] except KeyError: @@ -199,6 +204,9 @@ while parents: p1 = p2 p2 = parents.pop(0) + p2ctx = None + if p2 != nullid: + p2ctx = self.repo[p2] fileset = set(files) if full: fileset.update(self.repo[p1]) @@ -382,9 +390,13 @@ # getcopies() is also run for roots and before filtering so missing # revlogs are detected early copies = self.getcopies(ctx, parents, copyfiles) + cleanp2 = set() + if len(parents) == 2: + cleanp2.update(self.repo.status(parents[1].node(), ctx.node(), + clean=True).clean) changes = [(f, rev) for f in files if f not in self.ignored] changes.sort() - return changes, copies + return changes, copies, cleanp2 def getcopies(self, ctx, parents, files): copies = {} diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/monotone.py --- a/hgext/convert/monotone.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/monotone.py Thu Mar 19 17:40:19 2015 +0100 @@ -280,7 +280,7 @@ for fromfile in renamed.values(): files[fromfile] = rev - return (files.items(), copies) + return (files.items(), copies, set()) def getfile(self, name, rev): if not self.mtnisfile(name, rev): diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/p4.py --- a/hgext/convert/p4.py Thu Mar 19 17:36:17 2015 +0100 +++ b/hgext/convert/p4.py Thu Mar 19 17:40:19 2015 +0100 @@ -195,7 +195,7 @@ def getchanges(self, rev, full): if full: raise util.Abort(_("convert from p4 do not support --full")) - return self.files[rev], {} + return self.files[rev], {}, set() def getcommit(self, rev): return self.changeset[rev] diff -r 03163826b4e6 -r 216fa1ba9993 hgext/convert/subversion.py --- a/hgext/convert/subversion.py Thu Mar 19 17:36:17 
2015 +0100 +++ b/hgext/convert/subversion.py Thu Mar 19 17:40:19 2015 +0100 @@ -474,7 +474,7 @@ (files, copies) = self._getchanges(rev, full) # caller caches the result, so free it here to release memory del self.paths[rev] - return (files, copies) + return (files, copies, set()) def getchangedfiles(self, rev, i): # called from filemap - cache computed values for reuse in getchanges @@ -1240,7 +1240,8 @@ def revid(self, rev): return u"svn:%s@%s" % (self.uuid, rev) - def putcommit(self, files, copies, parents, commit, source, revmap, full): + def putcommit(self, files, copies, parents, commit, source, revmap, full, + cleanp2): for parent in parents: try: return self.revid(self.childmap[parent])