changeset 4765:b6a1f2c46c6c

convert extension: Add SVN converter
author Daniel Holth <dholth@fastmail.fm>
date Sun, 01 Jul 2007 23:56:11 +0200
parents 6a16ef0d1c7c
children 95cbb6b74790
files hgext/convert/__init__.py hgext/convert/hg.py hgext/convert/subversion.py
diffstat 3 files changed, 522 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/hgext/convert/__init__.py	Sun Jul 01 17:15:54 2007 +0200
+++ b/hgext/convert/__init__.py	Sun Jul 01 23:56:11 2007 +0200
@@ -193,6 +193,8 @@
     def copy(self, rev):
         c = self.commitcache[rev]
         files = self.source.getchanges(rev)
+        
+        do_copies = (hasattr(c, 'copies') and hasattr(self.dest, 'copyfile'))
 
         for f, v in files:
             try:
@@ -202,6 +204,11 @@
             else:
                 e = self.source.getmode(f, v)
                 self.dest.putfile(f, e, data)
+                if do_copies:
+                    if f in c.copies:
+                        # Merely marks that a copy happened.
+                        self.dest.copyfile(c.copies[f], f)
+
 
         r = [self.map[v] for v in c.parents]
         f = [f for f, v in files]
@@ -258,6 +265,7 @@
     Accepted source formats:
     - GIT
     - CVS
+    - SVN
 
     Accepted destination formats:
     - Mercurial
--- a/hgext/convert/hg.py	Sun Jul 01 17:15:54 2007 +0200
+++ b/hgext/convert/hg.py	Sun Jul 01 23:56:11 2007 +0200
@@ -29,6 +29,9 @@
         if self.repo.dirstate.state(f) == '?':
             self.repo.dirstate.update([f], "a")
 
+    def copyfile(self, source, dest):  # called by the converter after putfile() has written dest
+        self.repo.copy(source, dest)  # delegates to repo.copy; the caller only wants the copy marked
+
     def delfile(self, f):
         try:
             os.unlink(self.repo.wjoin(f))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/convert/subversion.py	Sun Jul 01 23:56:11 2007 +0200
@@ -0,0 +1,511 @@
+# Subversion 1.4/1.5 Python API backend
+#
+# Copyright(C) 2007 Daniel Holth et al
+
+import pprint
+import locale
+
+from mercurial import util
+
+# Subversion stuff. Works best with very recent Python SVN bindings
+# e.g. SVN 1.5 or backports. Thanks to the bzr folks for enhancing
+# these bindings.
+
+from svn.core import SubversionException, Pool
+import svn.core
+import svn.ra
+import svn.delta
+import svn
+import transport
+from cStringIO import StringIO
+
+from common import NoRepo, commit, converter_source, recode, nocommitmsg
+
+class CompatibilityException(Exception): pass  # raised in _find_children when get_dir hands back a dict
+
+nbRevisionsPerFetch = 50  # NOTE(review): not referenced anywhere else in this file
+
+class svn_entry(object):
+    """Emulate a Subversion path change."""
+    __slots__ = ['path', 'copyfrom_path', 'copyfrom_rev', 'action']
+    def __init__(self, entry):
+        self.copyfrom_path = entry.copyfrom_path
+        self.copyfrom_rev = entry.copyfrom_rev
+        self.action = entry.action
+
+    def __str__(self):
+        return "%s %s %s" % (self.action, self.copyfrom_path, self.copyfrom_rev)
+
+    def __repr__(self):
+        return self.__str__()
+
+class svn_paths(object):
+    """Emulate a Subversion ordered dictionary of changed paths."""
+    __slots__ = ['values', 'order']
+    def __init__(self, orig_paths):
+        self.order = []
+        self.values = {}
+        if hasattr(orig_paths, 'keys'):
+            self.order = sorted(orig_paths.keys())
+            self.values.update(orig_paths)
+            return
+        if not orig_paths:
+            return
+        for path in orig_paths:
+            self.order.append(path)
+            self.values[path] = svn_entry(orig_paths[path])
+        self.order.sort() # maybe the order it came in isn't so great...
+
+    def __iter__(self):
+        return iter(self.order)
+
+    def __getitem__(self, key):
+        return self.values[key]
+
+    def __str__(self):
+        s = "{\n"
+        for path in self.order:
+            s += "'%s': %s,\n" % (path, self.values[path])
+        s += "}"
+        return s
+    
+    def __repr__(self):
+        return self.__str__()
+
+# SVN conversion code stolen from bzr-svn and tailor
+class convert_svn(converter_source):
+    def __init__(self, ui, url):
+        self.ui = ui
+        self.encoding = locale.getpreferredencoding()
+        try:
+            # Support file://path@rev syntax. Useful e.g. to convert
+            # deleted branches.
+            url, latest = url.rsplit("@", 1)
+            latest = int(latest)
+        except ValueError, e:
+            latest = None
+        self.url = url
+        self.encoding = 'UTF-8' # Subversion is always nominal UTF-8
+        try:
+            self.transport = transport.SvnRaTransport(url = url)
+            self.ra = self.transport.ra
+            self.base = svn.ra.get_repos_root(self.ra)
+            self.module = self.url[len(self.base):]
+            self.modulemap = {} # revision, module
+            self.commits = {}
+            self.files = {}
+            self.uuid = svn.ra.get_uuid(self.ra).decode(self.encoding)
+        except SubversionException, e:
+            raise NoRepo("couldn't open SVN repo %s" % url)
+
+        try:
+            self.get_blacklist()
+        except IOError, e:
+            pass
+
+        if not latest:
+            latest = svn.ra.get_latest_revnum(self.ra)
+        dirent = svn.ra.stat(self.ra, self.module, latest)
+        self.last_changed = dirent.created_rev
+
+        self.head = self.rev(self.last_changed)
+
+        # Should lazily fetch revisions in batches of, say, 1,000...:
+        self._fetch_revisions(from_revnum=self.last_changed, to_revnum=0)
+
+    def rev(self, revnum):  # builds the "svn:<uuid><module>@<revnum>" id used as key in self.commits and self.files
+        return (u"svn:%s%s@%s" % (self.uuid, self.module, revnum)).decode(self.encoding)  # NOTE(review): .decode() on a unicode value - confirm this is intended
+            
+    def get_blacklist(self):
+        """Avoid certain revision numbers.
+        It is not uncommon for two nearby revisions to cancel each other
+        out, e.g. 'I copied trunk into a subdirectory of itself instead
+        of making a branch'. The converted repository is significantly
+        smaller if we ignore such revisions."""
+        self.blacklist = set()
+        blacklist = self.blacklist
+        for line in file("blacklist.txt", "r"):
+            if not line.startswith("#"):
+                try:
+                    svn_rev = int(line.strip())
+                    blacklist.add(svn_rev)
+                except ValueError, e:
+                    pass # not an integer or a comment
+
+    def is_blacklisted(self, svn_rev):  # svn_rev: integer revision number
+        return svn_rev in self.blacklist  # self.blacklist is the set created by get_blacklist()
+
+    def reparent(self, module):
+        svn_url = self.base + module
+        self.ui.debug("reparent to %s\n" % svn_url.encode(self.encoding))
+        svn.ra.reparent(self.ra, svn_url.encode(self.encoding))
+
+    def _fetch_revisions(self, from_revnum = 0, to_revnum = 347, pb=None):
+        self.parent_cset = None
+        self.child_cset = None
+        
+        self.ui.debug('Fetching revisions %d to %d\n' % (from_revnum, to_revnum))
+
+        def get_entry_from_path(path, module=self.module):
+            # Given the repository url of this wc, say
+            #   "http://server/plone/CMFPlone/branches/Plone-2_0-branch"
+            # extract the "entry" portion (a relative path) from what
+            # svn log --xml says, ie
+            #   "/CMFPlone/branches/Plone-2_0-branch/tests/PloneTestCase.py"
+            # that is to say "tests/PloneTestCase.py"
+
+            if path.startswith(module):
+                relative = path[len(module):]
+                if relative.startswith('/'):
+                    return relative[1:]
+                else:
+                    return relative
+
+            # The path is outside our tracked tree...
+            self.ui.debug('Ignoring %r since it is not under %r\n' % (path, module))
+            return None
+
+        received = []
+        def rcvr(*arg, **args):
+            orig_paths, revnum, author, date, message, pool = arg
+            new_orig_paths = svn_paths(orig_paths)
+            rcvr2(new_orig_paths, revnum, author, date, message, pool)
+
+        def rcvr2(orig_paths, revnum, author, date, message, pool, better_paths = None):
+            if not self.is_blacklisted(revnum):
+                received.append((orig_paths, revnum, author, date, message))
+           
+        def after_received(orig_paths, revnum, author, date, message):
+            if revnum == 1172:
+                import pdb
+                pdb.set_trace()
+            if revnum in self.modulemap:
+                new_module = self.modulemap[revnum]
+                if new_module != self.module:
+                    self.module = new_module
+                    self.reparent(self.module)
+
+            copyfrom = {} # Map of entrypath, revision for finding source of deleted revisions.
+            copies = {}
+            entries = []
+            self.ui.debug("Parsing revision %d\n" % revnum)
+            if orig_paths is not None:
+                rev = self.rev(revnum)
+                try:
+                    branch = self.module.split("/")[-1]
+                except IndexError:
+                    branch = None
+                
+                for path in orig_paths:
+                    # self.ui.write("path %s\n" % path)
+                    if path == self.module: # Follow branching back in history
+                        ent = orig_paths[path]
+                        if ent:
+                            if ent.copyfrom_path:
+                                self.modulemap[ent.copyfrom_rev] = ent.copyfrom_path
+                            else:
+                                self.ui.debug("No copyfrom path, don't know what to do.\n")
+                                # Maybe it was added and there is no more history.
+                    entrypath = get_entry_from_path(path, module=self.module)
+                    # self.ui.write("entrypath %s\n" % entrypath)
+                    if not entrypath:
+                        # Outside our area of interest
+                        self.ui.debug("boring@%s: %s\n" % (revnum, path))
+                        continue
+                    entry = entrypath.decode(self.encoding)
+                    ent = orig_paths[path]
+
+                    kind = svn.ra.check_path(self.ra, entrypath, revnum)
+                    if kind == svn.core.svn_node_file:
+                        if ent.copyfrom_path:
+                            copyfrom_path = get_entry_from_path(ent.copyfrom_path)
+                            if copyfrom_path:
+                                self.ui.debug("Copied to %s from %s@%s\n" % (entry, copyfrom_path, ent.copyfrom_rev))
+                                # It's probably important for hg that the source
+                                # exists in the revision's parent, not just the
+                                # ent.copyfrom_rev
+                                fromkind = svn.ra.check_path(self.ra, copyfrom_path, ent.copyfrom_rev)
+                                if fromkind != 0:
+                                    copies[self.recode(entry)] = self.recode(copyfrom_path)
+                        entries.append(self.recode(entry))
+                    elif kind == 0: # gone, but had better be a deleted *file*
+                        self.ui.debug("gone from %s\n" % ent.copyfrom_rev)
+
+                        fromrev = revnum - 1
+                        # might always need to be revnum - 1 in these 3 lines?
+                        old_module = self.modulemap.get(fromrev, self.module)
+                        basepath = old_module + "/" + get_entry_from_path(path, module=self.module)
+                        entrypath = old_module + "/" + get_entry_from_path(path, module=self.module)
+
+                        def lookup_parts(p):
+                            rc = None
+                            parts = p.split("/")
+                            for i in range(len(parts)):
+                                part = "/".join(parts[:i])
+                                info = part, copyfrom.get(part, None)
+                                if info[1] is not None:
+                                    self.ui.debug("Found parent directory %s\n" % info)
+                                    rc = info
+                            return rc
+
+                        self.ui.debug("base, entry %s %s\n" % (basepath, entrypath))
+
+                        frompath, froment = lookup_parts(entrypath) or (None, revnum - 1)
+
+                        # need to remove fragment from lookup_parts and replace with copyfrom_path
+                        if frompath is not None:
+                            self.ui.debug("munge-o-matic\n")
+                            self.ui.debug(entrypath + '\n')
+                            self.ui.debug(entrypath[len(frompath):] + '\n')
+                            entrypath = froment.copyfrom_path + entrypath[len(frompath):]
+                            fromrev = froment.copyfrom_rev
+                            self.ui.debug("Info: %s %s %s %s\n" % (frompath, froment, ent, entrypath))
+
+                        fromkind = svn.ra.check_path(self.ra, entrypath, fromrev)
+                        if fromkind == svn.core.svn_node_file:   # a deleted file
+                            entries.append(self.recode(entry))
+                        else:
+                            # print "Deleted/moved non-file:", revnum, path, ent
+                            # children = self._find_children(path, revnum - 1)
+                            # print "find children %s@%d from %d action %s" % (path, revnum, ent.copyfrom_rev, ent.action)
+                            # Sometimes this is tricky. For example: in
+                            # The Subversion Repository revision 6940 a dir
+                            # was copied and one of its files was deleted 
+                            # from the new location in the same commit. This
+                            # code can't deal with that yet.
+                            if ent.action == 'C':
+                                children = self._find_children(path, fromrev)
+                            else:
+                                oroot = entrypath.strip('/')
+                                nroot = path.strip('/')
+                                children = self._find_children(oroot, fromrev)
+                                children = [s.replace(oroot,nroot) for s in children]
+                            # Mark all [files, not directories] as deleted.
+                            for child in children:
+                                # Can we move a child directory and its
+                                # parent in the same commit? (probably can). Could
+                                # cause problems if instead of revnum -1, 
+                                # we have to look in (copyfrom_path, revnum - 1)
+                                entrypath = get_entry_from_path("/" + child, module=old_module)
+                                if entrypath:
+                                    entry = self.recode(entrypath.decode(self.encoding))
+                                    if entry in copies:
+                                        # deleted file within a copy
+                                        del copies[entry]
+                                    else:
+                                        entries.append(entry)
+                    elif kind == svn.core.svn_node_dir:
+                        # Should probably synthesize normal file entries
+                        # and handle as above to clean up copy/rename handling.
+
+                        # If the directory just had a prop change,
+                        # then we shouldn't need to look for its children.
+                        # Also this could create duplicate entries. Not sure
+                        # whether this will matter. Maybe should make entries a set.
+                        # print "Changed directory", revnum, path, ent.action, ent.copyfrom_path, ent.copyfrom_rev
+                        # This will fail if a directory was copied
+                        # from another branch and then some of its files
+                        # were deleted in the same transaction.
+                        children = self._find_children(path, revnum)
+                        children.sort()
+                        for child in children:
+                            # Can we move a child directory and its
+                            # parent in the same commit? (probably can). Could
+                            # cause problems if instead of revnum -1, 
+                            # we have to look in (copyfrom_path, revnum - 1)
+                            entrypath = get_entry_from_path("/" + child, module=self.module)
+                            # print child, self.module, entrypath
+                            if entrypath:
+                                # Need to filter out directories here...
+                                kind = svn.ra.check_path(self.ra, entrypath, revnum)
+                                if kind != svn.core.svn_node_dir:
+                                    entries.append(self.recode(entrypath))
+
+                        # Copies here (must copy all from source)
+                        # Probably not a real problem for us if
+                        # source does not exist
+
+                        # Can do this with the copy command "hg copy"
+                        # if ent.copyfrom_path:
+                        #     copyfrom_entry = get_entry_from_path(ent.copyfrom_path.decode(self.encoding),
+                        #             module=self.module)
+                        #     copyto_entry = entrypath
+                        #
+                        #     print "copy directory", copyfrom_entry, 'to', copyto_entry
+                        #
+                        #     copies.append((copyfrom_entry, copyto_entry))
+                        
+                        if ent.copyfrom_path:
+                            copyfrom_path = ent.copyfrom_path.decode(self.encoding)
+                            copyfrom_entry = get_entry_from_path(copyfrom_path, module=self.module)
+                            if copyfrom_entry:
+                                copyfrom[path] = ent
+                                self.ui.debug("mark %s came from %s\n" % (path, copyfrom[path]))
+
+                                # Good, /probably/ a regular copy. Really should check
+                                # to see whether the parent revision actually contains
+                                # the directory in question.
+                                children = self._find_children(self.recode(copyfrom_path), ent.copyfrom_rev)
+                                children.sort()
+                                for child in children:
+                                    entrypath = get_entry_from_path("/" + child, module=self.module)
+                                    if entrypath:
+                                        entry = entrypath.decode(self.encoding)
+                                        # print "COPY COPY From", copyfrom_entry, entry
+                                        copyto_path = path + entry[len(copyfrom_entry):]
+                                        copyto_entry =  get_entry_from_path(copyto_path, module=self.module)
+                                        # print "COPY", entry, "COPY To", copyto_entry
+                                        copies[self.recode(copyto_entry)] = self.recode(entry)
+                                        # copy from quux splort/quuxfile
+              
+                self.modulemap[revnum] = self.module # track backwards in time
+                # a list of (filename, id) where id lets us retrieve the file.
+                # eg in git, id is the object hash. for svn it'll be the 
+                self.files[rev] = zip(entries, [rev] * len(entries))
+
+                # Example SVN datetime. Includes microseconds.
+                # ISO-8601 conformant
+                # '2007-01-04T17:35:00.902377Z'
+                date = util.parsedate(date[:18] + " UTC", ["%Y-%m-%dT%H:%M:%S"])
+
+                log = message and self.recode(message) or nocommitmsg
+                author = author and self.recode(author) or ''
+
+                cset = commit(author=author,
+                        date=util.datestr(date), 
+                        desc=log, 
+                        parents=[],
+                        copies=copies,
+                        branch=branch)
+
+                if self.child_cset is not None:
+                    self.child_cset.parents = [rev]
+
+                self.child_cset = cset
+
+                self.commits[rev] = cset
+
+        try:
+            discover_changed_paths = True
+            strict_node_history = False
+            svn.ra.get_log(self.ra, [self.module], from_revnum, to_revnum, 
+                           0, discover_changed_paths, strict_node_history, rcvr)
+            for args in received:
+                after_received(*args)
+            self.last_revnum = to_revnum
+        except SubversionException, (_, num):
+            if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
+                raise NoSuchRevision(branch=self, 
+                    revision="Revision number %d" % to_revnum)
+            raise
+
+    def getheads(self):
+        # The only head is self.head, the rev id computed in __init__.
+        # NOTE(review): original remark: "not safe if someone committed" - presumably since __init__ ran; confirm.
+        self.heads = [self.head]
+        # The list is also kept on self.heads.
+        return self.heads
+
+    def _getfile(self, file, rev):  # returns (data, mode) where mode is '', 'x' or 'l'
+        io = StringIO()
+        # TODO: ra.get_file transmits the whole file instead of diffs.
+        mode = ''
+        try:
+            revnum = int(rev.split("@")[-1])  # the number after the last "@" of the rev id
+            if self.module != self.modulemap[revnum]:
+                self.module = self.modulemap[revnum]  # switch to the module recorded for revnum
+                self.reparent(self.module)
+            info = svn.ra.get_file(self.ra, file, revnum, io)
+            if isinstance(info, list):
+                info = info[-1]  # presumably the properties come last - TODO confirm per binding version
+            mode = ("svn:executable" in info) and 'x' or ''
+            mode = ("svn:special" in info) and 'l' or mode  # 'l' wins over 'x'
+        except SubversionException, e:
+            notfound = (svn.core.SVN_ERR_FS_NOT_FOUND,
+                svn.core.SVN_ERR_RA_DAV_PATH_NOT_FOUND)
+            if e.apr_err in notfound: # File not found
+                raise IOError()
+            raise
+        data = io.getvalue()
+        if mode == 'l':
+            link_prefix = "link "  # for mode 'l', data starting with this prefix is returned without it
+            if data.startswith(link_prefix):
+                data = data[len(link_prefix):]
+        return data, mode
+
+    def getfile(self, file, rev):  # returns the file data only
+        data, mode = self._getfile(file, rev)
+        self.modecache[(file, rev)] = mode  # remembered for a later getmode(file, rev)
+        return data
+
+    def getmode(self, file, rev):        # KeyError unless getfile(file, rev) ran since the last getchanges()
+        return self.modecache[(file, rev)]
+
+    def getchanges(self, rev):
+        self.modecache = {}
+        files = self.files[rev]
+        cl = files
+        cl.sort()
+        return cl
+
+    def getcommit(self, rev):  # rev: a rev id as produced by rev()
+        return self.commits[rev]  # commit object stored by _fetch_revisions
+
+    def gettags(self):  # no tags are reported for svn sources
+        return []
+
+    def _find_children(self, path, revnum):
+        path = path.strip("/")
+
+        def _find_children_fallback(path, revnum):
+            # SWIG python bindings for getdir are broken up to at least 1.4.3
+            if not hasattr(self, 'client_ctx'):
+                self.client_ctx = svn.client.create_context()
+            optrev = svn.core.svn_opt_revision_t()
+            optrev.kind = svn.core.svn_opt_revision_number
+            optrev.value.number = revnum
+            rpath = '/'.join([self.url, path]).strip('/')
+            return ['%s/%s' % (path, x) for x in svn.client.ls(rpath, optrev, True, self.client_ctx).keys()]
+
+        if hasattr(self, '_find_children_fallback'):
+            return _find_children_fallback(path, revnum)
+
+        self.reparent("/" + path)
+        pool = Pool()
+
+        children = []
+        def find_children_inner(children, path, revnum = revnum):
+            if hasattr(svn.ra, 'get_dir2'): # Since SVN 1.4
+                fields = 0xffffffff # Binding does not provide SVN_DIRENT_ALL
+                getdir = svn.ra.get_dir2(self.ra, path, revnum, fields, pool)
+            else:
+                getdir = svn.ra.get_dir(self.ra, path, revnum, pool)
+            if type(getdir) == dict:
+                # python binding for getdir is broken up to at least 1.4.3
+                raise CompatibilityException()
+            dirents = getdir[0]
+            if type(dirents) == int:
+                # got here once due to infinite recursion bug
+                # pprint.pprint(getdir)
+                return
+            c = dirents.keys()
+            c.sort()
+            for child in c:
+                dirent = dirents[child]
+                if dirent.kind == svn.core.svn_node_dir:
+                    find_children_inner(children, (path + "/" + child).strip("/"))
+                else:
+                    children.append((path + "/" + child).strip("/"))
+
+        try:
+            find_children_inner(children, "")
+        except CompatibilityException:
+            self._find_children_fallback = True
+            self.reparent(self.module)
+            return _find_children_fallback(path, revnum)
+
+        self.reparent(self.module)
+        return [path + "/" + c for c in children]
+
+    def recode(self, s):  # convenience wrapper binding self.encoding
+        return recode(self.encoding, s)  # module-level recode imported from common