view hgext/convert/subversion.py @ 8250:1b60efdb8bc5

convert: add copyright and license headers to back-ends
author Martin Geisler <mg@lazybytes.net>
date Sun, 26 Apr 2009 01:47:44 +0200
parents d093e57671ea
children f55869abb5c3
line wrap: on
line source

# Subversion 1.4/1.5 Python API backend
#
# Copyright(C) 2007 Daniel Holth et al
#
# Configuration options:
#
# convert.svn.trunk
#   Relative path to the trunk (default: "trunk")
# convert.svn.branches
#   Relative path to tree of branches (default: "branches")
# convert.svn.tags
#   Relative path to tree of tags (default: "tags")
#
# Set these in a hgrc, or on the command line as follows:
#
#   hg convert --config convert.svn.trunk=wackoname [...]

import locale
import os
import re
import sys
import cPickle as pickle
import tempfile
import urllib

from mercurial import strutil, util
from mercurial.i18n import _

# Subversion stuff. Works best with very recent Python SVN bindings
# e.g. SVN 1.5 or backports. Thanks to the bzr folks for enhancing
# these bindings.

from cStringIO import StringIO

from common import NoRepo, MissingTool, commit, encodeargs, decodeargs
from common import commandline, converter_source, converter_sink, mapfile

try:
    from svn.core import SubversionException, Pool
    import svn
    import svn.client
    import svn.core
    import svn.ra
    import svn.delta
    import transport
    import warnings
    warnings.filterwarnings('ignore',
            module='svn.core',
            category=DeprecationWarning)

except ImportError:
    pass

class SvnPathNotFound(Exception):
    pass

def geturl(path):
    try:
        return svn.client.url_from_path(svn.core.svn_path_canonicalize(path))
    except SubversionException:
        pass
    if os.path.isdir(path):
        path = os.path.normpath(os.path.abspath(path))
        if os.name == 'nt':
            path = '/' + util.normpath(path)
        return 'file://%s' % urllib.quote(path)
    return path

def optrev(number):
    optrev = svn.core.svn_opt_revision_t()
    optrev.kind = svn.core.svn_opt_revision_number
    optrev.value.number = number
    return optrev

class changedpath(object):
    def __init__(self, p):
        self.copyfrom_path = p.copyfrom_path
        self.copyfrom_rev = p.copyfrom_rev
        self.action = p.action

def get_log_child(fp, url, paths, start, end, limit=0, discover_changed_paths=True,
                    strict_node_history=False):
    protocol = -1
    def receiver(orig_paths, revnum, author, date, message, pool):
        if orig_paths is not None:
            for k, v in orig_paths.iteritems():
                orig_paths[k] = changedpath(v)
        pickle.dump((orig_paths, revnum, author, date, message),
                    fp, protocol)

    try:
        # Use an ra of our own so that our parent can consume
        # our results without confusing the server.
        t = transport.SvnRaTransport(url=url)
        svn.ra.get_log(t.ra, paths, start, end, limit,
                       discover_changed_paths,
                       strict_node_history,
                       receiver)
    except SubversionException, (inst, num):
        pickle.dump(num, fp, protocol)
    except IOError:
        # Caller may interrupt the iteration
        pickle.dump(None, fp, protocol)
    else:
        pickle.dump(None, fp, protocol)
    fp.close()
    # With large history, cleanup process goes crazy and suddenly
    # consumes *huge* amount of memory. The output file being closed,
    # there is no need for clean termination.
    os._exit(0)

def debugsvnlog(ui, **opts):
    """Fetch SVN log in a subprocess and channel them back to parent to
    avoid memory collection issues.
    """
    util.set_binary(sys.stdin)
    util.set_binary(sys.stdout)
    args = decodeargs(sys.stdin.read())
    get_log_child(sys.stdout, *args)

class logstream:
    """Interruptible revision log iterator."""
    def __init__(self, stdout):
        self._stdout = stdout

    def __iter__(self):
        while True:
            entry = pickle.load(self._stdout)
            try:
                orig_paths, revnum, author, date, message = entry
            except:
                if entry is None:
                    break
                raise SubversionException("child raised exception", entry)
            yield entry

    def close(self):
        if self._stdout:
            self._stdout.close()
            self._stdout = None


# Check to see if the given path is a local Subversion repo. Verify this by
# looking for several svn-specific files and directories in the given
# directory.
def filecheck(path, proto):
    for x in ('locks', 'hooks', 'format', 'db', ):
        if not os.path.exists(os.path.join(path, x)):
            return False
    return True

# Check to see if a given path is the root of an svn repo over http. We verify
# this by requesting a version-controlled URL we know can't exist and looking
# for the svn-specific "not found" XML.
def httpcheck(path, proto):
    return ('<m:human-readable errcode="160013">' in
            urllib.urlopen('%s://%s/!svn/ver/0/.svn' % (proto, path)).read())

protomap = {'http': httpcheck,
            'https': httpcheck,
            'file': filecheck,
            }
def issvnurl(url):
    if not '://' in url:
        return False
    proto, path = url.split('://', 1)
    path = urllib.url2pathname(path).replace(os.sep, '/')
    check = protomap.get(proto, lambda p, p2: False)
    while '/' in path:
        if check(path, proto):
            return True
        path = path.rsplit('/', 1)[0]
    return False

# SVN conversion code stolen from bzr-svn and tailor
#
# Subversion looks like a versioned filesystem, branches structures
# are defined by conventions and not enforced by the tool. First,
# we define the potential branches (modules) as "trunk" and "branches"
# children directories. Revisions are then identified by their
# module and revision number (and a repository identifier).
#
# The revision graph is really a tree (or a forest). By default, a
# revision parent is the previous revision in the same module. If the
# module directory is copied/moved from another module then the
# revision is the module root and its parent the source revision in
# the parent module. A revision has at most one parent.
#
class svn_source(converter_source):
    def __init__(self, ui, url, rev=None):
        super(svn_source, self).__init__(ui, url, rev=rev)

        if not (url.startswith('svn://') or url.startswith('svn+ssh://') or
                (os.path.exists(url) and
                 os.path.exists(os.path.join(url, '.svn'))) or
                issvnurl(url)):
            raise NoRepo("%s does not look like a Subversion repo" % url)

        try:
            SubversionException
        except NameError:
            raise MissingTool(_('Subversion python bindings could not be loaded'))

        try:
            version = svn.core.SVN_VER_MAJOR, svn.core.SVN_VER_MINOR
            if version < (1, 4):
                raise MissingTool(_('Subversion python bindings %d.%d found, '
                                    '1.4 or later required') % version)
        except AttributeError:
            raise MissingTool(_('Subversion python bindings are too old, 1.4 '
                                'or later required'))

        self.encoding = locale.getpreferredencoding()
        self.lastrevs = {}

        latest = None
        try:
            # Support file://path@rev syntax. Useful e.g. to convert
            # deleted branches.
            at = url.rfind('@')
            if at >= 0:
                latest = int(url[at+1:])
                url = url[:at]
        except ValueError:
            pass
        self.url = geturl(url)
        self.encoding = 'UTF-8' # Subversion is always nominal UTF-8
        try:
            self.transport = transport.SvnRaTransport(url=self.url)
            self.ra = self.transport.ra
            self.ctx = self.transport.client
            self.baseurl = svn.ra.get_repos_root(self.ra)
            # Module is either empty or a repository path starting with
            # a slash and not ending with a slash.
            self.module = urllib.unquote(self.url[len(self.baseurl):])
            self.prevmodule = None
            self.rootmodule = self.module
            self.commits = {}
            self.paths = {}
            self.uuid = svn.ra.get_uuid(self.ra).decode(self.encoding)
        except SubversionException:
            ui.traceback()
            raise NoRepo("%s does not look like a Subversion repo" % self.url)

        if rev:
            try:
                latest = int(rev)
            except ValueError:
                raise util.Abort(_('svn: revision %s is not an integer') % rev)

        self.startrev = self.ui.config('convert', 'svn.startrev', default=0)
        try:
            self.startrev = int(self.startrev)
            if self.startrev < 0:
                self.startrev = 0
        except ValueError:
            raise util.Abort(_('svn: start revision %s is not an integer')
                             % self.startrev)

        try:
            self.get_blacklist()
        except IOError:
            pass

        self.head = self.latest(self.module, latest)
        if not self.head:
            raise util.Abort(_('no revision found in module %s') %
                             self.module.encode(self.encoding))
        self.last_changed = self.revnum(self.head)

        self._changescache = None

        if os.path.exists(os.path.join(url, '.svn/entries')):
            self.wc = url
        else:
            self.wc = None
        self.convertfp = None

    def setrevmap(self, revmap):
        lastrevs = {}
        for revid in revmap.iterkeys():
            uuid, module, revnum = self.revsplit(revid)
            lastrevnum = lastrevs.setdefault(module, revnum)
            if revnum > lastrevnum:
                lastrevs[module] = revnum
        self.lastrevs = lastrevs

    def exists(self, path, optrev):
        try:
            svn.client.ls(self.url.rstrip('/') + '/' + urllib.quote(path),
                                 optrev, False, self.ctx)
            return True
        except SubversionException:
            return False

    def getheads(self):

        def isdir(path, revnum):
            kind = self._checkpath(path, revnum)
            return kind == svn.core.svn_node_dir

        def getcfgpath(name, rev):
            cfgpath = self.ui.config('convert', 'svn.' + name)
            if cfgpath is not None and cfgpath.strip() == '':
                return None
            path = (cfgpath or name).strip('/')
            if not self.exists(path, rev):
                if cfgpath:
                    raise util.Abort(_('expected %s to be at %r, but not found')
                                 % (name, path))
                return None
            self.ui.note(_('found %s at %r\n') % (name, path))
            return path

        rev = optrev(self.last_changed)
        oldmodule = ''
        trunk = getcfgpath('trunk', rev)
        self.tags = getcfgpath('tags', rev)
        branches = getcfgpath('branches', rev)

        # If the project has a trunk or branches, we will extract heads
        # from them. We keep the project root otherwise.
        if trunk:
            oldmodule = self.module or ''
            self.module += '/' + trunk
            self.head = self.latest(self.module, self.last_changed)
            if not self.head:
                raise util.Abort(_('no revision found in module %s') %
                                 self.module.encode(self.encoding))

        # First head in the list is the module's head
        self.heads = [self.head]
        if self.tags is not None:
            self.tags = '%s/%s' % (oldmodule , (self.tags or 'tags'))

        # Check if branches bring a few more heads to the list
        if branches:
            rpath = self.url.strip('/')
            branchnames = svn.client.ls(rpath + '/' + urllib.quote(branches),
                                        rev, False, self.ctx)
            for branch in branchnames.keys():
                module = '%s/%s/%s' % (oldmodule, branches, branch)
                if not isdir(module, self.last_changed):
                    continue
                brevid = self.latest(module, self.last_changed)
                if not brevid:
                    self.ui.note(_('ignoring empty branch %s\n') %
                                   branch.encode(self.encoding))
                    continue
                self.ui.note(_('found branch %s at %d\n') %
                             (branch, self.revnum(brevid)))
                self.heads.append(brevid)

        if self.startrev and self.heads:
            if len(self.heads) > 1:
                raise util.Abort(_('svn: start revision is not supported '
                                   'with more than one branch'))
            revnum = self.revnum(self.heads[0])
            if revnum < self.startrev:
                raise util.Abort(_('svn: no revision found after start revision %d')
                                 % self.startrev)

        return self.heads

    def getfile(self, file, rev):
        data, mode = self._getfile(file, rev)
        self.modecache[(file, rev)] = mode
        return data

    def getmode(self, file, rev):
        return self.modecache[(file, rev)]

    def getchanges(self, rev):
        if self._changescache and self._changescache[0] == rev:
            return self._changescache[1]
        self._changescache = None
        self.modecache = {}
        (paths, parents) = self.paths[rev]
        if parents:
            files, copies = self.expandpaths(rev, paths, parents)
        else:
            # Perform a full checkout on roots
            uuid, module, revnum = self.revsplit(rev)
            entries = svn.client.ls(self.baseurl + urllib.quote(module),
                                    optrev(revnum), True, self.ctx)
            files = [n for n,e in entries.iteritems()
                     if e.kind == svn.core.svn_node_file]
            copies = {}

        files.sort()
        files = zip(files, [rev] * len(files))

        # caller caches the result, so free it here to release memory
        del self.paths[rev]
        return (files, copies)

    def getchangedfiles(self, rev, i):
        changes = self.getchanges(rev)
        self._changescache = (rev, changes)
        return [f[0] for f in changes[0]]

    def getcommit(self, rev):
        if rev not in self.commits:
            uuid, module, revnum = self.revsplit(rev)
            self.module = module
            self.reparent(module)
            # We assume that:
            # - requests for revisions after "stop" come from the
            # revision graph backward traversal. Cache all of them
            # down to stop, they will be used eventually.
            # - requests for revisions before "stop" come to get
            # isolated branches parents. Just fetch what is needed.
            stop = self.lastrevs.get(module, 0)
            if revnum < stop:
                stop = revnum + 1
            self._fetch_revisions(revnum, stop)
        commit = self.commits[rev]
        # caller caches the result, so free it here to release memory
        del self.commits[rev]
        return commit

    def gettags(self):
        tags = {}
        if self.tags is None:
            return tags

        # svn tags are just a convention, project branches left in a
        # 'tags' directory. There is no other relationship than
        # ancestry, which is expensive to discover and makes them hard
        # to update incrementally.  Worse, past revisions may be
        # referenced by tags far away in the future, requiring a deep
        # history traversal on every calculation.  Current code
        # performs a single backward traversal, tracking moves within
        # the tags directory (tag renaming) and recording a new tag
        # everytime a project is copied from outside the tags
        # directory. It also lists deleted tags, this behaviour may
        # change in the future.
        pendings = []
        tagspath = self.tags
        start = svn.ra.get_latest_revnum(self.ra)
        try:
            for entry in self._getlog([self.tags], start, self.startrev):
                origpaths, revnum, author, date, message = entry
                copies = [(e.copyfrom_path, e.copyfrom_rev, p) for p, e
                          in origpaths.iteritems() if e.copyfrom_path]
                # Apply moves/copies from more specific to general
                copies.sort(reverse=True)

                srctagspath = tagspath
                if copies and copies[-1][2] == tagspath:
                    # Track tags directory moves
                    srctagspath = copies.pop()[0]

                for source, sourcerev, dest in copies:
                    if not dest.startswith(tagspath + '/'):
                        continue
                    for tag in pendings:
                        if tag[0].startswith(dest):
                            tagpath = source + tag[0][len(dest):]
                            tag[:2] = [tagpath, sourcerev]
                            break
                    else:
                        pendings.append([source, sourcerev, dest])

                # Filter out tags with children coming from different
                # parts of the repository like:
                # /tags/tag.1 (from /trunk:10)
                # /tags/tag.1/foo (from /branches/foo:12)
                # Here/tags/tag.1 discarded as well as its children.
                # It happens with tools like cvs2svn. Such tags cannot
                # be represented in mercurial.
                addeds = dict((p, e.copyfrom_path) for p,e 
                              in origpaths.iteritems() if e.action == 'A')
                badroots = set()
                for destroot in addeds:
                    for source, sourcerev, dest in pendings:
                        if (not dest.startswith(destroot + '/')
                            or source.startswith(addeds[destroot] + '/')):
                            continue
                        badroots.add(destroot)
                        break

                for badroot in badroots:
                    pendings = [p for p in pendings if p[2] != badroot 
                                and not p[2].startswith(badroot + '/')]

                # Tell tag renamings from tag creations
                remainings = []
                for source, sourcerev, dest in pendings:
                    tagname = dest.split('/')[-1]
                    if source.startswith(srctagspath):
                        remainings.append([source, sourcerev, tagname])
                        continue
                    if tagname in tags:
                        # Keep the latest tag value
                        continue
                    # From revision may be fake, get one with changes                    
                    try:
                        tagid = self.latest(source, sourcerev)
                        if tagid and tagname not in tags:
                            tags[tagname] = tagid
                    except SvnPathNotFound:
                        # It happens when we are following directories we assumed
                        # were copied with their parents but were really created
                        # in the tag directory.
                        pass
                pendings = remainings
                tagspath = srctagspath

        except SubversionException:
            self.ui.note(_('no tags found at revision %d\n') % start)
        return tags

    def converted(self, rev, destrev):
        if not self.wc:
            return
        if self.convertfp is None:
            self.convertfp = open(os.path.join(self.wc, '.svn', 'hg-shamap'),
                                  'a')
        self.convertfp.write('%s %d\n' % (destrev, self.revnum(rev)))
        self.convertfp.flush()

    # -- helper functions --

    def revid(self, revnum, module=None):
        if not module:
            module = self.module
        return u"svn:%s%s@%s" % (self.uuid, module.decode(self.encoding),
                                 revnum)

    def revnum(self, rev):
        return int(rev.split('@')[-1])

    def revsplit(self, rev):
        url, revnum = rev.encode(self.encoding).rsplit('@', 1)
        revnum = int(revnum)
        parts = url.split('/', 1)
        uuid = parts.pop(0)[4:]
        mod = ''
        if parts:
            mod = '/' + parts[0]
        return uuid, mod, revnum

    def latest(self, path, stop=0):
        """Find the latest revid affecting path, up to stop. It may return
        a revision in a different module, since a branch may be moved without
        a change being reported. Return None if computed module does not
        belong to rootmodule subtree.
        """
        if not path.startswith(self.rootmodule):
            # Requests on foreign branches may be forbidden at server level
            self.ui.debug(_('ignoring foreign branch %r\n') % path)
            return None

        if not stop:
            stop = svn.ra.get_latest_revnum(self.ra)
        try:
            prevmodule = self.reparent('')
            dirent = svn.ra.stat(self.ra, path.strip('/'), stop)
            self.reparent(prevmodule)
        except SubversionException:
            dirent = None
        if not dirent:
            raise SvnPathNotFound(_('%s not found up to revision %d') % (path, stop))

        # stat() gives us the previous revision on this line of development, but
        # it might be in *another module*. Fetch the log and detect renames down
        # to the latest revision.
        stream = self._getlog([path], stop, dirent.created_rev)
        try:
            for entry in stream:
                paths, revnum, author, date, message = entry
                if revnum <= dirent.created_rev:
                    break

                for p in paths:
                    if not path.startswith(p) or not paths[p].copyfrom_path:
                        continue
                    newpath = paths[p].copyfrom_path + path[len(p):]
                    self.ui.debug(_("branch renamed from %s to %s at %d\n") %
                                  (path, newpath, revnum))
                    path = newpath
                    break
        finally:
            stream.close()

        if not path.startswith(self.rootmodule):
            self.ui.debug(_('ignoring foreign branch %r\n') % path)
            return None
        return self.revid(dirent.created_rev, path)

    def get_blacklist(self):
        """Avoid certain revision numbers.
        It is not uncommon for two nearby revisions to cancel each other
        out, e.g. 'I copied trunk into a subdirectory of itself instead
        of making a branch'. The converted repository is significantly
        smaller if we ignore such revisions."""
        self.blacklist = set()
        blacklist = self.blacklist
        for line in file("blacklist.txt", "r"):
            if not line.startswith("#"):
                try:
                    svn_rev = int(line.strip())
                    blacklist.add(svn_rev)
                except ValueError:
                    pass # not an integer or a comment

    def is_blacklisted(self, svn_rev):
        return svn_rev in self.blacklist

    def reparent(self, module):
        """Reparent the svn transport and return the previous parent."""
        if self.prevmodule == module:
            return module
        svnurl = self.baseurl + urllib.quote(module)
        prevmodule = self.prevmodule
        if prevmodule is None:
            prevmodule = ''
        self.ui.debug(_("reparent to %s\n") % svnurl)
        svn.ra.reparent(self.ra, svnurl)
        self.prevmodule = module
        return prevmodule

    def expandpaths(self, rev, paths, parents):
        entries = []
        copyfrom = {} # Map of entrypath, revision for finding source of deleted revisions.
        copies = {}

        new_module, revnum = self.revsplit(rev)[1:]
        if new_module != self.module:
            self.module = new_module
            self.reparent(self.module)

        for path, ent in paths:
            entrypath = self.getrelpath(path)
            entry = entrypath.decode(self.encoding)

            kind = self._checkpath(entrypath, revnum)
            if kind == svn.core.svn_node_file:
                entries.append(self.recode(entry))
                if not ent.copyfrom_path or not parents:
                    continue
                # Copy sources not in parent revisions cannot be represented,
                # ignore their origin for now
                pmodule, prevnum = self.revsplit(parents[0])[1:]
                if ent.copyfrom_rev < prevnum:
                    continue
                copyfrom_path = self.getrelpath(ent.copyfrom_path, pmodule)
                if not copyfrom_path:
                    continue
                self.ui.debug(_("copied to %s from %s@%s\n") %
                              (entrypath, copyfrom_path, ent.copyfrom_rev))
                copies[self.recode(entry)] = self.recode(copyfrom_path)
            elif kind == 0: # gone, but had better be a deleted *file*
                self.ui.debug(_("gone from %s\n") % ent.copyfrom_rev)

                # if a branch is created but entries are removed in the same
                # changeset, get the right fromrev
                # parents cannot be empty here, you cannot remove things from
                # a root revision.
                uuid, old_module, fromrev = self.revsplit(parents[0])

                basepath = old_module + "/" + self.getrelpath(path)
                entrypath = basepath

                def lookup_parts(p):
                    rc = None
                    parts = p.split("/")
                    for i in range(len(parts)):
                        part = "/".join(parts[:i])
                        info = part, copyfrom.get(part, None)
                        if info[1] is not None:
                            self.ui.debug(_("found parent directory %s\n") % info[1])
                            rc = info
                    return rc

                self.ui.debug(_("base, entry %s %s\n") % (basepath, entrypath))

                frompath, froment = lookup_parts(entrypath) or (None, revnum - 1)

                # need to remove fragment from lookup_parts and replace with copyfrom_path
                if frompath is not None:
                    self.ui.debug(_("munge-o-matic\n"))
                    self.ui.debug(entrypath + '\n')
                    self.ui.debug(entrypath[len(frompath):] + '\n')
                    entrypath = froment.copyfrom_path + entrypath[len(frompath):]
                    fromrev = froment.copyfrom_rev
                    self.ui.debug(_("info: %s %s %s %s\n") % (frompath, froment, ent, entrypath))

                # We can avoid the reparent calls if the module has not changed
                # but it probably does not worth the pain.
                prevmodule = self.reparent('')
                fromkind = svn.ra.check_path(self.ra, entrypath.strip('/'), fromrev)
                self.reparent(prevmodule)

                if fromkind == svn.core.svn_node_file:   # a deleted file
                    entries.append(self.recode(entry))
                elif fromkind == svn.core.svn_node_dir:
                    # print "Deleted/moved non-file:", revnum, path, ent
                    # children = self._find_children(path, revnum - 1)
                    # print "find children %s@%d from %d action %s" % (path, revnum, ent.copyfrom_rev, ent.action)
                    # Sometimes this is tricky. For example: in
                    # The Subversion Repository revision 6940 a dir
                    # was copied and one of its files was deleted
                    # from the new location in the same commit. This
                    # code can't deal with that yet.
                    if ent.action == 'C':
                        children = self._find_children(path, fromrev)
                    else:
                        oroot = entrypath.strip('/')
                        nroot = path.strip('/')
                        children = self._find_children(oroot, fromrev)
                        children = [s.replace(oroot,nroot) for s in children]
                    # Mark all [files, not directories] as deleted.
                    for child in children:
                        # Can we move a child directory and its
                        # parent in the same commit? (probably can). Could
                        # cause problems if instead of revnum -1,
                        # we have to look in (copyfrom_path, revnum - 1)
                        entrypath = self.getrelpath("/" + child, module=old_module)
                        if entrypath:
                            entry = self.recode(entrypath.decode(self.encoding))
                            if entry in copies:
                                # deleted file within a copy
                                del copies[entry]
                            else:
                                entries.append(entry)
                else:
                    self.ui.debug(_('unknown path in revision %d: %s\n') % \
                                  (revnum, path))
            elif kind == svn.core.svn_node_dir:
                # Should probably synthesize normal file entries
                # and handle as above to clean up copy/rename handling.

                # If the directory just had a prop change,
                # then we shouldn't need to look for its children.
                if ent.action == 'M':
                    continue

                # Also this could create duplicate entries. Not sure
                # whether this will matter. Maybe should make entries a set.
                # print "Changed directory", revnum, path, ent.action, ent.copyfrom_path, ent.copyfrom_rev
                # This will fail if a directory was copied
                # from another branch and then some of its files
                # were deleted in the same transaction.
                children = sorted(self._find_children(path, revnum))
                for child in children:
                    # Can we move a child directory and its
                    # parent in the same commit? (probably can). Could
                    # cause problems if instead of revnum -1,
                    # we have to look in (copyfrom_path, revnum - 1)
                    entrypath = self.getrelpath("/" + child)
                    # print child, self.module, entrypath
                    if entrypath:
                        # Need to filter out directories here...
                        kind = self._checkpath(entrypath, revnum)
                        if kind != svn.core.svn_node_dir:
                            entries.append(self.recode(entrypath))

                # Copies here (must copy all from source)
                # Probably not a real problem for us if
                # source does not exist
                if not ent.copyfrom_path or not parents:
                    continue
                # Copy sources not in parent revisions cannot be represented,
                # ignore their origin for now
                pmodule, prevnum = self.revsplit(parents[0])[1:]
                if ent.copyfrom_rev < prevnum:
                    continue
                copyfrompath = ent.copyfrom_path.decode(self.encoding)
                copyfrompath = self.getrelpath(copyfrompath, pmodule)
                if not copyfrompath:
                    continue
                copyfrom[path] = ent
                self.ui.debug(_("mark %s came from %s:%d\n")
                              % (path, copyfrompath, ent.copyfrom_rev))
                children = self._find_children(ent.copyfrom_path, ent.copyfrom_rev)
                children.sort()
                for child in children:
                    entrypath = self.getrelpath("/" + child, pmodule)
                    if not entrypath:
                        continue
                    entry = entrypath.decode(self.encoding)
                    copytopath = path + entry[len(copyfrompath):]
                    copytopath = self.getrelpath(copytopath)
                    copies[self.recode(copytopath)] = self.recode(entry, pmodule)

        return (list(set(entries)), copies)

    def _fetch_revisions(self, from_revnum, to_revnum):
        if from_revnum < to_revnum:
            from_revnum, to_revnum = to_revnum, from_revnum

        self.child_cset = None

        def parselogentry(orig_paths, revnum, author, date, message):
            """Return the parsed commit object or None, and True if
            the revision is a branch root.
            """
            self.ui.debug(_("parsing revision %d (%d changes)\n") %
                          (revnum, len(orig_paths)))

            branched = False
            rev = self.revid(revnum)
            # branch log might return entries for a parent we already have

            if rev in self.commits or revnum < to_revnum:
                return None, branched

            parents = []
            # check whether this revision is the start of a branch or part
            # of a branch renaming
            orig_paths = sorted(orig_paths.iteritems())
            root_paths = [(p,e) for p,e in orig_paths if self.module.startswith(p)]
            if root_paths:
                path, ent = root_paths[-1]
                if ent.copyfrom_path:
                    branched = True
                    newpath = ent.copyfrom_path + self.module[len(path):]
                    # ent.copyfrom_rev may not be the actual last revision
                    previd = self.latest(newpath, ent.copyfrom_rev)
                    if previd is not None:
                        prevmodule, prevnum = self.revsplit(previd)[1:]
                        if prevnum >= self.startrev:
                            parents = [previd]
                            self.ui.note(_('found parent of branch %s at %d: %s\n') %
                                         (self.module, prevnum, prevmodule))
                else:
                    self.ui.debug(_("no copyfrom path, don't know what to do.\n"))

            paths = []
            # filter out unrelated paths
            for path, ent in orig_paths:
                if self.getrelpath(path) is None:
                    continue
                paths.append((path, ent))

            # Example SVN datetime. Includes microseconds.
            # ISO-8601 conformant
            # '2007-01-04T17:35:00.902377Z'
            date = util.parsedate(date[:19] + " UTC", ["%Y-%m-%dT%H:%M:%S"])

            log = message and self.recode(message) or ''
            author = author and self.recode(author) or ''
            try:
                branch = self.module.split("/")[-1]
                if branch == 'trunk':
                    branch = ''
            except IndexError:
                branch = None

            cset = commit(author=author,
                          date=util.datestr(date),
                          desc=log,
                          parents=parents,
                          branch=branch,
                          rev=rev.encode('utf-8'))

            self.commits[rev] = cset
            # The parents list is *shared* among self.paths and the
            # commit object. Both will be updated below.
            self.paths[rev] = (paths, cset.parents)
            if self.child_cset and not self.child_cset.parents:
                self.child_cset.parents[:] = [rev]
            self.child_cset = cset
            return cset, branched

        self.ui.note(_('fetching revision log for "%s" from %d to %d\n') %
                     (self.module, from_revnum, to_revnum))

        try:
            firstcset = None
            lastonbranch = False
            stream = self._getlog([self.module], from_revnum, to_revnum)
            try:
                for entry in stream:
                    paths, revnum, author, date, message = entry
                    if revnum < self.startrev:
                        lastonbranch = True
                        break
                    if self.is_blacklisted(revnum):
                        self.ui.note(_('skipping blacklisted revision %d\n')
                                     % revnum)
                        continue
                    if not paths:
                        self.ui.debug(_('revision %d has no entries\n') % revnum)
                        continue
                    cset, lastonbranch = parselogentry(paths, revnum, author,
                                                       date, message)
                    if cset:
                        firstcset = cset
                    if lastonbranch:
                        break
            finally:
                stream.close()

            if not lastonbranch and firstcset and not firstcset.parents:
                # The first revision of the sequence (the last fetched one)
                # has invalid parents if not a branch root. Find the parent
                # revision now, if any.
                try:
                    firstrevnum = self.revnum(firstcset.rev)
                    if firstrevnum > 1:
                        latest = self.latest(self.module, firstrevnum - 1)
                        if latest:
                            firstcset.parents.append(latest)
                except SvnPathNotFound:
                    pass
        except SubversionException, (inst, num):
            if num == svn.core.SVN_ERR_FS_NO_SUCH_REVISION:
                raise util.Abort(_('svn: branch has no revision %s') % to_revnum)
            raise

    def _getfile(self, file, rev):
        # TODO: ra.get_file transmits the whole file instead of diffs.
        mode = ''
        try:
            new_module, revnum = self.revsplit(rev)[1:]
            if self.module != new_module:
                self.module = new_module
                self.reparent(self.module)
            io = StringIO()
            info = svn.ra.get_file(self.ra, file, revnum, io)
            data = io.getvalue()
            # ra.get_files() seems to keep a reference on the input buffer
            # preventing collection. Release it explicitely.
            io.close()
            if isinstance(info, list):
                info = info[-1]
            mode = ("svn:executable" in info) and 'x' or ''
            mode = ("svn:special" in info) and 'l' or mode
        except SubversionException, e:
            notfound = (svn.core.SVN_ERR_FS_NOT_FOUND,
                svn.core.SVN_ERR_RA_DAV_PATH_NOT_FOUND)
            if e.apr_err in notfound: # File not found
                raise IOError()
            raise
        if mode == 'l':
            link_prefix = "link "
            if data.startswith(link_prefix):
                data = data[len(link_prefix):]
        return data, mode

    def _find_children(self, path, revnum):
        path = path.strip('/')
        pool = Pool()
        rpath = '/'.join([self.baseurl, urllib.quote(path)]).strip('/')
        return ['%s/%s' % (path, x) for x in
                svn.client.ls(rpath, optrev(revnum), True, self.ctx, pool).keys()]

    def getrelpath(self, path, module=None):
        if module is None:
            module = self.module
        # Given the repository url of this wc, say
        #   "http://server/plone/CMFPlone/branches/Plone-2_0-branch"
        # extract the "entry" portion (a relative path) from what
        # svn log --xml says, ie
        #   "/CMFPlone/branches/Plone-2_0-branch/tests/PloneTestCase.py"
        # that is to say "tests/PloneTestCase.py"
        if path.startswith(module):
            relative = path.rstrip('/')[len(module):]
            if relative.startswith('/'):
                return relative[1:]
            elif relative == '':
                return relative

        # The path is outside our tracked tree...
        self.ui.debug(_('%r is not under %r, ignoring\n') % (path, module))
        return None

    def _checkpath(self, path, revnum):
        # ra.check_path does not like leading slashes very much, it leads
        # to PROPFIND subversion errors
        return svn.ra.check_path(self.ra, path.strip('/'), revnum)

    def _getlog(self, paths, start, end, limit=0, discover_changed_paths=True,
                strict_node_history=False):
        # Normalize path names, svn >= 1.5 only wants paths relative to
        # supplied URL
        relpaths = []
        for p in paths:
            if not p.startswith('/'):
                p = self.module + '/' + p
            relpaths.append(p.strip('/'))
        args = [self.baseurl, relpaths, start, end, limit, discover_changed_paths,
                strict_node_history]
        arg = encodeargs(args)
        hgexe = util.hgexecutable()
        cmd = '%s debugsvnlog' % util.shellquote(hgexe)
        stdin, stdout = util.popen2(cmd, 'b')
        stdin.write(arg)
        stdin.close()
        return logstream(stdout)

pre_revprop_change = '''#!/bin/sh

REPOS="$1"
REV="$2"
USER="$3"
PROPNAME="$4"
ACTION="$5"

if [ "$ACTION" = "M" -a "$PROPNAME" = "svn:log" ]; then exit 0; fi
if [ "$ACTION" = "A" -a "$PROPNAME" = "hg:convert-branch" ]; then exit 0; fi
if [ "$ACTION" = "A" -a "$PROPNAME" = "hg:convert-rev" ]; then exit 0; fi

echo "Changing prohibited revision property" >&2
exit 1
'''

class svn_sink(converter_sink, commandline):
    commit_re = re.compile(r'Committed revision (\d+).', re.M)

    def prerun(self):
        if self.wc:
            os.chdir(self.wc)

    def postrun(self):
        if self.wc:
            os.chdir(self.cwd)

    def join(self, name):
        return os.path.join(self.wc, '.svn', name)

    def revmapfile(self):
        return self.join('hg-shamap')

    def authorfile(self):
        return self.join('hg-authormap')

    def __init__(self, ui, path):
        converter_sink.__init__(self, ui, path)
        commandline.__init__(self, ui, 'svn')
        self.delete = []
        self.setexec = []
        self.delexec = []
        self.copies = []
        self.wc = None
        self.cwd = os.getcwd()

        path = os.path.realpath(path)

        created = False
        if os.path.isfile(os.path.join(path, '.svn', 'entries')):
            self.wc = path
            self.run0('update')
        else:
            wcpath = os.path.join(os.getcwd(), os.path.basename(path) + '-wc')

            if os.path.isdir(os.path.dirname(path)):
                if not os.path.exists(os.path.join(path, 'db', 'fs-type')):
                    ui.status(_('initializing svn repo %r\n') %
                              os.path.basename(path))
                    commandline(ui, 'svnadmin').run0('create', path)
                    created = path
                path = util.normpath(path)
                if not path.startswith('/'):
                    path = '/' + path
                path = 'file://' + path

            ui.status(_('initializing svn wc %r\n') % os.path.basename(wcpath))
            self.run0('checkout', path, wcpath)

            self.wc = wcpath
        self.opener = util.opener(self.wc)
        self.wopener = util.opener(self.wc)
        self.childmap = mapfile(ui, self.join('hg-childmap'))
        self.is_exec = util.checkexec(self.wc) and util.is_exec or None

        if created:
            hook = os.path.join(created, 'hooks', 'pre-revprop-change')
            fp = open(hook, 'w')
            fp.write(pre_revprop_change)
            fp.close()
            util.set_flags(hook, False, True)

        xport = transport.SvnRaTransport(url=geturl(path))
        self.uuid = svn.ra.get_uuid(xport.ra)

    def wjoin(self, *names):
        return os.path.join(self.wc, *names)

    def putfile(self, filename, flags, data):
        if 'l' in flags:
            self.wopener.symlink(data, filename)
        else:
            try:
                if os.path.islink(self.wjoin(filename)):
                    os.unlink(filename)
            except OSError:
                pass
            self.wopener(filename, 'w').write(data)

            if self.is_exec:
                was_exec = self.is_exec(self.wjoin(filename))
            else:
                # On filesystems not supporting execute-bit, there is no way
                # to know if it is set but asking subversion. Setting it
                # systematically is just as expensive and much simpler.
                was_exec = 'x' not in flags

            util.set_flags(self.wjoin(filename), False, 'x' in flags)
            if was_exec:
                if 'x' not in flags:
                    self.delexec.append(filename)
            else:
                if 'x' in flags:
                    self.setexec.append(filename)

    def _copyfile(self, source, dest):
        # SVN's copy command pukes if the destination file exists, but
        # our copyfile method expects to record a copy that has
        # already occurred.  Cross the semantic gap.
        wdest = self.wjoin(dest)
        exists = os.path.exists(wdest)
        if exists:
            fd, tempname = tempfile.mkstemp(
                prefix='hg-copy-', dir=os.path.dirname(wdest))
            os.close(fd)
            os.unlink(tempname)
            os.rename(wdest, tempname)
        try:
            self.run0('copy', source, dest)
        finally:
            if exists:
                try:
                    os.unlink(wdest)
                except OSError:
                    pass
                os.rename(tempname, wdest)

    def dirs_of(self, files):
        dirs = set()
        for f in files:
            if os.path.isdir(self.wjoin(f)):
                dirs.add(f)
            for i in strutil.rfindall(f, '/'):
                dirs.add(f[:i])
        return dirs

    def add_dirs(self, files):
        add_dirs = [d for d in sorted(self.dirs_of(files))
                    if not os.path.exists(self.wjoin(d, '.svn', 'entries'))]
        if add_dirs:
            self.xargs(add_dirs, 'add', non_recursive=True, quiet=True)
        return add_dirs

    def add_files(self, files):
        if files:
            self.xargs(files, 'add', quiet=True)
        return files

    def tidy_dirs(self, names):
        deleted = []
        for d in sorted(self.dirs_of(names), reverse=True):
            wd = self.wjoin(d)
            if os.listdir(wd) == '.svn':
                self.run0('delete', d)
                deleted.append(d)
        return deleted

    def addchild(self, parent, child):
        self.childmap[parent] = child

    def revid(self, rev):
        return u"svn:%s@%s" % (self.uuid, rev)

    def putcommit(self, files, copies, parents, commit, source):
        # Apply changes to working copy
        for f, v in files:
            try:
                data = source.getfile(f, v)
            except IOError:
                self.delete.append(f)
            else:
                e = source.getmode(f, v)
                self.putfile(f, e, data)
                if f in copies:
                    self.copies.append([copies[f], f])
        files = [f[0] for f in files]

        for parent in parents:
            try:
                return self.revid(self.childmap[parent])
            except KeyError:
                pass
        entries = set(self.delete)
        files = frozenset(files)
        entries.update(self.add_dirs(files.difference(entries)))
        if self.copies:
            for s, d in self.copies:
                self._copyfile(s, d)
            self.copies = []
        if self.delete:
            self.xargs(self.delete, 'delete')
            self.delete = []
        entries.update(self.add_files(files.difference(entries)))
        entries.update(self.tidy_dirs(entries))
        if self.delexec:
            self.xargs(self.delexec, 'propdel', 'svn:executable')
            self.delexec = []
        if self.setexec:
            self.xargs(self.setexec, 'propset', 'svn:executable', '*')
            self.setexec = []

        fd, messagefile = tempfile.mkstemp(prefix='hg-convert-')
        fp = os.fdopen(fd, 'w')
        fp.write(commit.desc)
        fp.close()
        try:
            output = self.run0('commit',
                               username=util.shortuser(commit.author),
                               file=messagefile,
                               encoding='utf-8')
            try:
                rev = self.commit_re.search(output).group(1)
            except AttributeError:
                self.ui.warn(_('unexpected svn output:\n'))
                self.ui.warn(output)
                raise util.Abort(_('unable to cope with svn output'))
            if commit.rev:
                self.run('propset', 'hg:convert-rev', commit.rev,
                         revprop=True, revision=rev)
            if commit.branch and commit.branch != 'default':
                self.run('propset', 'hg:convert-branch', commit.branch,
                         revprop=True, revision=rev)
            for parent in parents:
                self.addchild(parent, rev)
            return self.revid(rev)
        finally:
            os.unlink(messagefile)

    def puttags(self, tags):
        self.ui.warn(_('XXX TAGS NOT IMPLEMENTED YET\n'))