convert: cvsps.py - code to generate changesets from a CVS repository
author Frank Kingswood <frank@kingswood-consulting.co.uk>
Sun, 15 Jun 2008 15:59:27 +0100
changeset 6687 f8ef39206f6a
parent 6686 bb1575f74f27
child 6688 5cd7a8433cd4
convert: cvsps.py - code to generate changesets from a CVS repository
hgext/convert/cvsps.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgext/convert/cvsps.py	Sun Jun 15 15:59:27 2008 +0100
@@ -0,0 +1,547 @@
+#
+# Mercurial built-in replacement for cvsps.
+#
+# Copyright 2008, Frank Kingswood <frank@kingswood-consulting.co.uk>
+#
+# This software may be used and distributed according to the terms
+# of the GNU General Public License, incorporated herein by reference.
+
+import os
+import re
+import sys
+import cPickle as pickle
+from mercurial import util
+from mercurial.i18n import _
+
+def listsort(list, key):
+    "helper to sort by key in Python 2.3"
+    try:
+        list.sort(key=key)
+    except TypeError:
+        list.sort(lambda l, r:cmp(key(l), key(r)))
+
+class logentry(object):
+    '''Class logentry has the following attributes:
+        .author    - author name as CVS knows it
+        .branch    - name of branch this revision is on
+        .branches  - revision tuple of branches starting at this revision
+        .comment   - commit message
+        .date      - the commit date as a (time, tz) tuple
+        .dead      - true if file revision is dead
+        .file      - name of the file
+        .lines     - a tuple (+lines, -lines) or None
+        .parent    - previous revision of this entry
+        .rcs       - name of file as returned from CVS
+        .revision  - revision number as tuple
+        .tags      - list of tags on the file
+    '''
+    def __init__(self, **entries):
+        self.__dict__.update(entries)
+
+class logerror(Exception):
+    pass
+
+def createlog(ui, directory=None, root="", rlog=True, cache=None):
+    '''Collect the CVS rlog'''
+
+    # Because we store many duplicate commit log messages, reusing strings
+    # saves a lot of memory and pickle storage space.
+    _scache = {}
+    def scache(s):
+        "return a shared version of a string"
+        return _scache.setdefault(s, s)
+
+    ui.status(_('collecting CVS rlog\n'))
+
+    log = []      # list of logentry objects containing the CVS state
+
+    # patterns to match in CVS (r)log output, by state of use
+    re_00 = re.compile('RCS file: (.+)$')
+    re_01 = re.compile('cvs \\[r?log aborted\\]: (.+)$')
+    re_02 = re.compile('cvs (r?log|server): (.+)$')
+    re_03 = re.compile("(Cannot access.+CVSROOT)|(can't create temporary directory.+)$")
+    re_10 = re.compile('Working file: (.+)$')
+    re_20 = re.compile('symbolic names:')
+    re_30 = re.compile('\t(.+): ([\\d.]+)$')
+    re_31 = re.compile('----------------------------$')
+    re_32 = re.compile('=============================================================================$')
+    re_50 = re.compile('revision ([\\d.]+)(\s+locked by:\s+.+;)?$')
+    re_60 = re.compile(r'date:\s+(.+);\s+author:\s+(.+);\s+state:\s+(.+?);(\s+lines:\s+(\+\d+)?\s+(-\d+)?;)?')
+    re_70 = re.compile('branches: (.+);$')
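+
+    # An abridged rlog record, as parsed by the patterns above, looks
+    # roughly like this (header fields omitted, names illustrative):
+    #
+    #   RCS file: /cvsroot/proj/src/main.c,v
+    #   symbolic names:
+    #           RELEASE_1_0: 1.2
+    #   ----------------------------
+    #   revision 1.2
+    #   date: 2008/06/15 14:59:27;  author: frank;  state: Exp;  lines: +1 -0
+    #   commit message
+    #   =============================================================================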
+
+    prefix = ''   # leading path to strip off what we get from CVS
+
+    if directory is None:
+        # Current working directory
+
+        # Get the real directory in the repository
+        try:
+            prefix = file(os.path.join('CVS','Repository')).read().strip()
+            if prefix == ".":
+                prefix=""
+            directory = prefix
+        except IOError:
+            raise logerror('Not a CVS sandbox')
+
+        if prefix and not prefix.endswith('/'):
+            prefix+='/'
+
+        # Use the Root file in the sandbox, if it exists
+        try:
+            root = file(os.path.join('CVS','Root')).read().strip()
+        except IOError:
+            pass
+
+    if not root:
+        root = os.environ.get('CVSROOT', '')
+
+    # read log cache if one exists
+    oldlog = []
+    date = None
+
+    if cache:
+        cachedir = os.path.expanduser('~/.hg.cvsps')
+        if not os.path.exists(cachedir):
+            os.mkdir(cachedir)
+
+        # The cvsps cache pickle needs a unique name, based on the
+        # repository location. The address may have all sorts of nasties
+        # in it: slashes, colons and such. So here we take just the
+        # alphanumerics, concatenated in a way that does not mix up the
+        # various components, so that
+        #    :pserver:user@server:/path
+        # and
+        #    /pserver/user/server/path
+        # are mapped to different cache file names.
+        cachefile = root.split(":")+[directory, "cache"]
+        cachefile = ['-'.join(re.findall(r'\w+', s)) for s in cachefile if s]
+        cachefile = os.path.join(cachedir, '.'.join([s for s in cachefile if s]))
+
+    if cache == 'update':
+        try:
+            ui.note(_('reading cvs log cache %s\n') % cachefile)
+            oldlog = pickle.load(file(cachefile))
+            ui.note(_('cache has %d log entries\n') % len(oldlog))
+        except Exception, e:
+            ui.note(_('error reading cache: %r\n') % e)
+
+        if oldlog:
+            date = oldlog[-1].date    # last commit date as a (time,tz) tuple
+            date = util.datestr(date, '%Y/%m/%d %H:%M:%S %1%2')
+
+    # build the CVS commandline
+    cmd = ['cvs', '-q']
+    if root:
+        cmd.append('-d%s' % root)
+        p = root.split(':')[-1]
+        if not p.endswith('/'):
+            p+='/'
+        prefix = p+prefix
+    cmd.append(['log', 'rlog'][rlog])
+    if date:
+        # no space between option and date string
+        cmd.append('-d>%s' % date)
+    cmd.append(directory)
+
+    # state machine begins here
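+    #
+    # The parser states are:
+    #   0: start of a file's log, expecting 'RCS file'
+    #   1: expecting 'Working file' (cvs log only, not rlog)
+    #   2: expecting 'symbolic names'
+    #   3: reading the symbolic names (tags)
+    #   4: expecting the '-----' separator before the first revision
+    #   5: expecting a revision number
+    #   6: expecting the date/author/state/lines line
+    #   7: reading branches started here and/or commit message lines
+    #   8: reading commit message lines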
+    tags = {}     # dictionary of revisions on current file with their tags
+    state = 0
+    store = False # set when a new record can be appended
+
+    cmd = [util.shellquote(arg) for arg in cmd]
+    ui.note("running %s\n" % (' '.join(cmd)))
+    ui.debug("prefix=%r directory=%r root=%r\n" % (prefix, directory, root))
+
+    for line in util.popen(' '.join(cmd)):
+        if line.endswith('\n'):
+            line = line[:-1]
+        #ui.debug('state=%d line=%r\n' % (state, line))
+
+        if state == 0:
+            # initial state, consume input until we see 'RCS file'
+            match = re_00.match(line)
+            if match:
+                rcs = match.group(1)
+                tags = {}
+                if rlog:
+                    filename = rcs[:-2]
+                    if filename.startswith(prefix):
+                        filename = filename[len(prefix):]
+                    if filename.startswith('/'):
+                        filename = filename[1:]
+                    if filename.startswith('Attic/'):
+                        filename = filename[6:]
+                    else:
+                        filename = filename.replace('/Attic/', '/')
+                    state = 2
+                    continue
+                state = 1
+                continue
+            match = re_01.match(line)
+            if match:
+                raise Exception(match.group(1))
+            match = re_02.match(line)
+            if match:
+                raise Exception(match.group(2))
+            if re_03.match(line):
+                raise Exception(line)
+
+        elif state == 1:
+            # expect 'Working file' (only when using log instead of rlog)
+            match = re_10.match(line)
+            assert match, _('RCS file must be followed by working file')
+            filename = match.group(1)
+            state = 2
+
+        elif state == 2:
+            # expect 'symbolic names'
+            if re_20.match(line):
+                state = 3
+
+        elif state == 3:
+            # read the symbolic names and store as tags
+            match = re_30.match(line)
+            if match:
+                rev = [int(x) for x in match.group(2).split('.')]
+
+                # Convert magic branch number to an odd-numbered one
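+                # (e.g. the magic branch number 1.2.0.4 in the symbolic
+                # names denotes branch 1.2.4, which sprouts from 1.2)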
+                revn = len(rev)
+                if revn>3 and (revn%2) == 0 and rev[-2] == 0:
+                    rev = rev[:-2]+rev[-1:]
+                rev = tuple(rev)
+
+                if rev not in tags:
+                    tags[rev] = []
+                tags[rev].append(match.group(1))
+
+            elif re_31.match(line):
+                state = 5
+            elif re_32.match(line):
+                state = 0
+
+        elif state == 4:
+            # expecting '------' separator before first revision
+            if re_31.match(line):
+                state = 5
+            else:
+                assert not re_32.match(line), _('Must have at least some revisions')
+
+        elif state == 5:
+            # expecting revision number and possibly (ignored) lock indication
+            # we create the logentry here from values stored in states 0 to 4,
+            # as this state is re-entered for subsequent revisions of a file.
+            match = re_50.match(line)
+            assert match, _('expected revision number')
+            e = logentry(rcs=scache(rcs), file=scache(filename),
+                    revision=tuple([int(x) for x in match.group(1).split('.')]),
+                    branches=[], parent=None)
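+            # the remaining fields (date, author, dead, lines, comment)
+            # are filled in by later states; tags and branch at store
+            # time, parent in a final pass over the whole log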
+            state = 6
+
+        elif state == 6:
+            # expecting date, author, state, lines changed
+            match = re_60.match(line)
+            assert match, _('revision must be followed by date line')
+            d = match.group(1)
+            if d[2] == '/':
+                # Y2K
+                d = '19'+d
+
+            if len(d.split()) != 3:
+                # cvs log dates always in GMT
+                d = d+' UTC'
+            e.date = util.parsedate(d, ['%y/%m/%d %H:%M:%S', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'])
+            e.author = scache(match.group(2))
+            e.dead = match.group(3).lower() == 'dead'
+
+            if match.group(5):
+                if match.group(6):
+                    e.lines = (int(match.group(5)), int(match.group(6)))
+                else:
+                    e.lines = (int(match.group(5)), 0)
+            elif match.group(6):
+                e.lines = (0, int(match.group(6)))
+            else:
+                e.lines = None
+            e.comment = []
+            state = 7
+
+        elif state == 7:
+            # read the revision numbers of branches that start at this
+            # revision; otherwise store the commit log message
+            m = re_70.match(line)
+            if m:
+                e.branches = [tuple([int(y) for y in x.strip().split('.')])
+                                for x in m.group(1).split(';')]
+                state = 8
+            elif re_31.match(line):
+                state = 5
+                store = True
+            elif re_32.match(line):
+                state = 0
+                store = True
+            else:
+                e.comment.append(line)
+
+        elif state == 8:
+            # store commit log message
+            if re_31.match(line):
+                state = 5
+                store = True
+            elif re_32.match(line):
+                state = 0
+                store = True
+            else:
+                e.comment.append(line)
+
+        if store:
+            # clean up the results and save them in the log.
+            store = False
+            e.tags = [scache(x) for x in tags.get(e.revision, [])]
+            e.tags.sort()
+            e.comment = scache('\n'.join(e.comment))
+
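+            # a revision with an even number of components (e.g. 1.2.4.5)
+            # lives on a branch (here 1.2.4); look the branch name up
+            # among the tags collected from the symbolic names section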
+            revn = len(e.revision)
+            if revn>3 and (revn%2) == 0:
+                e.branch = tags.get(e.revision[:-1], [None])[0]
+            else:
+                e.branch = None
+
+            log.append(e)
+
+            if len(log)%100 == 0:
+                ui.status(util.ellipsis('%d %s'%(len(log), e.file), 80)+'\n')
+
+    listsort(log, key=lambda x:(x.rcs, x.revision))
+
+    # find parent revisions of individual files
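+    # (within each branch of each file the parent of a revision is the
+    # previous revision seen on that branch; the first revision on a
+    # branch descends from the branch point, e.g. 1.2.4.1 from 1.2)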
+    versions = {}
+    for e in log:
+        branch = e.revision[:-1]
+        p = versions.get((e.rcs, branch), None)
+        if p is None:
+            p = e.revision[:-2]
+        e.parent = p
+        versions[(e.rcs, branch)] = e.revision
+
+    # update the log cache
+    if cache:
+        if log:
+            # join up the old and new logs
+            listsort(log, key=lambda x:x.date)
+
+            if oldlog and oldlog[-1].date >= log[0].date:
+                raise logerror('Log cache overlaps with new log entries, re-run without cache.')
+
+            log = oldlog+log
+
+            # write the new cachefile
+            ui.note(_('writing cvs log cache %s\n') % cachefile)
+            pickle.dump(log, file(cachefile, 'w'))
+        else:
+            log = oldlog
+
+    ui.status(_('%d log entries\n') % len(log))
+
+    return log
+
+
+class changeset(object):
+    '''Class changeset has the following attributes:
+        .author    - author name as CVS knows it
+        .branch    - name of branch this changeset is on, or None
+        .comment   - commit message
+        .date      - the commit date as a (time,tz) tuple
+        .entries   - list of logentry objects in this changeset
+        .parents   - list of one or two parent changesets
+        .tags      - list of tags on this changeset
+    '''
+    def __init__(self, **entries):
+        self.__dict__.update(entries)
+
+def createchangeset(ui, log, fuzz=60, mergefrom=None, mergeto=None):
+    '''Convert log into changesets.'''
+
+    ui.status(_('creating changesets\n'))
+
+    # Merge changesets
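+    #
+    # Sorting groups entries that share a comment, author and branch in
+    # date order, so the loop below only needs to merge runs of
+    # consecutive entries that fall within the fuzz window.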
+
+    listsort(log, key=lambda x:(x.comment, x.author, x.branch, x.date))
+
+    changesets = []
+    files = {}
+    c = None
+    for i, e in enumerate(log):
+
+        # Check if log entry belongs to the current changeset or not.
+        if not (c and
+                  e.comment == c.comment and
+                  e.author == c.author and
+                  e.branch == c.branch and
+                  (c.date[0]+c.date[1]) <= (e.date[0]+e.date[1]) <= (c.date[0]+c.date[1])+fuzz and
+                  e.file not in files):
+            c = changeset(comment=e.comment, author=e.author,
+                          branch=e.branch, date=e.date, entries=[])
+            changesets.append(c)
+            files = {}
+            if len(changesets)%100 == 0:
+                ui.status(util.ellipsis('%d %s'%(len(changesets), repr(e.comment)[1:-1]), 80)+'\n')
+
+        e.Changeset = c
+        c.entries.append(e)
+        files[e.file] = True
+        c.date = e.date       # changeset date is date of latest commit in it
+
+    # Sort files in each changeset
+
+    for c in changesets:
+        def pathcompare(l, r):
+            'Mimic cvsps sorting order'
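+            # files in a directory sort before anything in its
+            # subdirectories; otherwise path components compare
+            # alphabetically, one component at a time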
+            l = l.split('/')
+            r = r.split('/')
+            nl = len(l)
+            nr = len(r)
+            n = min(nl, nr)
+            for i in range(n):
+                if i+1 == nl and nl<nr:
+                    return -1
+                elif i+1 == nr and nl>nr:
+                    return +1
+                elif l[i]<r[i]:
+                    return -1
+                elif l[i]>r[i]:
+                    return +1
+            return 0
+        def entitycompare(l, r):
+            return pathcompare(l.file, r.file)
+
+        c.entries.sort(entitycompare)
+
+    # Sort changesets by date
+
+    def cscmp(l, r):
+        d = sum(l.date)-sum(r.date)
+        if d:
+            return d
+
+        # detect vendor branches and initial commits on a branch
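+        # (if a file revision in one changeset is the parent of a file
+        # revision in the other, the child changeset sorts last even
+        # though the dates are equal)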
+        le = {}
+        for e in l.entries:
+            le[e.rcs] = e.revision
+        re = {}
+        for e in r.entries:
+            re[e.rcs] = e.revision
+
+        d = 0
+        for e in l.entries:
+            if re.get(e.rcs, None) == e.parent:
+                assert not d
+                d = 1
+                break
+
+        for e in r.entries:
+            if le.get(e.rcs, None) == e.parent:
+                assert not d
+                d = -1
+                break
+
+        return d
+
+    changesets.sort(cscmp)
+
+    # Collect tags
+
+    globaltags = {}
+    for c in changesets:
+        for e in c.entries:
+            for tag in e.tags:
+                # remember which is the latest changeset to have this tag
+                globaltags[tag] = c
+
+    for c in changesets:
+        tags = {}
+        for e in c.entries:
+            for tag in e.tags:
+                tags[tag] = True
+        # remember tags only if this is the latest changeset to have them
+        tagnames = [tag for tag in tags if globaltags[tag] is c]
+        tagnames.sort()
+        c.tags = tagnames
+
+    # Find parent changesets, handle {{mergetobranch BRANCHNAME}}
+    # by inserting dummy changesets with two parents, and handle
+    # {{mergefrombranch BRANCHNAME}} by setting two parents.
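+    #
+    # For example, a commit whose message contains
+    #    {{mergetobranch RELEASE_1_0}}
+    # gets a dummy child changeset merging it into branch RELEASE_1_0.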
+
+    if mergeto is None:
+        mergeto = r'{{mergetobranch ([-\w]+)}}'
+    if mergeto:
+        mergeto = re.compile(mergeto)
+
+    if mergefrom is None:
+        mergefrom = r'{{mergefrombranch ([-\w]+)}}'
+    if mergefrom:
+        mergefrom = re.compile(mergefrom)
+
+    versions = {}    # changeset index where we saw any particular file version
+    branches = {}    # changeset index where we saw a branch
+    n = len(changesets)
+    i = 0
+    while i<n:
+        c = changesets[i]
+
+        for f in c.entries:
+            versions[(f.rcs, f.revision)] = i
+
+        p = None
+        if c.branch in branches:
+            p = branches[c.branch]
+        else:
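+            # first changeset on this branch: take as parent the newest
+            # changeset that contains a parent revision of any of its files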
+            for f in c.entries:
+                p = max(p, versions.get((f.rcs, f.parent), None))
+
+        c.parents = []
+        if p is not None:
+            c.parents.append(changesets[p])
+
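+        # a {{mergefrombranch B}} marker in the commit message makes the
+        # latest changeset seen on branch B a second parent of this one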
+        if mergefrom:
+            m = mergefrom.search(c.comment)
+            if m:
+                m = m.group(1)
+                if m == 'HEAD':
+                    m = None
+                if m in branches and c.branch != m:
+                    c.parents.append(changesets[branches[m]])
+
+        if mergeto:
+            m = mergeto.search(c.comment)
+            if m:
+                try:
+                    m = m.group(1)
+                    if m == 'HEAD':
+                        m = None
+                except IndexError:
+                    m = None   # if no group was found then merge to HEAD
+                if m in branches and c.branch != m:
+                    # insert empty changeset for merge
+                    cc = changeset(author=c.author, branch=m, date=c.date,
+                            comment='convert-repo: CVS merge from branch %s' % c.branch,
+                            entries=[], tags=[], parents=[changesets[branches[m]], c])
+                    changesets.insert(i+1, cc)
+                    branches[m] = i+1
+
+                    # adjust our loop counters now we have inserted a new entry
+                    n += 1
+                    i += 2
+                    continue
+
+        branches[c.branch] = i
+        i += 1
+
+    # Number changesets
+
+    for i, c in enumerate(changesets):
+        c.id = i+1
+
+    ui.status(_('%d changeset entries\n') % len(changesets))
+
+    return changesets