hgext/convert/cvsps.py @ 6687:f8ef39206f6a

convert: cvsps.py - code to generate changesets from a CVS repository
author Frank Kingswood <frank@kingswood-consulting.co.uk>
date Sun, 15 Jun 2008 15:59:27 +0100
parents
children 5cd7a8433cd4
#
# Mercurial built-in replacement for cvsps.
#
# Copyright 2008, Frank Kingswood <frank@kingswood-consulting.co.uk>
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

import os
import re
import sys
import cPickle as pickle
from mercurial import util
from mercurial.i18n import _

def listsort(list, key):
    "helper to sort by key in Python 2.3"
    try:
        list.sort(key=key)
    except TypeError:
        list.sort(lambda l, r: cmp(key(l), key(r)))

class logentry(object):
    '''Class logentry has the following attributes:
        .author    - author name as CVS knows it
        .branch    - name of branch this revision is on
        .branches  - revision tuple of branches starting at this revision
        .comment   - commit message
        .date      - the commit date as a (time, tz) tuple
        .dead      - true if file revision is dead
        .file      - name of file
        .lines     - a tuple (+lines, -lines) or None
        .parent    - previous revision of this entry
        .rcs       - name of file as returned from CVS
        .revision  - revision number as tuple
        .tags      - list of tags on the file
    '''
    def __init__(self, **entries):
        self.__dict__.update(entries)

class logerror(Exception):
    pass

def createlog(ui, directory=None, root="", rlog=True, cache=None):
    '''Collect the CVS rlog'''

    # Because we store many duplicate commit log messages, reusing strings
    # saves a lot of memory and pickle storage space.
    _scache = {}
    def scache(s):
        "return a shared version of a string"
        return _scache.setdefault(s, s)

    ui.status(_('collecting CVS rlog\n'))

    log = []      # list of logentry objects containing the CVS state

    # patterns to match in CVS (r)log output, by state of use
    re_00 = re.compile('RCS file: (.+)$')
    re_01 = re.compile(r'cvs \[r?log aborted\]: (.+)$')
    re_02 = re.compile('cvs (r?log|server): (.+)\n$')
    re_03 = re.compile("(Cannot access.+CVSROOT)|(can't create temporary directory.+)$")
    re_10 = re.compile('Working file: (.+)$')
    re_20 = re.compile('symbolic names:')
    re_30 = re.compile(r'\t(.+): ([\d.]+)$')
    re_31 = re.compile('----------------------------$')
    re_32 = re.compile('=============================================================================$')
    re_50 = re.compile(r'revision ([\d.]+)(\s+locked by:\s+.+;)?$')
    re_60 = re.compile(r'date:\s+(.+);\s+author:\s+(.+);\s+state:\s+(.+?);(\s+lines:\s+(\+\d+)?\s+(-\d+)?;)?')
    re_70 = re.compile('branches: (.+);$')

    prefix = ''   # leading path to strip off what we get from CVS

    if directory is None:
        # Current working directory

        # Get the real directory in the repository
        try:
            prefix = file(os.path.join('CVS', 'Repository')).read().strip()
            if prefix == ".":
                prefix = ""
            directory = prefix
        except IOError:
            raise logerror('Not a CVS sandbox')

        if prefix and not prefix.endswith('/'):
            prefix += '/'

        # Use the Root file in the sandbox, if it exists
        try:
            root = file(os.path.join('CVS', 'Root')).read().strip()
        except IOError:
            pass

    if not root:
        root = os.environ.get('CVSROOT', '')

    # read log cache if one exists
    oldlog = []
    date = None

    if cache:
        cachedir = os.path.expanduser('~/.hg.cvsps')
        if not os.path.exists(cachedir):
            os.mkdir(cachedir)

        # The cvsps cache pickle needs a uniquified name, based on the
        # repository location. The address may have all sorts of nasties
        # in it, slashes, colons and such. So here we take just the
        # alphanumerics, concatenated in a way that does not mix up the
        # various components, so that
        #    :pserver:user@server:/path
        # and
        #    /pserver/user/server/path
        # are mapped to different cache file names.
        cachefile = root.split(":") + [directory, "cache"]
        cachefile = ['-'.join(re.findall(r'\w+', s)) for s in cachefile if s]
        cachefile = os.path.join(cachedir, '.'.join([s for s in cachefile if s]))

        if cache == 'update':
            try:
                ui.note(_('reading cvs log cache %s\n') % cachefile)
                oldlog = pickle.load(file(cachefile))
                ui.note(_('cache has %d log entries\n') % len(oldlog))
            except Exception, e:
                ui.note(_('error reading cache: %r\n') % e)

            if oldlog:
                date = oldlog[-1].date    # last commit date as a (time,tz) tuple
                date = util.datestr(date, '%Y/%m/%d %H:%M:%S %1%2')

    # build the CVS commandline
    cmd = ['cvs', '-q']
    if root:
        cmd.append('-d%s' % root)
        p = root.split(':')[-1]
        if not p.endswith('/'):
            p += '/'
        prefix = p + prefix
    cmd.append(['log', 'rlog'][rlog])
    if date:
        # no space between option and date string
        cmd.append('-d>%s' % date)
    cmd.append(directory)

    # state machine begins here
    tags = {}     # dictionary of revisions on current file with their tags
    state = 0
    store = False # set when a new record can be appended

    cmd = [util.shellquote(arg) for arg in cmd]
    ui.note("running %s\n" % (' '.join(cmd)))
    ui.debug("prefix=%r directory=%r root=%r\n" % (prefix, directory, root))

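    # The loop below is a small state machine; each state expects a
    # particular piece of (r)log output, and the re_NN patterns above
    # are named for the state that mainly consumes them:
    #   0 - initial state, consume input until 'RCS file'
    #   1 - 'Working file' line (plain log only, not rlog)
    #   2 - 'symbolic names:' header
    #   3 - the symbolic names (tags) themselves
    #   4 - '-----' separator before the first revision
    #   5 - a 'revision x.y' line; a new logentry is created here
    #   6 - the date/author/state/lines line of a revision
    #   7 - 'branches:' line or the start of the commit message
    #   8 - the remainder of the commit message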
    for line in util.popen(' '.join(cmd)):
        if line.endswith('\n'):
            line = line[:-1]
        #ui.debug('state=%d line=%r\n' % (state, line))

        if state == 0:
            # initial state, consume input until we see 'RCS file'
            match = re_00.match(line)
            if match:
                rcs = match.group(1)
                tags = {}
                if rlog:
                    filename = rcs[:-2]
                    if filename.startswith(prefix):
                        filename = filename[len(prefix):]
                    if filename.startswith('/'):
                        filename = filename[1:]
                    if filename.startswith('Attic/'):
                        filename = filename[6:]
                    else:
                        filename = filename.replace('/Attic/', '/')
                    state = 2
                    continue
                state = 1
                continue
            match = re_01.match(line)
            if match:
                raise Exception(match.group(1))
            match = re_02.match(line)
            if match:
                raise Exception(match.group(2))
            if re_03.match(line):
                raise Exception(line)

        elif state == 1:
            # expect 'Working file' (only when using log instead of rlog)
            match = re_10.match(line)
            assert match, _('RCS file must be followed by working file')
            filename = match.group(1)
            state = 2

        elif state == 2:
            # expect 'symbolic names'
            if re_20.match(line):
                state = 3

        elif state == 3:
            # read the symbolic names and store as tags
            match = re_30.match(line)
            if match:
                rev = [int(x) for x in match.group(2).split('.')]

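                # CVS records a branch tag on revision a.b as the "magic"
                # branch number a.b.0.N; the real branch number is a.b.N,
                # so drop the embedded zero before storing the tag.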
                # Convert magic branch number to an odd-numbered one
                revn = len(rev)
                if revn > 3 and (revn % 2) == 0 and rev[-2] == 0:
                    rev = rev[:-2] + rev[-1:]
                rev = tuple(rev)

                if rev not in tags:
                    tags[rev] = []
                tags[rev].append(match.group(1))

            elif re_31.match(line):
                state = 5
            elif re_32.match(line):
                state = 0

        elif state == 4:
            # expecting '------' separator before first revision
            if re_31.match(line):
                state = 5
            else:
                assert not re_32.match(line), _('Must have at least some revisions')

        elif state == 5:
            # expecting revision number and possibly (ignored) lock indication
            # we create the logentry here from values stored in states 0 to 4,
            # as this state is re-entered for subsequent revisions of a file.
            match = re_50.match(line)
            assert match, _('expected revision number')
            e = logentry(rcs=scache(rcs), file=scache(filename),
                         revision=tuple([int(x) for x in match.group(1).split('.')]),
                         branches=[], parent=None)
            state = 6

        elif state == 6:
            # expecting date, author, state, lines changed
            match = re_60.match(line)
            assert match, _('revision must be followed by date line')
            d = match.group(1)
            if d[2] == '/':
                # Y2K
                d = '19' + d

            if len(d.split()) != 3:
                # cvs log dates always in GMT
                d = d + ' UTC'
            e.date = util.parsedate(d, ['%y/%m/%d %H:%M:%S', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S'])
            e.author = scache(match.group(2))
            e.dead = match.group(3).lower() == 'dead'

            if match.group(5):
                if match.group(6):
                    e.lines = (int(match.group(5)), int(match.group(6)))
                else:
                    e.lines = (int(match.group(5)), 0)
            elif match.group(6):
                e.lines = (0, int(match.group(6)))
            else:
                e.lines = None
            e.comment = []
            state = 7

        elif state == 7:
            # read the revision numbers of branches that start at this revision,
            # or store the commit log message otherwise
            m = re_70.match(line)
            if m:
                e.branches = [tuple([int(y) for y in x.strip().split('.')])
                              for x in m.group(1).split(';')]
                state = 8
            elif re_31.match(line):
                state = 5
                store = True
            elif re_32.match(line):
                state = 0
                store = True
            else:
                e.comment.append(line)

        elif state == 8:
            # store commit log message
            if re_31.match(line):
                state = 5
                store = True
            elif re_32.match(line):
                state = 0
                store = True
            else:
                e.comment.append(line)

        if store:
            # clean up the results and save in the log.
            store = False
            e.tags = [scache(x) for x in tags.get(e.revision, [])]
            e.tags.sort()
            e.comment = scache('\n'.join(e.comment))

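            # A revision number with more than two parts and an even
            # count of them (e.g. 1.2.2.1) lives on a branch; the branch
            # name, if tagged, is recorded under the branch number (1.2.2).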
            revn = len(e.revision)
            if revn > 3 and (revn % 2) == 0:
                e.branch = tags.get(e.revision[:-1], [None])[0]
            else:
                e.branch = None

            log.append(e)

            if len(log) % 100 == 0:
                ui.status(util.ellipsis('%d %s' % (len(log), e.file), 80) + '\n')

    listsort(log, key=lambda x: (x.rcs, x.revision))

    # find parent revisions of individual files
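    # With the log sorted by (rcs, revision), a revision's parent is the
    # previous revision seen on the same branch; the first revision on a
    # branch falls back to the branchpoint, e.g. 1.2.2.1 -> 1.2.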
    versions = {}
    for e in log:
        branch = e.revision[:-1]
        p = versions.get((e.rcs, branch), None)
        if p is None:
            p = e.revision[:-2]
        e.parent = p
        versions[(e.rcs, branch)] = e.revision

    # update the log cache
    if cache:
        if log:
            # join up the old and new logs
            listsort(log, key=lambda x: x.date)

            if oldlog and oldlog[-1].date >= log[0].date:
                raise logerror('Log cache overlaps with new log entries, re-run without cache.')

            log = oldlog + log

            # write the new cachefile
            ui.note(_('writing cvs log cache %s\n') % cachefile)
            pickle.dump(log, file(cachefile, 'w'))
        else:
            log = oldlog

    ui.status(_('%d log entries\n') % len(log))

    return log


class changeset(object):
    '''Class changeset has the following attributes:
        .author  - author name as CVS knows it
        .branch  - name of branch this changeset is on, or None
        .comment - commit message
        .date    - the commit date as a (time,tz) tuple
        .entries - list of logentry objects in this changeset
        .parents - list of one or two parent changesets
        .tags    - list of tags on this changeset
    '''
    def __init__(self, **entries):
        self.__dict__.update(entries)

def createchangeset(ui, log, fuzz=60, mergefrom=None, mergeto=None):
    '''Convert log into changesets.'''

    ui.status(_('creating changesets\n'))

    # Merge changesets

    listsort(log, key=lambda x: (x.comment, x.author, x.branch, x.date))

    changesets = []
    files = {}
    c = None
    for i, e in enumerate(log):

        # Check if log entry belongs to the current changeset or not.
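        # Entries with the same comment, author and branch are merged as
        # long as their commit times stay within 'fuzz' seconds of the
        # changeset and it does not already touch the same file.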
        if not (c and
                e.comment == c.comment and
                e.author == c.author and
                e.branch == c.branch and
                (c.date[0] + c.date[1]) <= (e.date[0] + e.date[1]) <= (c.date[0] + c.date[1]) + fuzz and
                e.file not in files):
            c = changeset(comment=e.comment, author=e.author,
                          branch=e.branch, date=e.date, entries=[])
            changesets.append(c)
            files = {}
            if len(changesets) % 100 == 0:
                ui.status(util.ellipsis('%d %s' % (len(changesets), repr(e.comment)[1:-1]), 80) + '\n')

        e.Changeset = c
        c.entries.append(e)
        files[e.file] = True
        c.date = e.date       # changeset date is date of latest commit in it

    # Sort files in each changeset

    for c in changesets:
        def pathcompare(l, r):
            'Mimic cvsps sorting order'
            l = l.split('/')
            r = r.split('/')
            nl = len(l)
            nr = len(r)
            n = min(nl, nr)
            for i in range(n):
                if i + 1 == nl and nl < nr:
                    return -1
                elif i + 1 == nr and nl > nr:
                    return +1
                elif l[i] < r[i]:
                    return -1
                elif l[i] > r[i]:
                    return +1
            return 0
        def entitycompare(l, r):
            return pathcompare(l.file, r.file)

        c.entries.sort(entitycompare)

    # Sort changesets by date

    def cscmp(l, r):
        d = sum(l.date) - sum(r.date)
        if d:
            return d

        # detect vendor branches and initial commits on a branch
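        # The changeset containing the parent revision of a file in the
        # other changeset must sort first, even when the dates are equal.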
        le = {}
        for e in l.entries:
            le[e.rcs] = e.revision
        re = {}
        for e in r.entries:
            re[e.rcs] = e.revision

        d = 0
        for e in l.entries:
            if re.get(e.rcs, None) == e.parent:
                assert not d
                d = 1
                break

        for e in r.entries:
            if le.get(e.rcs, None) == e.parent:
                assert not d
                d = -1
                break

        return d

    changesets.sort(cscmp)

    # Collect tags

    globaltags = {}
    for c in changesets:
        tags = {}
        for e in c.entries:
            for tag in e.tags:
                # remember which is the latest changeset to have this tag
                globaltags[tag] = c

    for c in changesets:
        tags = {}
        for e in c.entries:
            for tag in e.tags:
                tags[tag] = True
        # remember tags only if this is the latest changeset to have it
        tagnames = [tag for tag in tags if globaltags[tag] is c]
        tagnames.sort()
        c.tags = tagnames

    # Find parent changesets, handle {{mergetobranch BRANCHNAME}}
    # by inserting dummy changesets with two parents, and handle
    # {{mergefrombranch BRANCHNAME}} by setting two parents.
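    # For example, a commit whose log message contains
    # '{{mergetobranch HEAD}}' on a branch causes an empty merge
    # changeset to be inserted on the trunk just after it.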

    if mergeto is None:
        mergeto = r'{{mergetobranch ([-\w]+)}}'
    if mergeto:
        mergeto = re.compile(mergeto)

    if mergefrom is None:
        mergefrom = r'{{mergefrombranch ([-\w]+)}}'
    if mergefrom:
        mergefrom = re.compile(mergefrom)

    versions = {}    # changeset index where we saw any particular file version
    branches = {}    # changeset index where we saw a branch
    n = len(changesets)
    i = 0
    while i < n:
        c = changesets[i]

        for f in c.entries:
            versions[(f.rcs, f.revision)] = i

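        # The parent changeset is the head of this branch if the branch
        # has been seen before; otherwise it is the newest changeset that
        # contains the parent revision of any file changed here.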
        p = None
        if c.branch in branches:
            p = branches[c.branch]
        else:
            for f in c.entries:
                p = max(p, versions.get((f.rcs, f.parent), None))

        c.parents = []
        if p is not None:
            c.parents.append(changesets[p])

        if mergefrom:
            m = mergefrom.search(c.comment)
            if m:
                m = m.group(1)
                if m == 'HEAD':
                    m = None
                if m in branches and c.branch != m:
                    c.parents.append(changesets[branches[m]])

        if mergeto:
            m = mergeto.search(c.comment)
            if m:
                try:
                    m = m.group(1)
                    if m == 'HEAD':
                        m = None
                except:
                    m = None   # if no group found then merge to HEAD
                if m in branches and c.branch != m:
                    # insert empty changeset for merge
                    cc = changeset(author=c.author, branch=m, date=c.date,
                                   comment='convert-repo: CVS merge from branch %s' % c.branch,
                                   entries=[], tags=[], parents=[changesets[branches[m]], c])
                    changesets.insert(i + 1, cc)
                    branches[m] = i + 1

                    # adjust our loop counters now we have inserted a new entry
                    n += 1
                    i += 2
                    continue

        branches[c.branch] = i
        i += 1

    # Number changesets

    for i, c in enumerate(changesets):
        c.id = i + 1

    ui.status(_('%d changeset entries\n') % len(changesets))

    return changesets
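
# A minimal sketch of how these helpers fit together (hypothetical driver
# code; 'ui' is a Mercurial ui object, and the root/directory values are
# purely illustrative):
#
#   log = createlog(ui, directory='module', root=':pserver:user@host:/cvsroot')
#   changesets = createchangeset(ui, log, fuzz=60)
#   for cs in changesets:
#       ui.write('%d: %s\n' % (cs.id, cs.comment))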