convert: transcode CVS log messages by specified encoding (
issue5597)
Converting from CVS to Mercurial assumes that CVS log messages in "cvs
rlog" output are encoded in UTF-8 (or basic Latin-1). But cvs itself
is usually unaware of encoding of log messages, in practice.
Therefore, if there are commits, of which log message is encoded in
other than UTF-8, log message of corresponded revisions in the
converted repository will be broken.
To avoid such broken log messages, this patch transcodes CVS log
messages by encoding specified via "convert.cvsps.logencoding"
configuration.
This patch accepts multiple encoding for convenience, because
"multiple encoding mixed in a repository" easily occurs. For example,
UTF-8 (recent POSIX), cp932 (Windows), and EUC-JP (legacy POSIX) are
well known encoding for Japanese.
#!/usr/bin/env python
#
# posplit - split messages in paragraphs on .po/.pot files
#
# license: MIT/X11/Expat
#
from __future__ import absolute_import, print_function
import polib
import re
import sys
def addentry(po, entry, cache):
e = cache.get(entry.msgid)
if e:
e.occurrences.extend(entry.occurrences)
else:
po.append(entry)
cache[entry.msgid] = entry
def mkentry(orig, delta, msgid, msgstr):
entry = polib.POEntry()
entry.merge(orig)
entry.msgid = msgid or orig.msgid
entry.msgstr = msgstr or orig.msgstr
entry.occurrences = [(p, int(l) + delta) for (p, l) in orig.occurrences]
return entry
if __name__ == "__main__":
po = polib.pofile(sys.argv[1])
cache = {}
entries = po[:]
po[:] = []
findd = re.compile(r' *\.\. (\w+)::') # for finding directives
for entry in entries:
msgids = entry.msgid.split(u'\n\n')
if entry.msgstr:
msgstrs = entry.msgstr.split(u'\n\n')
else:
msgstrs = [u''] * len(msgids)
if len(msgids) != len(msgstrs):
# places the whole existing translation as a fuzzy
# translation for each paragraph, to give the
# translator a chance to recover part of the old
# translation - erasing extra paragraphs is
# probably better than retranslating all from start
if 'fuzzy' not in entry.flags:
entry.flags.append('fuzzy')
msgstrs = [entry.msgstr] * len(msgids)
delta = 0
for msgid, msgstr in zip(msgids, msgstrs):
if msgid and msgid != '::':
newentry = mkentry(entry, delta, msgid, msgstr)
mdirective = findd.match(msgid)
if mdirective:
if not msgid[mdirective.end():].rstrip():
# only directive, nothing to translate here
delta += 2
continue
directive = mdirective.group(1)
if directive in ('container', 'include'):
if msgid.rstrip('\n').count('\n') == 0:
# only rst syntax, nothing to translate
delta += 2
continue
else:
# lines following directly, unexpected
print('Warning: text follows line with directive' \
' %s' % directive)
comment = 'do not translate: .. %s::' % directive
if not newentry.comment:
newentry.comment = comment
elif comment not in newentry.comment:
newentry.comment += '\n' + comment
addentry(po, newentry, cache)
delta += 2 + msgid.count('\n')
po.save()