changeset 11391:9b404640d795
Merge with hg-i18n
| author   | Martin Geisler <mg@lazybytes.net>                      |
|----------|--------------------------------------------------------|
| date     | Sat, 19 Jun 2010 17:06:11 +0200                        |
| parents  | 11cd65611f3f (diff), b2c0bc41165f (current diff)       |
| children | a87906461835                                           |
| diffstat | 12 files changed, 1967 insertions(+), 34 deletions(-)  |
--- a/.hgignore	Fri Jun 18 15:22:56 2010 +0200
+++ b/.hgignore	Sat Jun 19 17:06:11 2010 +0200
@@ -25,6 +25,7 @@
 MANIFEST
 patches
 mercurial/__version__.py
+mercurial.egg-info
 Output/Mercurial-*.exe
 .DS_Store
 tags
--- a/Makefile	Fri Jun 18 15:22:56 2010 +0200
+++ b/Makefile	Sat Jun 19 17:06:11 2010 +0200
@@ -100,6 +100,7 @@
 	  --copyright-holder "Matt Mackall <mpm@selenic.com> and others" \
 	  --from-code ISO-8859-1 --join --sort-by-file \
 	  -d hg -p i18n -o hg.pot
+	$(PYTHON) i18n/posplit i18n/hg.pot
 
 %.po: i18n/hg.pot
 	msgmerge --no-location --update $@ $^
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/i18n/polib.LICENSE	Sat Jun 19 17:06:11 2010 +0200
@@ -0,0 +1,19 @@
+copyright (c) 2006-2010 David JEAN LOUIS
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/i18n/polib.py Sat Jun 19 17:06:11 2010 +0200 @@ -0,0 +1,1680 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# License: MIT (see LICENSE file provided) +# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: + +""" +**polib** allows you to manipulate, create, modify gettext files (pot, po +and mo files). You can load existing files, iterate through it's entries, +add, modify entries, comments or metadata, etc... or create new po files +from scratch. + +**polib** provides a simple and pythonic API, exporting only three +convenience functions (*pofile*, *mofile* and *detect_encoding*), and the +four core classes, *POFile*, *MOFile*, *POEntry* and *MOEntry* for creating +new files/entries. + +**Basic example**: + +>>> import polib +>>> # load an existing po file +>>> po = polib.pofile('tests/test_utf8.po') +>>> for entry in po: +... # do something with entry... +... pass +>>> # add an entry +>>> entry = polib.POEntry(msgid='Welcome', msgstr='Bienvenue') +>>> entry.occurrences = [('welcome.py', '12'), ('anotherfile.py', '34')] +>>> po.append(entry) +>>> # to save our modified po file: +>>> # po.save() +>>> # or you may want to compile the po file +>>> # po.save_as_mofile('tests/test_utf8.mo') +""" + +__author__ = 'David JEAN LOUIS <izimobil@gmail.com>' +__version__ = '0.5.2' +__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', + 'detect_encoding', 'escape', 'unescape', 'detect_encoding',] + +import codecs +import struct +import textwrap +import types +import re + +default_encoding = 'utf-8' + +# function pofile() {{{ + +def pofile(fpath, **kwargs): + """ + Convenience function that parse the po/pot file *fpath* and return + a POFile instance. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the po/pot file to parse + - *wrapwidth*: integer, the wrap width, only useful when -w option was + passed to xgettext (optional, default to 78) + - *autodetect_encoding*: boolean, if set to False the function will + not try to detect the po file encoding (optional, default to True) + - *encoding*: string, an encoding, only relevant if autodetect_encoding + is set to False + - *check_for_duplicates*: whether to check for duplicate entries when + adding entries to the file, default: False (optional) + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_weird_occurrences.po', + ... check_for_duplicates=True) + >>> po #doctest: +ELLIPSIS + <POFile instance at ...> + >>> import os, tempfile + >>> all_attrs = ('msgctxt', 'msgid', 'msgstr', 'msgid_plural', + ... 'msgstr_plural', 'obsolete', 'comment', 'tcomment', + ... 'occurrences', 'flags', 'previous_msgctxt', + ... 'previous_msgid', 'previous_msgid_plural') + >>> for fname in ['test_iso-8859-15.po', 'test_utf8.po']: + ... orig_po = polib.pofile('tests/'+fname) + ... tmpf = tempfile.NamedTemporaryFile().name + ... orig_po.save(tmpf) + ... try: + ... new_po = polib.pofile(tmpf) + ... for old, new in zip(orig_po, new_po): + ... for attr in all_attrs: + ... if getattr(old, attr) != getattr(new, attr): + ... getattr(old, attr) + ... getattr(new, attr) + ... finally: + ... os.unlink(tmpf) + >>> po_file = polib.pofile('tests/test_save_as_mofile.po') + >>> tmpf = tempfile.NamedTemporaryFile().name + >>> po_file.save_as_mofile(tmpf) + >>> try: + ... mo_file = polib.mofile(tmpf) + ... for old, new in zip(po_file, mo_file): + ... if po_file._encode(old.msgid) != mo_file._encode(new.msgid): + ... 'OLD: ', po_file._encode(old.msgid) + ... 
'NEW: ', mo_file._encode(new.msgid) + ... if po_file._encode(old.msgstr) != mo_file._encode(new.msgstr): + ... 'OLD: ', po_file._encode(old.msgstr) + ... 'NEW: ', mo_file._encode(new.msgstr) + ... print new.msgstr + ... finally: + ... os.unlink(tmpf) + """ + if kwargs.get('autodetect_encoding', True) == True: + enc = detect_encoding(fpath) + else: + enc = kwargs.get('encoding', default_encoding) + check_for_duplicates = kwargs.get('check_for_duplicates', False) + parser = _POFileParser( + fpath, + encoding=enc, + check_for_duplicates=kwargs.get('check_for_duplicates', False) + ) + instance = parser.parse() + instance.wrapwidth = kwargs.get('wrapwidth', 78) + return instance + +# }}} +# function mofile() {{{ + +def mofile(fpath, **kwargs): + """ + Convenience function that parse the mo file *fpath* and return + a MOFile instance. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the mo file to parse + - *wrapwidth*: integer, the wrap width, only useful when -w option was + passed to xgettext to generate the po file that was used to format + the mo file (optional, default to 78) + - *autodetect_encoding*: boolean, if set to False the function will + not try to detect the po file encoding (optional, default to True) + - *encoding*: string, an encoding, only relevant if autodetect_encoding + is set to False + - *check_for_duplicates*: whether to check for duplicate entries when + adding entries to the file, default: False (optional) + + **Example**: + + >>> import polib + >>> mo = polib.mofile('tests/test_utf8.mo', check_for_duplicates=True) + >>> mo #doctest: +ELLIPSIS + <MOFile instance at ...> + >>> import os, tempfile + >>> for fname in ['test_iso-8859-15.mo', 'test_utf8.mo']: + ... orig_mo = polib.mofile('tests/'+fname) + ... tmpf = tempfile.NamedTemporaryFile().name + ... orig_mo.save(tmpf) + ... try: + ... new_mo = polib.mofile(tmpf) + ... for old, new in zip(orig_mo, new_mo): + ... if old.msgid != new.msgid: + ... old.msgstr + ... new.msgstr + ... finally: + ... os.unlink(tmpf) + """ + if kwargs.get('autodetect_encoding', True) == True: + enc = detect_encoding(fpath, True) + else: + enc = kwargs.get('encoding', default_encoding) + parser = _MOFileParser( + fpath, + encoding=enc, + check_for_duplicates=kwargs.get('check_for_duplicates', False) + ) + instance = parser.parse() + instance.wrapwidth = kwargs.get('wrapwidth', 78) + return instance + +# }}} +# function detect_encoding() {{{ + +def detect_encoding(fpath, binary_mode=False): + """ + Try to detect the encoding used by the file *fpath*. The function will + return polib default *encoding* if it's unable to detect it. + + **Keyword argument**: + - *fpath*: string, full or relative path to the mo file to parse. + + **Examples**: + + >>> print(detect_encoding('tests/test_noencoding.po')) + utf-8 + >>> print(detect_encoding('tests/test_utf8.po')) + UTF-8 + >>> print(detect_encoding('tests/test_utf8.mo', True)) + UTF-8 + >>> print(detect_encoding('tests/test_iso-8859-15.po')) + ISO_8859-15 + >>> print(detect_encoding('tests/test_iso-8859-15.mo', True)) + ISO_8859-15 + """ + import re + rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)') + if binary_mode: + mode = 'rb' + else: + mode = 'r' + f = open(fpath, mode) + for l in f.readlines(): + match = rx.search(l) + if match: + f.close() + return match.group(1).strip() + f.close() + return default_encoding + +# }}} +# function escape() {{{ + +def escape(st): + """ + Escape special chars and return the given string *st*. 
+ + **Examples**: + + >>> escape('\\t and \\n and \\r and " and \\\\') + '\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\' + """ + return st.replace('\\', r'\\')\ + .replace('\t', r'\t')\ + .replace('\r', r'\r')\ + .replace('\n', r'\n')\ + .replace('\"', r'\"') + +# }}} +# function unescape() {{{ + +def unescape(st): + """ + Unescape special chars and return the given string *st*. + + **Examples**: + + >>> unescape('\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\') + '\\t and \\n and \\r and " and \\\\' + >>> unescape(r'\\n') + '\\n' + >>> unescape(r'\\\\n') + '\\\\n' + >>> unescape(r'\\\\n\\n') + '\\\\n\\n' + """ + def unescape_repl(m): + m = m.group(1) + if m == 'n': + return '\n' + if m == 't': + return '\t' + if m == 'r': + return '\r' + if m == '\\': + return '\\' + return m # handles escaped double quote + return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st) + +# }}} +# class _BaseFile {{{ + +class _BaseFile(list): + """ + Common parent class for POFile and MOFile classes. + This class must **not** be instanciated directly. + """ + + def __init__(self, *args, **kwargs): + """ + Constructor. + + **Keyword arguments**: + - *fpath*: string, path to po or mo file + - *wrapwidth*: integer, the wrap width, only useful when -w option + was passed to xgettext to generate the po file that was used to + format the mo file, default to 78 (optional), + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). + """ + list.__init__(self) + # the opened file handle + self.fpath = kwargs.get('fpath') + # the width at which lines should be wrapped + self.wrapwidth = kwargs.get('wrapwidth', 78) + # the file encoding + self.encoding = kwargs.get('encoding', default_encoding) + # whether to check for duplicate entries or not + self.check_for_duplicates = kwargs.get('check_for_duplicates', False) + # header + self.header = '' + # both po and mo files have metadata + self.metadata = {} + self.metadata_is_fuzzy = 0 + + def __str__(self): + """ + String representation of the file. + """ + ret = [] + entries = [self.metadata_as_entry()] + \ + [e for e in self if not e.obsolete] + for entry in entries: + ret.append(entry.__str__(self.wrapwidth)) + for entry in self.obsolete_entries(): + ret.append(entry.__str__(self.wrapwidth)) + return '\n'.join(ret) + + def __contains__(self, entry): + """ + Overriden method to implement the membership test (in and not in). + The method considers that an entry is in the file if it finds an + entry that has the same msgid (case sensitive). + + **Keyword argument**: + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> po = POFile() + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='barfoo', msgstr='spam') + >>> e3 = POEntry(msgid='foobar', msgstr='eggs') + >>> e4 = POEntry(msgid='spameggs', msgstr='eggs') + >>> po.append(e1) + >>> po.append(e2) + >>> e1 in po + True + >>> e2 not in po + False + >>> e3 in po + True + >>> e4 in po + False + """ + return self.find(entry.msgid, by='msgid') is not None + + def append(self, entry): + """ + Overriden method to check for duplicates entries, if a user tries to + add an entry that already exists, the method will raise a ValueError + exception. 
+ + **Keyword argument**: + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='foobar', msgstr='eggs') + >>> po = POFile(check_for_duplicates=True) + >>> po.append(e1) + >>> try: + ... po.append(e2) + ... except ValueError, e: + ... unicode(e) + u'Entry "foobar" already exists' + """ + if self.check_for_duplicates and entry in self: + raise ValueError('Entry "%s" already exists' % entry.msgid) + super(_BaseFile, self).append(entry) + + def insert(self, index, entry): + """ + Overriden method to check for duplicates entries, if a user tries to + insert an entry that already exists, the method will raise a ValueError + exception. + + **Keyword arguments**: + - *index*: index at which the entry should be inserted + - *entry*: an instance of polib._BaseEntry + + **Tests**: + >>> import polib + >>> polib.check_for_duplicates = True + >>> e1 = POEntry(msgid='foobar', msgstr='spam') + >>> e2 = POEntry(msgid='barfoo', msgstr='eggs') + >>> e3 = POEntry(msgid='foobar', msgstr='eggs') + >>> po = POFile(check_for_duplicates=True) + >>> po.insert(0, e1) + >>> po.insert(1, e2) + >>> try: + ... po.insert(0, e3) + ... except ValueError, e: + ... unicode(e) + u'Entry "foobar" already exists' + """ + if self.check_for_duplicates and entry in self: + raise ValueError('Entry "%s" already exists' % entry.msgid) + super(_BaseFile, self).insert(index, entry) + + def __repr__(self): + """Return the official string representation of the object.""" + return '<%s instance at %x>' % (self.__class__.__name__, id(self)) + + def metadata_as_entry(self): + """ + Return the metadata as an entry: + + >>> import polib + >>> po = polib.pofile('tests/test_fuzzy_header.po') + >>> unicode(po) == unicode(open('tests/test_fuzzy_header.po').read()) + True + """ + e = POEntry(msgid='') + mdata = self.ordered_metadata() + if mdata: + strs = [] + e._multiline_str['msgstr'] = '' + for name, value in mdata: + # Strip whitespace off each line in a multi-line entry + strs.append('%s: %s' % (name, value)) + e.msgstr = '\n'.join(strs) + '\n' + e._multiline_str['msgstr'] = '__POLIB__NL__'.join( + [s + '\n' for s in strs]) + if self.metadata_is_fuzzy: + e.flags.append('fuzzy') + return e + + def save(self, fpath=None, repr_method='__str__'): + """ + Save the po file to file *fpath* if no file handle exists for + the object. If there's already an open file and no fpath is + provided, then the existing file is rewritten with the modified + data. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the file. + - *repr_method*: string, the method to use for output. + """ + if self.fpath is None and fpath is None: + raise IOError('You must provide a file path to save() method') + contents = getattr(self, repr_method)() + if fpath is None: + fpath = self.fpath + if repr_method == 'to_binary': + fhandle = open(fpath, 'wb') + else: + fhandle = codecs.open(fpath, 'w', self.encoding) + if type(contents) != types.UnicodeType: + contents = contents.decode(self.encoding) + fhandle.write(contents) + fhandle.close() + + def find(self, st, by='msgid'): + """ + Find entry which msgid (or property identified by the *by* + attribute) matches the string *st*. 
+ + **Keyword arguments**: + - *st*: string, the string to search for + - *by*: string, the comparison attribute + + **Examples**: + + >>> po = pofile('tests/test_utf8.po') + >>> entry = po.find('Thursday') + >>> entry.msgstr + u'Jueves' + >>> entry = po.find('Some unexistant msgid') + >>> entry is None + True + >>> entry = po.find('Jueves', 'msgstr') + >>> entry.msgid + u'Thursday' + """ + for e in self: + if getattr(e, by) == st: + return e + return None + + def ordered_metadata(self): + """ + Convenience method that return the metadata ordered. The return + value is list of tuples (metadata name, metadata_value). + """ + # copy the dict first + metadata = self.metadata.copy() + data_order = [ + 'Project-Id-Version', + 'Report-Msgid-Bugs-To', + 'POT-Creation-Date', + 'PO-Revision-Date', + 'Last-Translator', + 'Language-Team', + 'MIME-Version', + 'Content-Type', + 'Content-Transfer-Encoding' + ] + ordered_data = [] + for data in data_order: + try: + value = metadata.pop(data) + ordered_data.append((data, value)) + except KeyError: + pass + # the rest of the metadata won't be ordered there are no specs for this + keys = metadata.keys() + list(keys).sort() + for data in keys: + value = metadata[data] + ordered_data.append((data, value)) + return ordered_data + + def to_binary(self): + """ + Return the mofile binary representation. + """ + import array + import struct + import types + offsets = [] + entries = self.translated_entries() + # the keys are sorted in the .mo file + def cmp(_self, other): + if _self.msgid > other.msgid: + return 1 + elif _self.msgid < other.msgid: + return -1 + else: + return 0 + # add metadata entry + entries.sort(cmp) + mentry = self.metadata_as_entry() + mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() + entries = [mentry] + entries + entries_len = len(entries) + ids, strs = '', '' + for e in entries: + # For each string, we need size and file offset. Each string is + # NUL terminated; the NUL does not count into the size. + if e.msgid_plural: + indexes = e.msgstr_plural.keys() + indexes.sort() + msgstr = [] + for index in indexes: + msgstr.append(e.msgstr_plural[index]) + msgid = self._encode(e.msgid + '\0' + e.msgid_plural) + msgstr = self._encode('\0'.join(msgstr)) + else: + msgid = self._encode(e.msgid) + msgstr = self._encode(e.msgstr) + offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) + ids += msgid + '\0' + strs += msgstr + '\0' + # The header is 7 32-bit unsigned integers. + keystart = 7*4+16*entries_len + # and the values start after the keys + valuestart = keystart + len(ids) + koffsets = [] + voffsets = [] + # The string table first has the list of keys, then the list of values. + # Each entry has first the size of the string, then the file offset. + for o1, l1, o2, l2 in offsets: + koffsets += [l1, o1+keystart] + voffsets += [l2, o2+valuestart] + offsets = koffsets + voffsets + output = struct.pack("IIIIIII", + 0x950412de, # Magic number + 0, # Version + entries_len, # # of entries + 7*4, # start of key index + 7*4+entries_len*8, # start of value index + 0, 0) # size and offset of hash table + output += array.array("I", offsets).tostring() + output += ids + output += strs + return output + + def _encode(self, mixed): + """ + Encode the given argument with the file encoding if the type is unicode + and return the encoded string. + """ + if type(mixed) == types.UnicodeType: + return mixed.encode(self.encoding) + return mixed + +# }}} +# class POFile {{{ + +class POFile(_BaseFile): + ''' + Po (or Pot) file reader/writer. 
+ POFile objects inherit the list objects methods. + + **Example**: + + >>> po = POFile() + >>> entry1 = POEntry( + ... msgid="Some english text", + ... msgstr="Un texte en anglais" + ... ) + >>> entry1.occurrences = [('testfile', 12),('another_file', 1)] + >>> entry1.comment = "Some useful comment" + >>> entry2 = POEntry( + ... msgid="Peace in some languages", + ... msgstr="Pace سلام שלום Hasîtî 和平" + ... ) + >>> entry2.occurrences = [('testfile', 15),('another_file', 5)] + >>> entry2.comment = "Another useful comment" + >>> entry3 = POEntry( + ... msgid='Some entry with quotes " \\"', + ... msgstr='Un message unicode avec des quotes " \\"' + ... ) + >>> entry3.comment = "Test string quoting" + >>> po.append(entry1) + >>> po.append(entry2) + >>> po.append(entry3) + >>> po.header = "Some Header" + >>> print(po) + # Some Header + msgid "" + msgstr "" + <BLANKLINE> + #. Some useful comment + #: testfile:12 another_file:1 + msgid "Some english text" + msgstr "Un texte en anglais" + <BLANKLINE> + #. Another useful comment + #: testfile:15 another_file:5 + msgid "Peace in some languages" + msgstr "Pace سلام שלום Hasîtî 和平" + <BLANKLINE> + #. Test string quoting + msgid "Some entry with quotes \\" \\"" + msgstr "Un message unicode avec des quotes \\" \\"" + <BLANKLINE> + ''' + + def __str__(self): + """Return the string representation of the po file""" + ret, headers = '', self.header.split('\n') + for header in headers: + if header[:1] in [',', ':']: + ret += '#%s\n' % header + else: + ret += '# %s\n' % header + return ret + _BaseFile.__str__(self) + + def save_as_mofile(self, fpath): + """ + Save the binary representation of the file to *fpath*. + + **Keyword arguments**: + - *fpath*: string, full or relative path to the file. + """ + _BaseFile.save(self, fpath, 'to_binary') + + def percent_translated(self): + """ + Convenience method that return the percentage of translated + messages. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> po.percent_translated() + 50 + >>> po = POFile() + >>> po.percent_translated() + 100 + """ + total = len([e for e in self if not e.obsolete]) + if total == 0: + return 100 + translated = len(self.translated_entries()) + return int((100.00 / float(total)) * translated) + + def translated_entries(self): + """ + Convenience method that return a list of translated entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.translated_entries()) + 6 + """ + return [e for e in self if e.translated()] + + def untranslated_entries(self): + """ + Convenience method that return a list of untranslated entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.untranslated_entries()) + 4 + """ + return [e for e in self if not e.translated() and not e.obsolete \ + and not 'fuzzy' in e.flags] + + def fuzzy_entries(self): + """ + Convenience method that return the list of 'fuzzy' entries. + + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.fuzzy_entries()) + 2 + """ + return [e for e in self if 'fuzzy' in e.flags] + + def obsolete_entries(self): + """ + Convenience method that return the list of obsolete entries. 
+ + **Example**: + + >>> import polib + >>> po = polib.pofile('tests/test_pofile_helpers.po') + >>> len(po.obsolete_entries()) + 4 + """ + return [e for e in self if e.obsolete] + + def merge(self, refpot): + """ + XXX this could not work if encodings are different, needs thinking + and general refactoring of how polib handles encoding... + + Convenience method that merge the current pofile with the pot file + provided. It behaves exactly as the gettext msgmerge utility: + + - comments of this file will be preserved, but extracted comments + and occurrences will be discarded + - any translations or comments in the file will be discarded, + however dot comments and file positions will be preserved + + **Keyword argument**: + - *refpot*: object POFile, the reference catalog. + + **Example**: + + >>> import polib + >>> refpot = polib.pofile('tests/test_merge.pot') + >>> po = polib.pofile('tests/test_merge_before.po') + >>> po.merge(refpot) + >>> expected_po = polib.pofile('tests/test_merge_after.po') + >>> unicode(po) == unicode(expected_po) + True + """ + for entry in refpot: + e = self.find(entry.msgid) + if e is None: + e = POEntry() + self.append(e) + e.merge(entry) + # ok, now we must "obsolete" entries that are not in the refpot + # anymore + for entry in self: + if refpot.find(entry.msgid) is None: + entry.obsolete = True + +# }}} +# class MOFile {{{ + +class MOFile(_BaseFile): + ''' + Mo file reader/writer. + MOFile objects inherit the list objects methods. + + **Example**: + + >>> mo = MOFile() + >>> entry1 = POEntry( + ... msgid="Some english text", + ... msgstr="Un texte en anglais" + ... ) + >>> entry2 = POEntry( + ... msgid="I need my dirty cheese", + ... msgstr="Je veux mon sale fromage" + ... ) + >>> entry3 = MOEntry( + ... msgid='Some entry with quotes " \\"', + ... msgstr='Un message unicode avec des quotes " \\"' + ... ) + >>> mo.append(entry1) + >>> mo.append(entry2) + >>> mo.append(entry3) + >>> print(mo) + msgid "" + msgstr "" + <BLANKLINE> + msgid "Some english text" + msgstr "Un texte en anglais" + <BLANKLINE> + msgid "I need my dirty cheese" + msgstr "Je veux mon sale fromage" + <BLANKLINE> + msgid "Some entry with quotes \\" \\"" + msgstr "Un message unicode avec des quotes \\" \\"" + <BLANKLINE> + ''' + + def __init__(self, *args, **kwargs): + """ + MOFile constructor. Mo files have two other properties: + - magic_number: the magic_number of the binary file, + - version: the version of the mo spec. + """ + _BaseFile.__init__(self, *args, **kwargs) + self.magic_number = None + self.version = 0 + + def save_as_pofile(self, fpath): + """ + Save the string representation of the file to *fpath*. + + **Keyword argument**: + - *fpath*: string, full or relative path to the file. + """ + _BaseFile.save(self, fpath) + + def save(self, fpath): + """ + Save the binary representation of the file to *fpath*. + + **Keyword argument**: + - *fpath*: string, full or relative path to the file. + """ + _BaseFile.save(self, fpath, 'to_binary') + + def percent_translated(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return 100 + + def translated_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return self + + def untranslated_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return [] + + def fuzzy_entries(self): + """ + Convenience method to keep the same interface with POFile instances. 
+ """ + return [] + + def obsolete_entries(self): + """ + Convenience method to keep the same interface with POFile instances. + """ + return [] + +# }}} +# class _BaseEntry {{{ + +class _BaseEntry(object): + """ + Base class for POEntry or MOEntry objects. + This class must *not* be instanciated directly. + """ + + def __init__(self, *args, **kwargs): + """Base Entry constructor.""" + self.msgid = kwargs.get('msgid', '') + self.msgstr = kwargs.get('msgstr', '') + self.msgid_plural = kwargs.get('msgid_plural', '') + self.msgstr_plural = kwargs.get('msgstr_plural', {}) + self.obsolete = kwargs.get('obsolete', False) + self.encoding = kwargs.get('encoding', default_encoding) + self.msgctxt = kwargs.get('msgctxt', None) + self._multiline_str = {} + + def __repr__(self): + """Return the official string representation of the object.""" + return '<%s instance at %x>' % (self.__class__.__name__, id(self)) + + def __str__(self, wrapwidth=78): + """ + Common string representation of the POEntry and MOEntry + objects. + """ + if self.obsolete: + delflag = '#~ ' + else: + delflag = '' + ret = [] + # write the msgctxt if any + if self.msgctxt is not None: + ret += self._str_field("msgctxt", delflag, "", self.msgctxt) + # write the msgid + ret += self._str_field("msgid", delflag, "", self.msgid) + # write the msgid_plural if any + if self.msgid_plural: + ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural) + if self.msgstr_plural: + # write the msgstr_plural if any + msgstrs = self.msgstr_plural + keys = list(msgstrs) + keys.sort() + for index in keys: + msgstr = msgstrs[index] + plural_index = '[%s]' % index + ret += self._str_field("msgstr", delflag, plural_index, msgstr) + else: + # otherwise write the msgstr + ret += self._str_field("msgstr", delflag, "", self.msgstr) + ret.append('') + return '\n'.join(ret) + + def _str_field(self, fieldname, delflag, plural_index, field): + if (fieldname + plural_index) in self._multiline_str: + field = self._multiline_str[fieldname + plural_index] + lines = [''] + field.split('__POLIB__NL__') + else: + lines = field.splitlines(True) + if len(lines) > 1: + lines = ['']+lines # start with initial empty line + else: + lines = [field] # needed for the empty string case + if fieldname.startswith('previous_'): + # quick and dirty trick to get the real field name + fieldname = fieldname[9:] + + ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, + escape(lines.pop(0)))] + for mstr in lines: + ret.append('%s"%s"' % (delflag, escape(mstr))) + return ret + +# }}} +# class POEntry {{{ + +class POEntry(_BaseEntry): + """ + Represents a po file entry. + + **Examples**: + + >>> entry = POEntry(msgid='Welcome', msgstr='Bienvenue') + >>> entry.occurrences = [('welcome.py', 12), ('anotherfile.py', 34)] + >>> print(entry) + #: welcome.py:12 anotherfile.py:34 + msgid "Welcome" + msgstr "Bienvenue" + <BLANKLINE> + >>> entry = POEntry() + >>> entry.occurrences = [('src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c', 32), ('src/eggs.c', 45)] + >>> entry.comment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...' + >>> entry.tcomment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...' + >>> entry.flags.append('c-format') + >>> entry.previous_msgctxt = '@somecontext' + >>> entry.previous_msgid = 'I had eggs but no spam !' 
+ >>> entry.previous_msgid_plural = 'I had eggs and %d spam !' + >>> entry.msgctxt = '@somenewcontext' + >>> entry.msgid = 'I have spam but no egg !' + >>> entry.msgid_plural = 'I have spam and %d eggs !' + >>> entry.msgstr_plural[0] = "J'ai du jambon mais aucun oeuf !" + >>> entry.msgstr_plural[1] = "J'ai du jambon et %d oeufs !" + >>> print(entry) + #. A plural translation. This is a very very very long line please do not + #. wrap, this is just for testing comment wrapping... + # A plural translation. This is a very very very long line please do not wrap, + # this is just for testing comment wrapping... + #: src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c:32 + #: src/eggs.c:45 + #, c-format + #| msgctxt "@somecontext" + #| msgid "I had eggs but no spam !" + #| msgid_plural "I had eggs and %d spam !" + msgctxt "@somenewcontext" + msgid "I have spam but no egg !" + msgid_plural "I have spam and %d eggs !" + msgstr[0] "J'ai du jambon mais aucun oeuf !" + msgstr[1] "J'ai du jambon et %d oeufs !" + <BLANKLINE> + """ + + def __init__(self, *args, **kwargs): + """POEntry constructor.""" + _BaseEntry.__init__(self, *args, **kwargs) + self.comment = kwargs.get('comment', '') + self.tcomment = kwargs.get('tcomment', '') + self.occurrences = kwargs.get('occurrences', []) + self.flags = kwargs.get('flags', []) + self.previous_msgctxt = kwargs.get('previous_msgctxt', None) + self.previous_msgid = kwargs.get('previous_msgid', None) + self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) + + def __str__(self, wrapwidth=78): + """ + Return the string representation of the entry. + """ + if self.obsolete: + return _BaseEntry.__str__(self) + ret = [] + # comment first, if any (with text wrapping as xgettext does) + if self.comment != '': + for comment in self.comment.split('\n'): + if wrapwidth > 0 and len(comment) > wrapwidth-3: + ret += textwrap.wrap(comment, wrapwidth, + initial_indent='#. ', + subsequent_indent='#. ', + break_long_words=False) + else: + ret.append('#. 
%s' % comment) + # translator comment, if any (with text wrapping as xgettext does) + if self.tcomment != '': + for tcomment in self.tcomment.split('\n'): + if wrapwidth > 0 and len(tcomment) > wrapwidth-2: + ret += textwrap.wrap(tcomment, wrapwidth, + initial_indent='# ', + subsequent_indent='# ', + break_long_words=False) + else: + ret.append('# %s' % tcomment) + # occurrences (with text wrapping as xgettext does) + if self.occurrences: + filelist = [] + for fpath, lineno in self.occurrences: + if lineno: + filelist.append('%s:%s' % (fpath, lineno)) + else: + filelist.append(fpath) + filestr = ' '.join(filelist) + if wrapwidth > 0 and len(filestr)+3 > wrapwidth: + # XXX textwrap split words that contain hyphen, this is not + # what we want for filenames, so the dirty hack is to + # temporally replace hyphens with a char that a file cannot + # contain, like "*" + lines = textwrap.wrap(filestr.replace('-', '*'), + wrapwidth, + initial_indent='#: ', + subsequent_indent='#: ', + break_long_words=False) + # end of the replace hack + for line in lines: + ret.append(line.replace('*', '-')) + else: + ret.append('#: '+filestr) + # flags + if self.flags: + flags = [] + for flag in self.flags: + flags.append(flag) + ret.append('#, %s' % ', '.join(flags)) + + # previous context and previous msgid/msgid_plural + if self.previous_msgctxt: + ret += self._str_field("previous_msgctxt", "#| ", "", + self.previous_msgctxt) + if self.previous_msgid: + ret += self._str_field("previous_msgid", "#| ", "", + self.previous_msgid) + if self.previous_msgid_plural: + ret += self._str_field("previous_msgid_plural", "#| ", "", + self.previous_msgid_plural) + + ret.append(_BaseEntry.__str__(self)) + return '\n'.join(ret) + + def __cmp__(self, other): + ''' + Called by comparison operations if rich comparison is not defined. + + **Tests**: + >>> a = POEntry(msgid='a', occurrences=[('b.py', 1), ('b.py', 3)]) + >>> b = POEntry(msgid='b', occurrences=[('b.py', 1), ('b.py', 3)]) + >>> c1 = POEntry(msgid='c1', occurrences=[('a.py', 1), ('b.py', 1)]) + >>> c2 = POEntry(msgid='c2', occurrences=[('a.py', 1), ('a.py', 3)]) + >>> po = POFile() + >>> po.append(a) + >>> po.append(b) + >>> po.append(c1) + >>> po.append(c2) + >>> po.sort() + >>> print(po) + # + msgid "" + msgstr "" + <BLANKLINE> + #: a.py:1 a.py:3 + msgid "c2" + msgstr "" + <BLANKLINE> + #: a.py:1 b.py:1 + msgid "c1" + msgstr "" + <BLANKLINE> + #: b.py:1 b.py:3 + msgid "a" + msgstr "" + <BLANKLINE> + #: b.py:1 b.py:3 + msgid "b" + msgstr "" + <BLANKLINE> + ''' + def compare_occurrences(a, b): + """ + Compare an entry occurrence with another one. + """ + if a[0] != b[0]: + return a[0] < b[0] + if a[1] != b[1]: + return a[1] < b[1] + return 0 + + # First: Obsolete test + if self.obsolete != other.obsolete: + if self.obsolete: + return -1 + else: + return 1 + # Work on a copy to protect original + occ1 = self.occurrences[:] + occ2 = other.occurrences[:] + # Sorting using compare method + occ1.sort(compare_occurrences) + occ2.sort(compare_occurrences) + # Comparing sorted occurrences + pos = 0 + for entry1 in occ1: + try: + entry2 = occ2[pos] + except IndexError: + return 1 + pos = pos + 1 + if entry1[0] != entry2[0]: + if entry1[0] > entry2[0]: + return 1 + else: + return -1 + if entry1[1] != entry2[1]: + if entry1[1] > entry2[1]: + return 1 + else: + return -1 + # Finally: Compare message ID + if self.msgid > other.msgid: return 1 + else: return -1 + + def translated(self): + """ + Return True if the entry has been translated or False. 
+ """ + if self.obsolete or 'fuzzy' in self.flags: + return False + if self.msgstr != '': + return True + if self.msgstr_plural: + for pos in self.msgstr_plural: + if self.msgstr_plural[pos] == '': + return False + return True + return False + + def merge(self, other): + """ + Merge the current entry with the given pot entry. + """ + self.msgid = other.msgid + self.occurrences = other.occurrences + self.comment = other.comment + self.flags = other.flags + self.msgid_plural = other.msgid_plural + if other.msgstr_plural: + for pos in other.msgstr_plural: + try: + # keep existing translation at pos if any + self.msgstr_plural[pos] + except KeyError: + self.msgstr_plural[pos] = '' + +# }}} +# class MOEntry {{{ + +class MOEntry(_BaseEntry): + """ + Represents a mo file entry. + + **Examples**: + + >>> entry = MOEntry() + >>> entry.msgid = 'translate me !' + >>> entry.msgstr = 'traduisez moi !' + >>> print(entry) + msgid "translate me !" + msgstr "traduisez moi !" + <BLANKLINE> + """ + + def __str__(self, wrapwidth=78): + """ + Return the string representation of the entry. + """ + return _BaseEntry.__str__(self, wrapwidth) + +# }}} +# class _POFileParser {{{ + +class _POFileParser(object): + """ + A finite state machine to parse efficiently and correctly po + file format. + """ + + def __init__(self, fpath, *args, **kwargs): + """ + Constructor. + + **Arguments**: + - *fpath*: string, path to the po file + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). + """ + enc = kwargs.get('encoding', default_encoding) + check_dup = kwargs.get('check_for_duplicates', False) + try: + self.fhandle = codecs.open(fpath, 'rU', enc) + except LookupError: + enc = default_encoding + self.fhandle = codecs.open(fpath, 'rU', enc) + self.instance = POFile( + fpath=fpath, + encoding=enc, + check_for_duplicates=check_dup + ) + self.transitions = {} + self.current_entry = POEntry() + self.current_state = 'ST' + self.current_token = None + # two memo flags used in handlers + self.msgstr_index = 0 + self.entry_obsolete = 0 + # Configure the state machine, by adding transitions. 
+ # Signification of symbols: + # * ST: Beginning of the file (start) + # * HE: Header + # * TC: a translation comment + # * GC: a generated comment + # * OC: a file/line occurence + # * FL: a flags line + # * CT: a message context + # * PC: a previous msgctxt + # * PM: a previous msgid + # * PP: a previous msgid_plural + # * MI: a msgid + # * MP: a msgid plural + # * MS: a msgstr + # * MX: a msgstr plural + # * MC: a msgid or msgstr continuation line + all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC', + 'MS', 'MP', 'MX', 'MI'] + + self.add('TC', ['ST', 'HE'], 'HE') + self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS', + 'MP', 'MX', 'MI'], 'TC') + self.add('GC', all, 'GC') + self.add('OC', all, 'OC') + self.add('FL', all, 'FL') + self.add('PC', all, 'PC') + self.add('PM', all, 'PM') + self.add('PP', all, 'PP') + self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM', + 'PP', 'MS', 'MX'], 'CT') + self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC', + 'PM', 'PP', 'MS', 'MX'], 'MI') + self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP') + self.add('MS', ['MI', 'MP', 'TC'], 'MS') + self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX') + self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC') + + def parse(self): + """ + Run the state machine, parse the file line by line and call process() + with the current matched symbol. + """ + i, lastlen = 1, 0 + for line in self.fhandle: + line = line.strip() + if line == '': + i = i+1 + continue + if line[:3] == '#~ ': + line = line[3:] + self.entry_obsolete = 1 + else: + self.entry_obsolete = 0 + self.current_token = line + if line[:2] == '#:': + # we are on a occurrences line + self.process('OC', i) + elif line[:9] == 'msgctxt "': + # we are on a msgctxt + self.process('CT', i) + elif line[:7] == 'msgid "': + # we are on a msgid + self.process('MI', i) + elif line[:8] == 'msgstr "': + # we are on a msgstr + self.process('MS', i) + elif line[:1] == '"' or line[:4] == '#| "': + # we are on a continuation line or some metadata + self.process('MC', i) + elif line[:14] == 'msgid_plural "': + # we are on a msgid plural + self.process('MP', i) + elif line[:7] == 'msgstr[': + # we are on a msgstr plural + self.process('MX', i) + elif line[:3] == '#, ': + # we are on a flags line + self.process('FL', i) + elif line[:2] == '# ' or line == '#': + if line == '#': line = line + ' ' + # we are on a translator comment line + self.process('TC', i) + elif line[:2] == '#.': + # we are on a generated comment line + self.process('GC', i) + elif line[:15] == '#| msgid_plural': + # we are on a previous msgid_plural + self.process('PP', i) + elif line[:8] == '#| msgid': + self.process('PM', i) + # we are on a previous msgid + elif line[:10] == '#| msgctxt': + # we are on a previous msgctxt + self.process('PC', i) + i = i+1 + + if self.current_entry: + # since entries are added when another entry is found, we must add + # the last entry here (only if there are lines) + self.instance.append(self.current_entry) + # before returning the instance, check if there's metadata and if + # so extract it in a dict + firstentry = self.instance[0] + if firstentry.msgid == '': # metadata found + # remove the entry + firstentry = self.instance.pop(0) + self.instance.metadata_is_fuzzy = firstentry.flags + key = None + for msg in firstentry.msgstr.splitlines(): + try: + key, val = msg.split(':', 1) + self.instance.metadata[key] = val.strip() + except: + if key is not None: + self.instance.metadata[key] += '\n'+ msg.strip() + # 
close opened file + self.fhandle.close() + return self.instance + + def add(self, symbol, states, next_state): + """ + Add a transition to the state machine. + Keywords arguments: + + symbol -- string, the matched token (two chars symbol) + states -- list, a list of states (two chars symbols) + next_state -- the next state the fsm will have after the action + """ + for state in states: + action = getattr(self, 'handle_%s' % next_state.lower()) + self.transitions[(symbol, state)] = (action, next_state) + + def process(self, symbol, linenum): + """ + Process the transition corresponding to the current state and the + symbol provided. + + Keywords arguments: + symbol -- string, the matched token (two chars symbol) + linenum -- integer, the current line number of the parsed file + """ + try: + (action, state) = self.transitions[(symbol, self.current_state)] + if action(): + self.current_state = state + except Exception, exc: + raise IOError('Syntax error in po file (line %s)' % linenum) + + # state handlers + + def handle_he(self): + """Handle a header comment.""" + if self.instance.header != '': + self.instance.header += '\n' + self.instance.header += self.current_token[2:] + return 1 + + def handle_tc(self): + """Handle a translator comment.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + if self.current_entry.tcomment != '': + self.current_entry.tcomment += '\n' + self.current_entry.tcomment += self.current_token[2:] + return True + + def handle_gc(self): + """Handle a generated comment.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + if self.current_entry.comment != '': + self.current_entry.comment += '\n' + self.current_entry.comment += self.current_token[3:] + return True + + def handle_oc(self): + """Handle a file:num occurence.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + occurrences = self.current_token[3:].split() + for occurrence in occurrences: + if occurrence != '': + try: + fil, line = occurrence.split(':') + if not line.isdigit(): + fil = fil + line + line = '' + self.current_entry.occurrences.append((fil, line)) + except: + self.current_entry.occurrences.append((occurrence, '')) + return True + + def handle_fl(self): + """Handle a flags line.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.flags += self.current_token[3:].split(', ') + return True + + def handle_pp(self): + """Handle a previous msgid_plural line.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.previous_msgid_plural = \ + unescape(self.current_token[17:-1]) + return True + + def handle_pm(self): + """Handle a previous msgid line.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.previous_msgid = \ + unescape(self.current_token[10:-1]) + return True + + def handle_pc(self): + """Handle a previous msgctxt line.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.previous_msgctxt = \ + unescape(self.current_token[12:-1]) + return True + + def handle_ct(self): + """Handle a msgctxt.""" + if 
self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.msgctxt = unescape(self.current_token[9:-1]) + return True + + def handle_mi(self): + """Handle a msgid.""" + if self.current_state in ['MC', 'MS', 'MX']: + self.instance.append(self.current_entry) + self.current_entry = POEntry() + self.current_entry.obsolete = self.entry_obsolete + self.current_entry.msgid = unescape(self.current_token[7:-1]) + return True + + def handle_mp(self): + """Handle a msgid plural.""" + self.current_entry.msgid_plural = unescape(self.current_token[14:-1]) + return True + + def handle_ms(self): + """Handle a msgstr.""" + self.current_entry.msgstr = unescape(self.current_token[8:-1]) + return True + + def handle_mx(self): + """Handle a msgstr plural.""" + index, value = self.current_token[7], self.current_token[11:-1] + self.current_entry.msgstr_plural[index] = unescape(value) + self.msgstr_index = index + return True + + def handle_mc(self): + """Handle a msgid or msgstr continuation line.""" + token = unescape(self.current_token[1:-1]) + if self.current_state == 'CT': + typ = 'msgctxt' + self.current_entry.msgctxt += token + elif self.current_state == 'MI': + typ = 'msgid' + self.current_entry.msgid += token + elif self.current_state == 'MP': + typ = 'msgid_plural' + self.current_entry.msgid_plural += token + elif self.current_state == 'MS': + typ = 'msgstr' + self.current_entry.msgstr += token + elif self.current_state == 'MX': + typ = 'msgstr[%s]' % self.msgstr_index + self.current_entry.msgstr_plural[self.msgstr_index] += token + elif self.current_state == 'PP': + typ = 'previous_msgid_plural' + token = token[3:] + self.current_entry.previous_msgid_plural += token + elif self.current_state == 'PM': + typ = 'previous_msgid' + token = token[3:] + self.current_entry.previous_msgid += token + elif self.current_state == 'PC': + typ = 'previous_msgctxt' + token = token[3:] + self.current_entry.previous_msgctxt += token + if typ not in self.current_entry._multiline_str: + self.current_entry._multiline_str[typ] = token + else: + self.current_entry._multiline_str[typ] += "__POLIB__NL__" + token + # don't change the current state + return False + +# }}} +# class _MOFileParser {{{ + +class _MOFileParser(object): + """ + A class to parse binary mo files. + """ + BIG_ENDIAN = 0xde120495 + LITTLE_ENDIAN = 0x950412de + + def __init__(self, fpath, *args, **kwargs): + """ + Constructor. + + **Arguments**: + - *fpath*: string, path to the po file + - *encoding*: string, the encoding to use, defaults to + "default_encoding" global variable (optional), + - *check_for_duplicates*: whether to check for duplicate entries + when adding entries to the file, default: False (optional). + """ + enc = kwargs.get('encoding', default_encoding) + check_dup = kwargs.get('check_for_duplicates', False) + self.fhandle = open(fpath, 'rb') + self.instance = MOFile( + fpath=fpath, + encoding=enc, + check_for_duplicates=check_dup + ) + + def parse_magicnumber(self): + """ + Parse the magic number and raise an exception if not valid. + """ + + def parse(self): + """ + Build the instance with the file handle provided in the + constructor. 
+ """ + magic_number = self._readbinary('<I', 4) + if magic_number == self.LITTLE_ENDIAN: + ii = '<II' + elif magic_number == self.BIG_ENDIAN: + ii = '>II' + else: + raise IOError('Invalid mo file, magic number is incorrect !') + self.instance.magic_number = magic_number + # parse the version number and the number of strings + self.instance.version, numofstrings = self._readbinary(ii, 8) + # original strings and translation strings hash table offset + msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) + # move to msgid hash table and read length and offset of msgids + self.fhandle.seek(msgids_hash_offset) + msgids_index = [] + for i in range(numofstrings): + msgids_index.append(self._readbinary(ii, 8)) + # move to msgstr hash table and read length and offset of msgstrs + self.fhandle.seek(msgstrs_hash_offset) + msgstrs_index = [] + for i in range(numofstrings): + msgstrs_index.append(self._readbinary(ii, 8)) + # build entries + for i in range(numofstrings): + self.fhandle.seek(msgids_index[i][1]) + msgid = self.fhandle.read(msgids_index[i][0]) + self.fhandle.seek(msgstrs_index[i][1]) + msgstr = self.fhandle.read(msgstrs_index[i][0]) + if i == 0: # metadata + raw_metadata, metadata = msgstr.split('\n'), {} + for line in raw_metadata: + tokens = line.split(':', 1) + if tokens[0] != '': + try: + metadata[tokens[0]] = tokens[1].strip() + except IndexError: + metadata[tokens[0]] = '' + self.instance.metadata = metadata + continue + # test if we have a plural entry + msgid_tokens = msgid.split('\0') + if len(msgid_tokens) > 1: + entry = MOEntry( + msgid=msgid_tokens[0], + msgid_plural=msgid_tokens[1], + msgstr_plural=dict((k,v) for k,v in \ + enumerate(msgstr.split('\0'))) + ) + else: + entry = MOEntry(msgid=msgid, msgstr=msgstr) + self.instance.append(entry) + # close opened file + self.fhandle.close() + return self.instance + + def _readbinary(self, fmt, numbytes): + """ + Private method that unpack n bytes of data using format <fmt>. + It returns a tuple or a mixed value if the tuple length is 1. + """ + bytes = self.fhandle.read(numbytes) + tup = struct.unpack(fmt, bytes) + if len(tup) == 1: + return tup[0] + return tup + +# }}} +# __main__ {{{ + +if __name__ == '__main__': + """ + **Main function**:: + - to **test** the module just run: *python polib.py [-v]* + - to **profile** the module: *python polib.py -p <some_pofile.po>* + """ + import sys + if len(sys.argv) > 2 and sys.argv[1] == '-p': + def test(f): + if f.endswith('po'): + p = pofile(f) + else: + p = mofile(f) + s = unicode(p) + import profile + profile.run('test("'+sys.argv[2]+'")') + else: + import doctest + doctest.testmod() + +# }}}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/i18n/posplit	Sat Jun 19 17:06:11 2010 +0200
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+#
+# posplit - split messages in paragraphs on .po/.pot files
+#
+# license: MIT/X11/Expat
+#
+
+import sys
+import polib
+
+def addentry(po, entry, cache):
+    e = cache.get(entry.msgid)
+    if e:
+        e.occurrences.extend(entry.occurrences)
+    else:
+        po.append(entry)
+        cache[entry.msgid] = entry
+
+def mkentry(orig, delta, msgid, msgstr):
+    entry = polib.POEntry()
+    entry.merge(orig)
+    entry.msgid = msgid or orig.msgid
+    entry.msgstr = msgstr or orig.msgstr
+    entry.occurrences = [(p, int(l) + delta) for (p, l) in orig.occurrences]
+    return entry
+
+if __name__ == "__main__":
+    po = polib.pofile(sys.argv[1])
+
+    cache = {}
+    entries = po[:]
+    po[:] = []
+    for entry in entries:
+        msgids = entry.msgid.split(u'\n\n')
+        if entry.msgstr:
+            msgstrs = entry.msgstr.split(u'\n\n')
+        else:
+            msgstrs = [u''] * len(msgids)
+
+        if len(msgids) != len(msgstrs):
+            # places the whole existing translation as a fuzzy
+            # translation for each paragraph, to give the
+            # translator a chance to recover part of the old
+            # translation - erasing extra paragraphs is
+            # probably better than retranslating all from start
+            if 'fuzzy' not in entry.flags:
+                entry.flags.append('fuzzy')
+            msgstrs = [entry.msgstr] * len(msgids)
+
+        delta = 0
+        for msgid, msgstr in zip(msgids, msgstrs):
+            if msgid:
+                newentry = mkentry(entry, delta, msgid, msgstr)
+                addentry(po, newentry, cache)
+            delta += 2 + msgid.count('\n')
+    po.save()
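For reference, here is a minimal sketch of the paragraph-splitting idea that posplit implements, written against the polib API bundled by this changeset (POEntry, POFile, msgid/msgstr, occurrences). The sample entry, its French strings and the file name are hypothetical, and the sketch only mirrors the splitting step; it skips posplit's fuzzy-flag recovery and the line-number delta applied to occurrences.

    import polib

    # A hypothetical two-paragraph entry such as xgettext might extract
    # from a Mercurial docstring.
    entry = polib.POEntry(
        msgid=u'First paragraph.\n\nSecond paragraph.',
        msgstr=u'Premier paragraphe.\n\nDeuxieme paragraphe.',
    )
    entry.occurrences = [('mercurial/commands.py', '42')]

    po = polib.POFile()
    for msgid, msgstr in zip(entry.msgid.split(u'\n\n'),
                             entry.msgstr.split(u'\n\n')):
        # Each paragraph becomes its own entry, so translators work on
        # smaller units and unchanged paragraphs keep their translations.
        po.append(polib.POEntry(msgid=msgid, msgstr=msgstr,
                                occurrences=entry.occurrences))

    print(po)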
--- a/mercurial/commands.py	Fri Jun 18 15:22:56 2010 +0200
+++ b/mercurial/commands.py	Sat Jun 19 17:06:11 2010 +0200
@@ -2451,7 +2451,8 @@
     If no revision range is specified, the default is tip:0 unless
     --follow is set, in which case the working directory parent is
-    used as the starting revision.
+    used as the starting revision. You can specify a revision set for
+    log, see :hg:`help revsets` for more information.
 
     See :hg:`help dates` for a list of formats valid for -d/--date.
--- a/mercurial/help.py	Fri Jun 18 15:22:56 2010 +0200
+++ b/mercurial/help.py	Sat Jun 19 17:06:11 2010 +0200
@@ -92,6 +92,7 @@
      loaddoc('revisions')),
     (['mrevs', 'multirevs'], _('Specifying Multiple Revisions'),
      loaddoc('multirevs')),
+    (['revsets'], _("Specifying Revision Sets"), loaddoc('revsets')),
     (['diffs'], _('Diff Formats'), loaddoc('diffs')),
     (['templating', 'templates'], _('Template Usage'),
      loaddoc('templates')),
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mercurial/help/revsets.txt	Sat Jun 19 17:06:11 2010 +0200
@@ -0,0 +1,166 @@
+Mercurial supports a functional language for selecting a set of
+revisions.
+
+The language supports a number of predicates which are joined by infix
+operators. Parentheses can be used for grouping.
+
+Identifiers such as branch names must be quoted with single or double
+quotes if they contain characters outside of ``[a-zA-Z0-9]`` or if
+they match one of the predefined predicates. Special characters can be
+used in the identifiers by quoting them, e.g., ``\n`` is interpreted
+as a newline.
+
+There is a single prefix operator:
+
+``not x``
+  Changesets not in x. Short form is ``! x``.
+
+These are the supported infix operators:
+
+``x::y``
+  A DAG range, meaning all changesets that are descendants of x and
+  ancestors of y, including x and y themselves. If the first endpoint
+  is left out, this is equivalent to ``ancestors(y)``; if the second
+  is left out it is equivalent to ``descendants(x)``.
+
+  An alternative syntax is ``x..y``.
+
+``x:y``
+  All changesets with revision numbers between x and y, both
+  inclusive. Either endpoint can be left out, they default to 0 and
+  tip.
+
+``x and y``
+  The intersection of changesets in x and y. Short form is ``x & y``.
+
+``x or y``
+  The union of changesets in x and y. There are two alternative short
+  forms: ``x | y`` and ``x + y``.
+
+``x - y``
+  Changesets in x but not in y.
+
+The following predicates are supported:
+
+``adds(pattern)``
+  Changesets that add a file matching pattern.
+
+``all()``
+  All changesets, the same as ``0:tip``.
+
+``ancestor(single, single)``
+  Greatest common ancestor of the two changesets.
+
+``ancestors(set)``
+  Changesets that are ancestors of a changeset in set.
+
+``author(string)``
+  Alias for ``user(string)``.
+
+``branch(set)``
+  The branch names are found for changesets in set, and the result is
+  all changesets belonging to one of those branches.
+
+``children(set)``
+  Child changesets of changesets in set.
+
+``closed()``
+  Changeset is closed.
+
+``contains(pattern)``
+  Revision contains pattern.
+
+``date(interval)``
+  Changesets within the interval, see :hg:`help dates`.
+
+``descendants(set)``
+  Changesets which are descendants of changesets in set.
+
+``file(pattern)``
+  Changesets which manually affected files matching pattern.
+
+``follow()``
+  An alias for ``::.`` (ancestors of the working copy's first parent).
+
+``grep(regex)``
+  Like ``keyword(string)`` but accepts a regex.
+
+``head()``
+  Changeset is a head.
+
+``heads(set)``
+  Members of set with no children in set.
+
+``keyword(string)``
+  Search commit message, user name, and names of changed files for
+  string.
+
+``limit(set, n)``
+  First n members of set.
+
+``max(set)``
+  Changeset with highest revision number in set.
+
+``merge()``
+  Changeset is a merge changeset.
+
+``modifies(pattern)``
+  Changesets which modify files matching pattern.
+
+``outgoing([path])``
+  Changesets missing in path.
+
+``p1(set)``
+  First parent of changesets in set.
+
+``p2(set)``
+  Second parent of changesets in set.
+
+``parents(set)``
+  The set of all parents for all changesets in set.
+
+``removes(pattern)``
+  Changesets which remove files matching pattern.
+
+``reverse(set)``
+  Reverse order of set.
+
+``roots(set)``
+  Changesets with no parent changeset in set.
+
+``sort(set[, [-]key...])``
+  Sort set by keys. The default sort order is ascending, specify a key
+  as ``-key`` to sort in descending order.
+
+  The keys can be:
+
+  - ``rev`` for the revision number,
+  - ``branch`` for the branch name,
+  - ``desc`` for the commit message (description),
+  - ``user`` for user name (``author`` can be used as an alias),
+  - ``date`` for the commit date
+
+``tagged()``
+  Changeset is tagged.
+
+``user(string)``
+  User name is string.
+
+Command line equivalents for :hg:`log`::
+
+  -f    ->  ::.
+  -d x  ->  date(x)
+  -k x  ->  keyword(x)
+  -m    ->  merge()
+  -u x  ->  user(x)
+  -b x  ->  branch(x)
+  -P x  ->  !::x
+  -l x  ->  limit(expr, x)
+
+Some sample queries::
+
+  hg log -r 'branch(default)'
+  hg log -r 'branch(default) and 1.5:: and not merge()'
+  hg log -r '1.3::1.5 and keyword(bug) and file("hgext/*")'
+  hg log -r 'sort(date("May 2008"), user)'
+  hg log -r '(keyword(bug) or keyword(issue)) and not ancestors(tagged())'
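As a rough illustration of the operator semantics described in the help text above (plain Python sets standing in for revision sets, with made-up revision numbers rather than Mercurial's actual evaluator), ``and``, ``or``, ``-`` and ``not`` behave like set intersection, union, difference and complement:

    # Hypothetical revision sets; in Mercurial these would come from
    # predicates such as branch(default) or keyword(bug).
    all_revs = set(range(10))          # all()  -> 0:tip
    x = {1, 2, 3, 5, 8}                # e.g. branch(default)
    y = {2, 3, 4, 9}                   # e.g. keyword(bug)

    print(sorted(x & y))               # x and y -> intersection: [2, 3]
    print(sorted(x | y))               # x or y  -> union
    print(sorted(x - y))               # x - y   -> difference: [1, 5, 8]
    print(sorted(all_revs - x))        # not x   -> complement within all()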
--- a/mercurial/i18n.py Fri Jun 18 15:22:56 2010 +0200 +++ b/mercurial/i18n.py Sat Jun 19 17:06:11 2010 +0200 @@ -36,7 +36,7 @@ if message is None: return message - u = t.ugettext(message) + u = u'\n\n'.join([t.ugettext(m) for m in message.split('\n\n')]) try: # encoding.tolocal cannot be used since it will first try to # decode the Unicode string. Calling u.decode(enc) really
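The hunk above changes the translation lookup from one call per help string to one call per paragraph, presumably so that a long help text whose paragraphs are catalogued separately can still be translated piecewise. A minimal sketch of that lookup, using ``gettext.NullTranslations`` as a stand-in for the catalog Mercurial actually loads::

  import gettext

  # Stand-in catalog; the real code uses the translation object loaded
  # from Mercurial's locale directory.
  t = gettext.NullTranslations()

  def gettext_by_paragraph(message):
      if message is None:
          return message
      # Look up each paragraph on its own; a paragraph without a
      # translation falls back to its original text instead of forcing
      # the entire help text back to English.
      return u'\n\n'.join([t.ugettext(p) for p in message.split('\n\n')])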
--- a/mercurial/revset.py Fri Jun 18 15:22:56 2010 +0200 +++ b/mercurial/revset.py Sat Jun 19 17:06:11 2010 +0200 @@ -8,6 +8,7 @@ import re import parser, util, error, discovery import match as _match +from i18n import _ elements = { "(": (20, ("group", 1, ")"), ("func", 1, ")")), @@ -60,7 +61,7 @@ break pos += 1 else: - raise error.ParseError("unterminated string", s) + raise error.ParseError(_("unterminated string"), s) elif c.isalnum() or c in '.': # gather up a symbol/keyword s = pos pos += 1 @@ -79,7 +80,7 @@ yield ('symbol', sym, s) pos -= 1 else: - raise error.ParseError("syntax error", pos) + raise error.ParseError(_("syntax error"), pos) pos += 1 yield ('end', None, pos) @@ -105,14 +106,14 @@ def getset(repo, subset, x): if not x: - raise error.ParseError("missing argument") + raise error.ParseError(_("missing argument")) return methods[x[0]](repo, subset, *x[1:]) # operator methods def negate(repo, subset, x): return getset(repo, subset, - ('string', '-' + getstring(x, "can't negate that"))) + ('string', '-' + getstring(x, _("can't negate that")))) def stringset(repo, subset, x): x = repo[x].rev() @@ -124,7 +125,7 @@ def symbolset(repo, subset, x): if x in symbols: - raise error.ParseError("can't use %s here" % x) + raise error.ParseError(_("can't use %s here") % x) return stringset(repo, subset, x) def rangeset(repo, subset, x, y): @@ -147,12 +148,12 @@ return [r for r in subset if r not in s] def listset(repo, subset, a, b): - raise error.ParseError("can't use a list in this context") + raise error.ParseError(_("can't use a list in this context")) def func(repo, subset, a, b): if a[0] == 'symbol' and a[1] in symbols: return symbols[a[1]](repo, subset, b) - raise error.ParseError("not a function: %s" % a[1]) + raise error.ParseError(_("not a function: %s") % a[1]) # functions @@ -186,11 +187,11 @@ return [] def limit(repo, subset, x): - l = getargs(x, 2, 2, "limit wants two args") + l = getargs(x, 2, 2, _("limit wants two arguments")) try: - lim = int(getstring(l[1], "limit wants a number")) + lim = int(getstring(l[1], _("limit wants a number"))) except ValueError: - raise error.ParseError("limit expects a number") + raise error.ParseError(_("limit expects a number")) return getset(repo, subset, l[0])[:lim] def children(repo, subset, x): @@ -212,11 +213,11 @@ return [r for r in subset if r in s or repo[r].branch() in b] def ancestor(repo, subset, x): - l = getargs(x, 2, 2, "ancestor wants two args") + l = getargs(x, 2, 2, _("ancestor wants two arguments")) a = getset(repo, subset, l[0]) b = getset(repo, subset, l[1]) if len(a) > 1 or len(b) > 1: - raise error.ParseError("ancestor args must be single revisions") + raise error.ParseError(_("ancestor arguments must be single revisions")) return [repo[a[0]].ancestor(repo[b[0]]).rev()] def ancestors(repo, subset, x): @@ -230,18 +231,18 @@ return [r for r in subset if r in s] def follow(repo, subset, x): - getargs(x, 0, 0, "follow takes no arguments") + getargs(x, 0, 0, _("follow takes no arguments")) p = repo['.'].rev() s = set(repo.changelog.ancestors(p)) | set([p]) return [r for r in subset if r in s] def date(repo, subset, x): - ds = getstring(x, 'date wants a string') + ds = getstring(x, _("date wants a string")) dm = util.matchdate(ds) return [r for r in subset if dm(repo[r].date()[0])] def keyword(repo, subset, x): - kw = getstring(x, "keyword wants a string").lower() + kw = getstring(x, _("keyword wants a string")).lower() l = [] for r in subset: c = repo[r] @@ -251,7 +252,7 @@ return l def grep(repo, subset, x): - gr = 
re.compile(getstring(x, "grep wants a string")) + gr = re.compile(getstring(x, _("grep wants a string"))) l = [] for r in subset: c = repo[r] @@ -262,11 +263,11 @@ return l def author(repo, subset, x): - n = getstring(x, "author wants a string").lower() + n = getstring(x, _("author wants a string")).lower() return [r for r in subset if n in repo[r].user().lower()] def hasfile(repo, subset, x): - pat = getstring(x, "file wants a pattern") + pat = getstring(x, _("file wants a pattern")) m = _match.match(repo.root, repo.getcwd(), [pat]) s = [] for r in subset: @@ -277,7 +278,7 @@ return s def contains(repo, subset, x): - pat = getstring(x, "file wants a pattern") + pat = getstring(x, _("file wants a pattern")) m = _match.match(repo.root, repo.getcwd(), [pat]) s = [] if m.files() == [pat]: @@ -321,28 +322,28 @@ return s def modifies(repo, subset, x): - pat = getstring(x, "modifies wants a pattern") + pat = getstring(x, _("modifies wants a pattern")) return checkstatus(repo, subset, pat, 0) def adds(repo, subset, x): - pat = getstring(x, "adds wants a pattern") + pat = getstring(x, _("adds wants a pattern")) return checkstatus(repo, subset, pat, 1) def removes(repo, subset, x): - pat = getstring(x, "removes wants a pattern") + pat = getstring(x, _("removes wants a pattern")) return checkstatus(repo, subset, pat, 2) def merge(repo, subset, x): - getargs(x, 0, 0, "merge takes no arguments") + getargs(x, 0, 0, _("merge takes no arguments")) cl = repo.changelog return [r for r in subset if cl.parentrevs(r)[1] != -1] def closed(repo, subset, x): - getargs(x, 0, 0, "closed takes no arguments") + getargs(x, 0, 0, _("closed takes no arguments")) return [r for r in subset if repo[r].extra().get('close')] def head(repo, subset, x): - getargs(x, 0, 0, "head takes no arguments") + getargs(x, 0, 0, _("head takes no arguments")) hs = set() for b, ls in repo.branchmap().iteritems(): hs.update(repo[h].rev() for h in ls) @@ -354,10 +355,10 @@ return l def sort(repo, subset, x): - l = getargs(x, 1, 2, "sort wants one or two arguments") + l = getargs(x, 1, 2, _("sort wants one or two arguments")) keys = "rev" if len(l) == 2: - keys = getstring(l[1], "sort spec must be a string") + keys = getstring(l[1], _("sort spec must be a string")) s = l[0] keys = keys.split() @@ -389,14 +390,14 @@ elif k == '-date': e.append(-c.date()[0]) else: - raise error.ParseError("unknown sort key %r" % k) + raise error.ParseError(_("unknown sort key %r") % k) e.append(r) l.append(e) l.sort() return [e[-1] for e in l] def getall(repo, subset, x): - getargs(x, 0, 0, "all takes no arguments") + getargs(x, 0, 0, _("all takes no arguments")) return subset def heads(repo, subset, x): @@ -411,7 +412,7 @@ def outgoing(repo, subset, x): import hg # avoid start-up nasties - l = getargs(x, 0, 1, "outgoing wants a repo path") + l = getargs(x, 0, 1, _("outgoing wants a repository path")) dest = l[1:] or '' dest = repo.ui.expandpath(dest or 'default-push', dest or 'default') dest, branches = hg.parseurl(dest) @@ -425,7 +426,7 @@ return [r for r in subset if r in o] def tagged(repo, subset, x): - getargs(x, 0, 0, "tagged takes no arguments") + getargs(x, 0, 0, _("tagged takes no arguments")) cl = repo.changelog s = set([cl.rev(n) for t, n in repo.tagslist() if t != 'tip']) return [r for r in subset if r in s] @@ -523,7 +524,7 @@ wb, tb = optimize(x[2], small) return wa + wb, (op, ta, tb) elif op == 'func': - f = getstring(x[1], "not a symbol") + f = getstring(x[1], _("not a symbol")) wa, ta = optimize(x[2], small) if f in "grep date user author 
keyword branch file": w = 10 # slow @@ -545,6 +546,8 @@ parse = parser.parser(tokenize, elements).parse def match(spec): + if not spec: + raise error.ParseError(_("empty query")) tree = parse(spec) weight, tree = optimize(tree, True) def mfunc(repo, subset):
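Beyond marking the parser's error messages for translation, the final hunk makes ``match()`` reject an empty query up front. A sketch of how the resulting ``mfunc`` might be driven from Python (not part of the changeset; it assumes the module layout of this series and a Python 2 environment)::

  from mercurial import error, hg, revset, ui

  repo = hg.repository(ui.ui(), '.')   # any local repository
  try:
      mfunc = revset.match("keyword(bug) and not merge()")
  except error.ParseError, inst:
      # raised for syntax errors and, with this change, for empty queries
      raise SystemExit("bad query: %s" % inst)

  # mfunc narrows a subset of revision numbers down to the matches.
  revs = mfunc(repo, range(len(repo)))
  for r in revs:
      print r, repo[r].description().split('\n')[0]

Every predicate in the hunks above follows the same shape: it receives the candidate ``subset`` and returns only the revisions that survive its filter.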
--- a/tests/test-globalopts.out Fri Jun 18 15:22:56 2010 +0200 +++ b/tests/test-globalopts.out Sat Jun 19 17:06:11 2010 +0200 @@ -208,6 +208,7 @@ environment Environment Variables revisions Specifying Single Revisions multirevs Specifying Multiple Revisions + revsets Specifying Revision Sets diffs Diff Formats templating Template Usage urls URL Paths @@ -279,6 +280,7 @@ environment Environment Variables revisions Specifying Single Revisions multirevs Specifying Multiple Revisions + revsets Specifying Revision Sets diffs Diff Formats templating Template Usage urls URL Paths
--- a/tests/test-help.out Fri Jun 18 15:22:56 2010 +0200 +++ b/tests/test-help.out Sat Jun 19 17:06:11 2010 +0200 @@ -101,6 +101,7 @@ environment Environment Variables revisions Specifying Single Revisions multirevs Specifying Multiple Revisions + revsets Specifying Revision Sets diffs Diff Formats templating Template Usage urls URL Paths @@ -168,6 +169,7 @@ environment Environment Variables revisions Specifying Single Revisions multirevs Specifying Multiple Revisions + revsets Specifying Revision Sets diffs Diff Formats templating Template Usage urls URL Paths @@ -619,6 +621,7 @@ environment Environment Variables revisions Specifying Single Revisions multirevs Specifying Multiple Revisions + revsets Specifying Revision Sets diffs Diff Formats templating Template Usage urls URL Paths