--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/i18n/polib.py Thu Jun 17 20:10:34 2010 -0300
@@ -0,0 +1,1676 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# License: MIT (see LICENSE file provided)
+# vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
+
+"""
+**polib** allows you to manipulate, create, modify gettext files (pot, po
+and mo files). You can load existing files, iterate through their entries,
+add, modify entries, comments or metadata, etc... or create new po files
+from scratch.
+
+**polib** provides a simple and pythonic API, exporting only three
+convenience functions (*pofile*, *mofile* and *detect_encoding*), and the
+four core classes, *POFile*, *MOFile*, *POEntry* and *MOEntry* for creating
+new files/entries.
+
+**Basic example**:
+
+>>> import polib
+>>> # load an existing po file
+>>> po = polib.pofile('tests/test_utf8.po')
+>>> for entry in po:
+... # do something with entry...
+... pass
+>>> # add an entry
+>>> entry = polib.POEntry(msgid='Welcome', msgstr='Bienvenue')
+>>> entry.occurrences = [('welcome.py', '12'), ('anotherfile.py', '34')]
+>>> po.append(entry)
+>>> # to save our modified po file:
+>>> # po.save()
+>>> # or you may want to compile the po file
+>>> # po.save_as_mofile('tests/test_utf8.mo')
+"""
+
+__author__ = 'David JEAN LOUIS <izimobil@gmail.com>'
+__version__ = '0.5.2'
+__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
+           'detect_encoding', 'escape', 'unescape']  # the public API
+
+import codecs
+import struct
+import textwrap
+import types
+
+default_encoding = 'utf-8'  # used when no encoding is given or detected
+
+# function pofile() {{{
+
+def pofile(fpath, **kwargs):
+    """
+    Convenience function that parses the po/pot file *fpath* and returns
+    a POFile instance.
+
+    **Keyword arguments**:
+      - *fpath*: string, full or relative path to the po/pot file to parse
+      - *wrapwidth*: integer, the wrap width, only useful when -w option was
+        passed to xgettext (optional, default to 78)
+      - *autodetect_encoding*: boolean, if set to False the function will
+        not try to detect the po file encoding (optional, default to True)
+      - *encoding*: string, an encoding, only relevant if autodetect_encoding
+        is set to False (optional, default to ``default_encoding``)
+      - *check_for_duplicates*: whether to check for duplicate entries when
+        adding entries to the file, default: False (optional)
+
+    **Example**:
+
+    >>> import polib
+    >>> po = polib.pofile('tests/test_weird_occurrences.po',
+    ...     check_for_duplicates=True)
+    >>> po #doctest: +ELLIPSIS
+    <POFile instance at ...>
+    >>> import os, tempfile
+    >>> all_attrs = ('msgctxt', 'msgid', 'msgstr', 'msgid_plural',
+    ...              'msgstr_plural', 'obsolete', 'comment', 'tcomment',
+    ...              'occurrences', 'flags', 'previous_msgctxt',
+    ...              'previous_msgid', 'previous_msgid_plural')
+    >>> for fname in ['test_iso-8859-15.po', 'test_utf8.po']:
+    ...     orig_po = polib.pofile('tests/'+fname)
+    ...     tmpf = tempfile.NamedTemporaryFile().name
+    ...     orig_po.save(tmpf)
+    ...     try:
+    ...         new_po = polib.pofile(tmpf)
+    ...         for old, new in zip(orig_po, new_po):
+    ...             for attr in all_attrs:
+    ...                 if getattr(old, attr) != getattr(new, attr):
+    ...                     getattr(old, attr)
+    ...                     getattr(new, attr)
+    ...     finally:
+    ...         os.unlink(tmpf)
+    >>> po_file = polib.pofile('tests/test_save_as_mofile.po')
+    >>> tmpf = tempfile.NamedTemporaryFile().name
+    >>> po_file.save_as_mofile(tmpf)
+    >>> try:
+    ...     mo_file = polib.mofile(tmpf)
+    ...     for old, new in zip(po_file, mo_file):
+    ...         if po_file._encode(old.msgid) != mo_file._encode(new.msgid):
+    ...             'OLD: ', po_file._encode(old.msgid)
+    ...             'NEW: ', mo_file._encode(new.msgid)
+    ...         if po_file._encode(old.msgstr) != mo_file._encode(new.msgstr):
+    ...             'OLD: ', po_file._encode(old.msgstr)
+    ...             'NEW: ', mo_file._encode(new.msgstr)
+    ...             print new.msgstr
+    ... finally:
+    ...     os.unlink(tmpf)
+    """
+    if kwargs.get('autodetect_encoding', True):
+        enc = detect_encoding(fpath)
+    else:
+        enc = kwargs.get('encoding', default_encoding)
+    parser = _POFileParser(
+        fpath,
+        encoding=enc,
+        check_for_duplicates=kwargs.get('check_for_duplicates', False)
+    )
+    instance = parser.parse()
+    # wrapwidth is not a parser option: it is set on the parsed instance
+    instance.wrapwidth = kwargs.get('wrapwidth', 78)
+    return instance
+
+# }}}
+# function mofile() {{{
+
+def mofile(fpath, **kwargs):
+    """
+    Convenience function that parses the mo file *fpath* and returns
+    a MOFile instance.
+
+    **Keyword arguments**:
+      - *fpath*: string, full or relative path to the mo file to parse
+      - *wrapwidth*: integer, the wrap width, only useful when -w option was
+        passed to xgettext to generate the po file that was used to format
+        the mo file (optional, default to 78)
+      - *autodetect_encoding*: boolean, if set to False the function will
+        not try to detect the mo file encoding (optional, default to True)
+      - *encoding*: string, an encoding, only relevant if autodetect_encoding
+        is set to False (optional, default to ``default_encoding``)
+      - *check_for_duplicates*: whether to check for duplicate entries when
+        adding entries to the file, default: False (optional)
+
+    **Example**:
+
+    >>> import polib
+    >>> mo = polib.mofile('tests/test_utf8.mo', check_for_duplicates=True)
+    >>> mo #doctest: +ELLIPSIS
+    <MOFile instance at ...>
+    >>> import os, tempfile
+    >>> for fname in ['test_iso-8859-15.mo', 'test_utf8.mo']:
+    ...     orig_mo = polib.mofile('tests/'+fname)
+    ...     tmpf = tempfile.NamedTemporaryFile().name
+    ...     orig_mo.save(tmpf)
+    ...     try:
+    ...         new_mo = polib.mofile(tmpf)
+    ...         for old, new in zip(orig_mo, new_mo):
+    ...             if old.msgid != new.msgid:
+    ...                 old.msgstr
+    ...                 new.msgstr
+    ...     finally:
+    ...         os.unlink(tmpf)
+    """
+    if kwargs.get('autodetect_encoding', True):
+        # mo files are binary: read them as such when looking for the charset
+        enc = detect_encoding(fpath, True)
+    else:
+        enc = kwargs.get('encoding', default_encoding)
+    parser = _MOFileParser(fpath, encoding=enc,
+        check_for_duplicates=kwargs.get('check_for_duplicates', False)
+    )
+    instance = parser.parse()
+    instance.wrapwidth = kwargs.get('wrapwidth', 78)
+    return instance
+
+# }}}
+# function detect_encoding() {{{
+
+def detect_encoding(fpath, binary_mode=False):
+    """
+    Return the Content-Type charset found in *fpath*, or ``default_encoding``.
+
+    **Keyword arguments**:
+      - *fpath*: string, full or relative path to the po or mo file.
+      - *binary_mode*: boolean, open the file in binary mode (default False).
+
+    **Examples**:
+
+    >>> print(detect_encoding('tests/test_noencoding.po'))
+    utf-8
+    >>> print(detect_encoding('tests/test_utf8.po'))
+    UTF-8
+    >>> print(detect_encoding('tests/test_utf8.mo', True))
+    UTF-8
+    >>> print(detect_encoding('tests/test_iso-8859-15.po'))
+    ISO_8859-15
+    >>> print(detect_encoding('tests/test_iso-8859-15.mo', True))
+    ISO_8859-15
+    """
+    import re
+    rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)')
+    mode = 'r'
+    if binary_mode:
+        mode = 'rb'
+    f = open(fpath, mode)
+    try:
+        for line in f:
+            match = rx.search(line)
+            if match:
+                return match.group(1).strip()
+    finally:
+        f.close()
+    return default_encoding
+
+# }}}
+# function escape() {{{
+
+def escape(st):
+    """
+    Return *st* with backslash, tab, CR, newline and double quote escaped.
+
+    **Examples**:
+
+    >>> escape('\\t and \\n and \\r and " and \\\\')
+    '\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\'
+    """
+    pairs = (('\\', r'\\'), ('\t', r'\t'), ('\r', r'\r'), ('\n', r'\n'),
+             ('"', r'\"'))
+    for char, escaped in pairs:
+        st = st.replace(char, escaped)
+    return st
+
+# }}}
+# function unescape() {{{
+
+def unescape(st):
+    """
+    Unescape special chars and return the given string *st*.
+
+    **Examples**:
+
+    >>> unescape('\\\\t and \\\\n and \\\\r and \\\\" and \\\\\\\\')
+    '\\t and \\n and \\r and " and \\\\'
+    >>> unescape(r'\\n')
+    '\\n'
+    >>> unescape(r'\\\\n')
+    '\\\\n'
+    """
+    import re
+    specials = {'n': '\n', 'r': '\r', 't': '\t'}
+    def _unescape_char(match):
+        char = match.group(1)
+        # an escaped backslash or double quote stands for itself
+        return specials.get(char, char)
+    # a single left-to-right pass: each backslash consumes exactly the
+    # character that follows it, so an escaped backslash followed by "n"
+    # is never mistaken for an escaped newline, whatever else the string
+    # contains
+    return re.sub(r'\\(\\|n|r|t|")', _unescape_char, st)
+
+# }}}
+# class _BaseFile {{{
+
+class _BaseFile(list):
+    """
+    Common parent class for POFile and MOFile classes.
+    This class must **not** be instantiated directly.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """
+        Constructor.
+
+        **Keyword arguments**:
+          - *fpath*: string, path to po or mo file
+          - *wrapwidth*: integer, the wrap width, only useful when -w option
+            was passed to xgettext to generate the po file that was used to
+            format the mo file, default to 78 (optional),
+          - *encoding*: string, the encoding to use, defaults to
+            "default_encoding" global variable (optional),
+          - *check_for_duplicates*: whether to check for duplicate entries
+            when adding entries to the file, default: False (optional).
+        """
+        list.__init__(self)
+        # the path of the file, save() writes to it when given no other path
+        self.fpath = kwargs.get('fpath')
+        # the width at which lines should be wrapped
+        self.wrapwidth = kwargs.get('wrapwidth', 78)
+        # the file encoding
+        self.encoding = kwargs.get('encoding', default_encoding)
+        # whether to check for duplicate entries or not
+        self.check_for_duplicates = kwargs.get('check_for_duplicates', False)
+        # header
+        self.header = ''
+        # both po and mo files have metadata
+        self.metadata = {}
+        self.metadata_is_fuzzy = 0
+
+    def __str__(self):
+        """
+        String representation of the file.
+        """
+        ret = []
+        entries = [self.metadata_as_entry()] + \
+                  [e for e in self if not e.obsolete]
+        for entry in entries:
+            ret.append(entry.__str__(self.wrapwidth))
+        for entry in self.obsolete_entries():
+            ret.append(entry.__str__(self.wrapwidth))
+        return '\n'.join(ret)
+
+    def __contains__(self, entry):
+        """
+        Overridden method to implement the membership test (in and not in).
+        The method considers that an entry is in the file if it finds an
+        entry that has the same msgid (case sensitive).
+
+        **Keyword argument**:
+          - *entry*: an instance of polib._BaseEntry
+
+        **Tests**:
+        >>> po = POFile()
+        >>> e1 = POEntry(msgid='foobar', msgstr='spam')
+        >>> e2 = POEntry(msgid='barfoo', msgstr='spam')
+        >>> e3 = POEntry(msgid='foobar', msgstr='eggs')
+        >>> e4 = POEntry(msgid='spameggs', msgstr='eggs')
+        >>> po.append(e1)
+        >>> po.append(e2)
+        >>> e1 in po
+        True
+        >>> e2 not in po
+        False
+        >>> e3 in po
+        True
+        >>> e4 in po
+        False
+        """
+        return self.find(entry.msgid, by='msgid') is not None
+
+    def append(self, entry):
+        """
+        Overridden method to check for duplicate entries, if a user tries to
+        add an entry that already exists, the method will raise a ValueError
+        exception.
+
+        **Keyword argument**:
+          - *entry*: an instance of polib._BaseEntry
+
+        **Tests**:
+        >>> e1 = POEntry(msgid='foobar', msgstr='spam')
+        >>> e2 = POEntry(msgid='foobar', msgstr='eggs')
+        >>> po = POFile(check_for_duplicates=True)
+        >>> po.append(e1)
+        >>> try:
+        ...     po.append(e2)
+        ... except ValueError, e:
+        ...     unicode(e)
+        u'Entry "foobar" already exists'
+        """
+        if self.check_for_duplicates and entry in self:
+            raise ValueError('Entry "%s" already exists' % entry.msgid)
+        super(_BaseFile, self).append(entry)
+
+    def insert(self, index, entry):
+        """
+        Overridden method to check for duplicate entries, if a user tries to
+        insert an entry that already exists, the method will raise a
+        ValueError exception.
+
+        **Keyword arguments**:
+          - *index*: index at which the entry should be inserted
+          - *entry*: an instance of polib._BaseEntry
+
+        **Tests**:
+        >>> import polib
+        >>> polib.check_for_duplicates = True
+        >>> e1 = POEntry(msgid='foobar', msgstr='spam')
+        >>> e2 = POEntry(msgid='barfoo', msgstr='eggs')
+        >>> e3 = POEntry(msgid='foobar', msgstr='eggs')
+        >>> po = POFile(check_for_duplicates=True)
+        >>> po.insert(0, e1)
+        >>> po.insert(1, e2)
+        >>> try:
+        ...     po.insert(0, e3)
+        ... except ValueError, e:
+        ...     unicode(e)
+        u'Entry "foobar" already exists'
+        """
+        if self.check_for_duplicates and entry in self:
+            raise ValueError('Entry "%s" already exists' % entry.msgid)
+        super(_BaseFile, self).insert(index, entry)
+
+    def __repr__(self):
+        """Return the official string representation of the object."""
+        return '<%s instance at %x>' % (self.__class__.__name__, id(self))
+
+    def metadata_as_entry(self):
+        """
+        Return the metadata as an entry:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_fuzzy_header.po')
+        >>> unicode(po) == unicode(open('tests/test_fuzzy_header.po').read())
+        True
+        """
+        e = POEntry(msgid='')
+        mdata = self.ordered_metadata()
+        if mdata:
+            # one "name: value" line per metadata item
+            strs = []
+            for name, value in mdata:
+                strs.append('%s: %s' % (name, value))
+            e.msgstr = '\n'.join(strs) + '\n'
+            # force one quoted po line per metadata item when printed
+            e._multiline_str['msgstr'] = '__POLIB__NL__'.join(
+                    [s + '\n' for s in strs])
+        if self.metadata_is_fuzzy:
+            e.flags.append('fuzzy')
+        return e
+
+    def save(self, fpath=None, repr_method='__str__'):
+        """
+        Write the file to *fpath* or ``self.fpath``; IOError if both are None.
+
+        **Keyword arguments**:
+          - *fpath*: string, full or relative path to the file.
+          - *repr_method*: string, the method to use for output.
+        """
+        if self.fpath is None and fpath is None:
+            raise IOError('You must provide a file path to save() method')
+        contents = getattr(self, repr_method)()
+        if fpath is None:
+            fpath = self.fpath
+        if repr_method == 'to_binary':
+            fhandle = open(fpath, 'wb')
+        else:
+            # decode before opening so a failure cannot truncate the file
+            if not isinstance(contents, types.UnicodeType):
+                contents = contents.decode(self.encoding)
+            fhandle = codecs.open(fpath, 'w', self.encoding)
+        try:
+            fhandle.write(contents)
+        finally:
+            fhandle.close()
+
+    def find(self, st, by='msgid'):
+        """
+        Find the first entry whose msgid (or the attribute named by *by*)
+        equals the string *st*; return None when there is no such entry.
+
+        **Keyword arguments**:
+          - *st*: string, the string to search for
+          - *by*: string, the comparison attribute
+
+        **Examples**:
+
+        >>> po = pofile('tests/test_utf8.po')
+        >>> entry = po.find('Thursday')
+        >>> entry.msgstr
+        u'Jueves'
+        >>> entry = po.find('Some unexistant msgid')
+        >>> entry is None
+        True
+        >>> entry = po.find('Jueves', 'msgstr')
+        >>> entry.msgid
+        u'Thursday'
+        """
+        for e in self:
+            if getattr(e, by) == st:
+                return e
+        return None
+
+    def ordered_metadata(self):
+        """
+        Convenience method that returns the metadata ordered. The return
+        value is a list of tuples (metadata name, metadata_value).
+        """
+        # copy the dict first
+        metadata = self.metadata.copy()
+        data_order = [
+            'Project-Id-Version',
+            'Report-Msgid-Bugs-To',
+            'POT-Creation-Date',
+            'PO-Revision-Date',
+            'Last-Translator',
+            'Language-Team',
+            'MIME-Version',
+            'Content-Type',
+            'Content-Transfer-Encoding'
+        ]
+        ordered_data = []
+        for data in data_order:
+            try:
+                value = metadata.pop(data)
+                ordered_data.append((data, value))
+            except KeyError:
+                pass
+        # the rest of the metadata is sorted by name: there are no specs for
+        # its order, but this keeps the output deterministic
+        keys = sorted(metadata)
+        for data in keys:
+            value = metadata[data]
+            ordered_data.append((data, value))
+        return ordered_data
+
+    def to_binary(self):
+        """
+        Return the mofile binary representation.
+
+        Only the translated entries are written, sorted by msgid and
+        preceded by the metadata entry; the file itself is left untouched.
+        The layout is: a header of 7 32-bit integers, the index of the keys
+        (size and offset of each msgid), the index of the values (size and
+        offset of each msgstr), then the NUL terminated keys and values.
+        No hash table is written.
+        """
+        import array
+        import struct
+        offsets = []
+        # the keys are sorted in the .mo file. sorted() builds a new list:
+        # MOFile.translated_entries() returns the file itself, which an
+        # in-place sort would reorder as a side effect
+        entries = sorted(self.translated_entries(), key=lambda e: e.msgid)
+        # add metadata entry
+        mentry = self.metadata_as_entry()
+        # NOTE(review): this drops literal backslash + "n" pairs, not real
+        # newlines -- presumably leftovers of escaped headers; confirm
+        mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
+        entries = [mentry] + entries
+        entries_len = len(entries)
+        ids, strs = '', ''
+        for e in entries:
+            # For each string, we need size and file offset. Each string is
+            # NUL terminated; the NUL does not count into the size.
+            if e.msgid_plural:
+                # plural forms are stored NUL-separated, ordered by index
+                indexes = sorted(e.msgstr_plural)
+                msgstr = [e.msgstr_plural[index] for index in indexes]
+                msgid = self._encode(e.msgid + '\0' + e.msgid_plural)
+                msgstr = self._encode('\0'.join(msgstr))
+            else:
+                msgid = self._encode(e.msgid)
+                msgstr = self._encode(e.msgstr)
+            offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
+            ids += msgid + '\0'
+            strs += msgstr + '\0'
+        # The header is 7 32-bit unsigned integers.
+        keystart = 7*4+16*entries_len
+        # and the values start after the keys
+        valuestart = keystart + len(ids)
+        koffsets = []
+        voffsets = []
+        # The string table first has the list of keys, then the list of values.
+        # Each entry has first the size of the string, then the file offset.
+        for o1, l1, o2, l2 in offsets:
+            koffsets += [l1, o1+keystart]
+            voffsets += [l2, o2+valuestart]
+        offsets = koffsets + voffsets
+        output = struct.pack("IIIIIII",
+                             0x950412de,        # Magic number
+                             0,                 # Version
+                             entries_len,       # # of entries
+                             7*4,               # start of key index
+                             7*4+entries_len*8, # start of value index
+                             0, 0)              # size and offset of hash table
+        output += array.array("I", offsets).tostring()
+        output += ids
+        output += strs
+        return output
+
+    def _encode(self, mixed):
+        """
+        Encode the given argument with the file encoding if the type is unicode
+        and return the encoded string.
+        """
+        if isinstance(mixed, types.UnicodeType):
+            return mixed.encode(self.encoding)
+        return mixed
+
+# }}}
+# class POFile {{{
+
+class POFile(_BaseFile):
+    '''
+    Po (or Pot) file reader/writer.
+    POFile objects inherit the list objects methods.
+
+    **Example**:
+
+    >>> po = POFile()
+    >>> entry1 = POEntry(
+    ...     msgid="Some english text",
+    ...     msgstr="Un texte en anglais"
+    ... )
+    >>> entry1.occurrences = [('testfile', 12),('another_file', 1)]
+    >>> entry1.comment = "Some useful comment"
+    >>> entry2 = POEntry(
+    ...     msgid="Peace in some languages",
+    ...     msgstr="Pace سلام שלום Hasîtî 和平"
+    ... )
+    >>> entry2.occurrences = [('testfile', 15),('another_file', 5)]
+    >>> entry2.comment = "Another useful comment"
+    >>> entry3 = POEntry(
+    ...     msgid='Some entry with quotes " \\"',
+    ...     msgstr='Un message unicode avec des quotes " \\"'
+    ... )
+    >>> entry3.comment = "Test string quoting"
+    >>> po.append(entry1)
+    >>> po.append(entry2)
+    >>> po.append(entry3)
+    >>> po.header = "Some Header"
+    >>> print(po)
+    # Some Header
+    msgid ""
+    msgstr ""
+    <BLANKLINE>
+    #. Some useful comment
+    #: testfile:12 another_file:1
+    msgid "Some english text"
+    msgstr "Un texte en anglais"
+    <BLANKLINE>
+    #. Another useful comment
+    #: testfile:15 another_file:5
+    msgid "Peace in some languages"
+    msgstr "Pace سلام שלום Hasîtî 和平"
+    <BLANKLINE>
+    #. Test string quoting
+    msgid "Some entry with quotes \\" \\""
+    msgstr "Un message unicode avec des quotes \\" \\""
+    <BLANKLINE>
+    '''
+
+    def __str__(self):
+        """Return the string representation of the po file"""
+        ret, headers = '', self.header.split('\n')
+        for header in headers:
+            if header[:1] in [',', ':']:
+                ret += '#%s\n' % header
+            else:
+                ret += '# %s\n' % header
+        return ret + _BaseFile.__str__(self)
+
+    def save_as_mofile(self, fpath):
+        """
+        Save the binary representation of the file to *fpath*.
+
+        **Keyword arguments**:
+          - *fpath*: string, full or relative path to the file.
+        """
+        _BaseFile.save(self, fpath, 'to_binary')
+
+    def percent_translated(self):
+        """
+        Convenience method that returns the percentage of translated
+        messages (100 for a file without any non obsolete entry).
+
+        **Example**:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_pofile_helpers.po')
+        >>> po.percent_translated()
+        50
+        >>> po = POFile()
+        >>> po.percent_translated()
+        100
+        """
+        total = len([e for e in self if not e.obsolete])
+        if total == 0:
+            return 100
+        translated = len(self.translated_entries())
+        return int((100.00 / float(total)) * translated)
+
+    def translated_entries(self):
+        """
+        Convenience method that returns a list of translated entries.
+
+        **Example**:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_pofile_helpers.po')
+        >>> len(po.translated_entries())
+        6
+        """
+        return [e for e in self if e.translated()]
+
+    def untranslated_entries(self):
+        """
+        Convenience method that returns a list of untranslated entries.
+
+        **Example**:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_pofile_helpers.po')
+        >>> len(po.untranslated_entries())
+        4
+        """
+        return [e for e in self if not e.translated() and not e.obsolete \
+                and 'fuzzy' not in e.flags]
+
+    def fuzzy_entries(self):
+        """
+        Convenience method that returns the list of 'fuzzy' entries.
+
+        **Example**:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_pofile_helpers.po')
+        >>> len(po.fuzzy_entries())
+        2
+        """
+        return [e for e in self if 'fuzzy' in e.flags]
+
+    def obsolete_entries(self):
+        """
+        Convenience method that returns the list of obsolete entries.
+
+        **Example**:
+
+        >>> import polib
+        >>> po = polib.pofile('tests/test_pofile_helpers.po')
+        >>> len(po.obsolete_entries())
+        4
+        """
+        return [e for e in self if e.obsolete]
+
+    def merge(self, refpot):
+        """
+        Merge this file with the reference catalog *refpot*, in the spirit of
+        the gettext msgmerge utility: translations and translator comments of
+        this file are kept, while msgids, extracted comments, occurrences and
+        flags are taken from *refpot*. Entries missing from *refpot* are
+        marked obsolete.
+        XXX this may not work if the two files use different encodings.
+
+        **Keyword argument**:
+          - *refpot*: object POFile, the reference catalog.
+
+        **Example**:
+
+        >>> import polib
+        >>> refpot = polib.pofile('tests/test_merge.pot')
+        >>> po = polib.pofile('tests/test_merge_before.po')
+        >>> po.merge(refpot)
+        >>> expected_po = polib.pofile('tests/test_merge_after.po')
+        >>> unicode(po) == unicode(expected_po)
+        True
+        """
+        for entry in refpot:
+            e = self.find(entry.msgid)
+            if e is None:
+                # fill the new entry before appending it, so that the
+                # duplicate check of append() sees its real msgid
+                e = POEntry()
+                e.merge(entry)
+                self.append(e)
+            else:
+                e.merge(entry)
+        # ok, now we must "obsolete" entries that are not in the refpot
+        # anymore
+        for entry in self:
+            if refpot.find(entry.msgid) is None:
+                entry.obsolete = True
+
+# }}}
+# class MOFile {{{
+
+class MOFile(_BaseFile):
+    '''
+    Reader/writer for mo files.
+    A MOFile is a list of entries and supports the list methods.
+
+    **Example**:
+
+    >>> mo = MOFile()
+    >>> entry1 = POEntry(
+    ...     msgid="Some english text",
+    ...     msgstr="Un texte en anglais"
+    ... )
+    >>> entry2 = POEntry(
+    ...     msgid="I need my dirty cheese",
+    ...     msgstr="Je veux mon sale fromage"
+    ... )
+    >>> entry3 = MOEntry(
+    ...     msgid='Some entry with quotes " \\"',
+    ...     msgstr='Un message unicode avec des quotes " \\"'
+    ... )
+    >>> mo.append(entry1)
+    >>> mo.append(entry2)
+    >>> mo.append(entry3)
+    >>> print(mo)
+    msgid ""
+    msgstr ""
+    <BLANKLINE>
+    msgid "Some english text"
+    msgstr "Un texte en anglais"
+    <BLANKLINE>
+    msgid "I need my dirty cheese"
+    msgstr "Je veux mon sale fromage"
+    <BLANKLINE>
+    msgid "Some entry with quotes \\" \\""
+    msgstr "Un message unicode avec des quotes \\" \\""
+    <BLANKLINE>
+    '''
+
+    def __init__(self, *args, **kwargs):
+        """
+        MOFile constructor. On top of the _BaseFile attributes it sets:
+          - magic_number: the magic number of the binary file (None here),
+          - version: the version of the mo spec (0 here).
+        """
+        _BaseFile.__init__(self, *args, **kwargs)
+        self.version = 0
+        self.magic_number = None
+
+    def save_as_pofile(self, fpath):
+        """
+        Write the textual (po) representation of the file to *fpath*.
+
+        **Keyword argument**:
+          - *fpath*: string, full or relative path to the file.
+        """
+        _BaseFile.save(self, fpath=fpath)
+
+    def save(self, fpath):
+        """
+        Write the binary (mo) representation of the file to *fpath*.
+
+        **Keyword argument**:
+          - *fpath*: string, full or relative path to the file.
+        """
+        _BaseFile.save(self, fpath, repr_method='to_binary')
+
+    def percent_translated(self):
+        """
+        Always 100; kept for interface compatibility with POFile.
+        """
+        return 100
+
+    def translated_entries(self):
+        """
+        Return the file itself; kept for interface compatibility with POFile.
+        """
+        return self
+
+    def untranslated_entries(self):
+        """
+        Always an empty list; kept for interface compatibility with POFile.
+        """
+        return []
+
+    def fuzzy_entries(self):
+        """
+        Always an empty list; kept for interface compatibility with POFile.
+        """
+        return []
+
+    def obsolete_entries(self):
+        """
+        Always an empty list; kept for interface compatibility with POFile.
+        """
+        return []
+
+# }}}
+# class _BaseEntry {{{
+
+class _BaseEntry(object):
+    """
+    Shared implementation of the POEntry and MOEntry classes.
+    This class must *not* be instantiated directly.
+    """
+
+    def __init__(self, *args, **kwargs):
+        """Read the common entry fields from the keyword arguments."""
+        self.msgid = kwargs.get('msgid', '')
+        self.msgstr = kwargs.get('msgstr', '')
+        self.msgid_plural = kwargs.get('msgid_plural', '')
+        self.msgstr_plural = kwargs.get('msgstr_plural', {})
+        self.obsolete = kwargs.get('obsolete', False)
+        self.encoding = kwargs.get('encoding', default_encoding)
+        self.msgctxt = kwargs.get('msgctxt', None)
+        self._multiline_str = {}
+
+    def __repr__(self):
+        """Return the class name and the id of the object."""
+        return '<%s instance at %x>' % (self.__class__.__name__, id(self))
+
+    def __str__(self, wrapwidth=78):
+        """
+        Return the msgctxt/msgid/msgstr lines shared by POEntry and MOEntry
+        objects; obsolete entries get the "#~ " prefix on every line.
+        *wrapwidth* is accepted for interface compatibility and not used.
+        """
+        delflag = ''
+        if self.obsolete:
+            delflag = '#~ '
+        lines = []
+        # the context, when there is one
+        if self.msgctxt is not None:
+            lines.extend(self._str_field("msgctxt", delflag, "", self.msgctxt))
+        # the message id
+        lines.extend(self._str_field("msgid", delflag, "", self.msgid))
+        # the plural message id, when there is one
+        if self.msgid_plural:
+            lines.extend(self._str_field("msgid_plural", delflag, "",
+                                         self.msgid_plural))
+        if self.msgstr_plural:
+            # one msgstr[N] per plural form, ordered by index
+            for index in sorted(self.msgstr_plural):
+                lines.extend(self._str_field("msgstr", delflag,
+                                             '[%s]' % index,
+                                             self.msgstr_plural[index]))
+        else:
+            # a single msgstr
+            lines.extend(self._str_field("msgstr", delflag, "", self.msgstr))
+        # the trailing empty string makes the joined text end with a
+        # newline
+        lines.append('')
+        return '\n'.join(lines)
+
+    def _str_field(self, fieldname, delflag, plural_index, field):
+        """Return the quoted po lines for one (possibly multi-line) field."""
+        key = fieldname + plural_index
+        if key in self._multiline_str:
+            chunks = [''] + self._multiline_str[key].split('__POLIB__NL__')
+        else:
+            chunks = field.splitlines(True)
+            if len(chunks) > 1:
+                chunks.insert(0, '')  # start with initial empty line
+            else:
+                chunks = [field]  # needed for the empty string case
+        if fieldname.startswith('previous_'):
+            # strip the prefix to get the keyword written in the po file
+            fieldname = fieldname[len('previous_'):]
+
+        first = '%s%s%s "%s"' % (delflag, fieldname, plural_index,
+                                 escape(chunks[0]))
+        rest = ['%s"%s"' % (delflag, escape(chunk)) for chunk in chunks[1:]]
+        return [first] + rest
+
+# }}}
+# class POEntry {{{
+
+class POEntry(_BaseEntry):
+ """
+ Represents a po file entry.
+
+ **Examples**:
+
+ >>> entry = POEntry(msgid='Welcome', msgstr='Bienvenue')
+ >>> entry.occurrences = [('welcome.py', 12), ('anotherfile.py', 34)]
+ >>> print(entry)
+ #: welcome.py:12 anotherfile.py:34
+ msgid "Welcome"
+ msgstr "Bienvenue"
+ <BLANKLINE>
+ >>> entry = POEntry()
+ >>> entry.occurrences = [('src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c', 32), ('src/eggs.c', 45)]
+ >>> entry.comment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...'
+ >>> entry.tcomment = 'A plural translation. This is a very very very long line please do not wrap, this is just for testing comment wrapping...'
+ >>> entry.flags.append('c-format')
+ >>> entry.previous_msgctxt = '@somecontext'
+ >>> entry.previous_msgid = 'I had eggs but no spam !'
+ >>> entry.previous_msgid_plural = 'I had eggs and %d spam !'
+ >>> entry.msgctxt = '@somenewcontext'
+ >>> entry.msgid = 'I have spam but no egg !'
+ >>> entry.msgid_plural = 'I have spam and %d eggs !'
+ >>> entry.msgstr_plural[0] = "J'ai du jambon mais aucun oeuf !"
+ >>> entry.msgstr_plural[1] = "J'ai du jambon et %d oeufs !"
+ >>> print(entry)
+ #. A plural translation. This is a very very very long line please do not
+ #. wrap, this is just for testing comment wrapping...
+ # A plural translation. This is a very very very long line please do not wrap,
+ # this is just for testing comment wrapping...
+ #: src/some-very-long-filename-that-should-not-be-wrapped-even-if-it-is-larger-than-the-wrap-limit.c:32
+ #: src/eggs.c:45
+ #, c-format
+ #| msgctxt "@somecontext"
+ #| msgid "I had eggs but no spam !"
+ #| msgid_plural "I had eggs and %d spam !"
+ msgctxt "@somenewcontext"
+ msgid "I have spam but no egg !"
+ msgid_plural "I have spam and %d eggs !"
+ msgstr[0] "J'ai du jambon mais aucun oeuf !"
+ msgstr[1] "J'ai du jambon et %d oeufs !"
+ <BLANKLINE>
+ """
+
+ def __init__(self, *args, **kwargs):
+ """POEntry constructor."""
+ _BaseEntry.__init__(self, *args, **kwargs)
+ self.comment = kwargs.get('comment', '')
+ self.tcomment = kwargs.get('tcomment', '')
+ self.occurrences = kwargs.get('occurrences', [])
+ self.flags = kwargs.get('flags', [])
+ self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
+ self.previous_msgid = kwargs.get('previous_msgid', None)
+ self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
+
+ def __str__(self, wrapwidth=78):
+ """
+ Return the string representation of the entry.
+ """
+ if self.obsolete:
+ return _BaseEntry.__str__(self)
+ ret = []
+ # comment first, if any (with text wrapping as xgettext does)
+ if self.comment != '':
+ for comment in self.comment.split('\n'):
+ if wrapwidth > 0 and len(comment) > wrapwidth-3:
+ ret += textwrap.wrap(comment, wrapwidth,
+ initial_indent='#. ',
+ subsequent_indent='#. ',
+ break_long_words=False)
+ else:
+ ret.append('#. %s' % comment)
+ # translator comment, if any (with text wrapping as xgettext does)
+ if self.tcomment != '':
+ for tcomment in self.tcomment.split('\n'):
+ if wrapwidth > 0 and len(tcomment) > wrapwidth-2:
+ ret += textwrap.wrap(tcomment, wrapwidth,
+ initial_indent='# ',
+ subsequent_indent='# ',
+ break_long_words=False)
+ else:
+ ret.append('# %s' % tcomment)
+ # occurrences (with text wrapping as xgettext does)
+ if self.occurrences:
+ filelist = []
+ for fpath, lineno in self.occurrences:
+ if lineno:
+ filelist.append('%s:%s' % (fpath, lineno))
+ else:
+ filelist.append(fpath)
+ filestr = ' '.join(filelist)
+ if wrapwidth > 0 and len(filestr)+3 > wrapwidth:
+ # XXX textwrap split words that contain hyphen, this is not
+ # what we want for filenames, so the dirty hack is to
+ # temporally replace hyphens with a char that a file cannot
+ # contain, like "*"
+ lines = textwrap.wrap(filestr.replace('-', '*'),
+ wrapwidth,
+ initial_indent='#: ',
+ subsequent_indent='#: ',
+ break_long_words=False)
+ # end of the replace hack
+ for line in lines:
+ ret.append(line.replace('*', '-'))
+ else:
+ ret.append('#: '+filestr)
+ # flags
+ if self.flags:
+ flags = []
+ for flag in self.flags:
+ flags.append(flag)
+ ret.append('#, %s' % ', '.join(flags))
+
+ # previous context and previous msgid/msgid_plural
+ if self.previous_msgctxt:
+ ret += self._str_field("previous_msgctxt", "#| ", "",
+ self.previous_msgctxt)
+ if self.previous_msgid:
+ ret += self._str_field("previous_msgid", "#| ", "",
+ self.previous_msgid)
+ if self.previous_msgid_plural:
+ ret += self._str_field("previous_msgid_plural", "#| ", "",
+ self.previous_msgid_plural)
+
+ ret.append(_BaseEntry.__str__(self))
+ return '\n'.join(ret)
+
+    def __cmp__(self, other):
+        '''
+        Called by comparison operations if rich comparison is not defined.
+
+        Entries are ordered by, in turn: the obsolete flag (obsolete
+        entries first), their sorted occurrences compared pairwise (file
+        name, then line number), the number of occurrences (fewer first)
+        and finally the msgid.  Returns -1, 0 or 1; 0 is only returned when
+        all of these criteria are equal.
+
+        **Tests**:
+        >>> a  = POEntry(msgid='a', occurrences=[('b.py', 1), ('b.py', 3)])
+        >>> b  = POEntry(msgid='b', occurrences=[('b.py', 1), ('b.py', 3)])
+        >>> c1 = POEntry(msgid='c1', occurrences=[('a.py', 1), ('b.py', 1)])
+        >>> c2 = POEntry(msgid='c2', occurrences=[('a.py', 1), ('a.py', 3)])
+        >>> po = POFile()
+        >>> po.append(a)
+        >>> po.append(b)
+        >>> po.append(c1)
+        >>> po.append(c2)
+        >>> po.sort()
+        >>> print(po)
+        #
+        msgid ""
+        msgstr ""
+        <BLANKLINE>
+        #: a.py:1 a.py:3
+        msgid "c2"
+        msgstr ""
+        <BLANKLINE>
+        #: a.py:1 b.py:1
+        msgid "c1"
+        msgstr ""
+        <BLANKLINE>
+        #: b.py:1 b.py:3
+        msgid "a"
+        msgstr ""
+        <BLANKLINE>
+        #: b.py:1 b.py:3
+        msgid "b"
+        msgstr ""
+        <BLANKLINE>
+        '''
+        # First: obsolete test, obsolete entries sort before the others
+        if self.obsolete != other.obsolete:
+            if self.obsolete:
+                return -1
+            return 1
+        # sorted() works on copies, the original lists are left untouched;
+        # (file, line) tuples already compare by file name then line number
+        occ1 = sorted(self.occurrences)
+        occ2 = sorted(other.occurrences)
+        # Compare the sorted occurrences pairwise
+        for entry1, entry2 in zip(occ1, occ2):
+            if entry1[0] != entry2[0]:
+                if entry1[0] > entry2[0]:
+                    return 1
+                return -1
+            if entry1[1] != entry2[1]:
+                if entry1[1] > entry2[1]:
+                    return 1
+                return -1
+        # Common prefix is identical: the entry with more occurrences is
+        # the greater one (checked in both directions so that the result
+        # is antisymmetric)
+        if len(occ1) != len(occ2):
+            if len(occ1) > len(occ2):
+                return 1
+            return -1
+        # Finally: compare message IDs
+        if self.msgid > other.msgid:
+            return 1
+        if self.msgid < other.msgid:
+            return -1
+        return 0
+
+    def translated(self):
+        """
+        Tell whether the entry carries a usable translation.
+
+        Obsolete or fuzzy entries are never considered translated.
+        Otherwise the entry is translated when its msgstr is not empty or,
+        failing that, when it has plural forms and none of them is empty.
+        """
+        usable = not self.obsolete and 'fuzzy' not in self.flags
+        if not usable:
+            return False
+        if self.msgstr != '':
+            return True
+        plurals = self.msgstr_plural
+        if not plurals:
+            return False
+        # every plural form must be filled in
+        return '' not in [plurals[idx] for idx in plurals]
+
+    def merge(self, other):
+        """
+        Merge the current entry with the given pot entry.
+
+        The msgid, occurrences, comment, flags and msgid_plural of *other*
+        replace those of the current entry.  Existing plural translations
+        are kept; plural positions that only exist in *other* are added
+        with an empty translation.
+
+        **Arguments**:
+          - *other*: the pot entry to take the reference data from.
+        """
+        self.msgid = other.msgid
+        # copy the lists so that later changes made to this entry do not
+        # leak into the pot entry (and vice versa)
+        self.occurrences = list(other.occurrences)
+        self.comment = other.comment
+        self.flags = list(other.flags)
+        self.msgid_plural = other.msgid_plural
+        if other.msgstr_plural:
+            for pos in other.msgstr_plural:
+                # keep existing translation at pos if any
+                self.msgstr_plural.setdefault(pos, '')
+
+# }}}
+# class MOEntry {{{
+
+class MOEntry(_BaseEntry):
+    """
+    Represents a mo file entry.
+
+    **Examples**:
+
+    >>> entry = MOEntry()
+    >>> entry.msgid = 'translate me !'
+    >>> entry.msgstr = 'traduisez moi !'
+    >>> print(entry)
+    msgid "translate me !"
+    msgstr "traduisez moi !"
+    <BLANKLINE>
+    """
+
+    def __str__(self, wrapwidth=78):
+        """
+        Build the textual form of the entry.
+
+        The work is delegated to ``_BaseEntry.__str__``, which receives
+        *wrapwidth* (78 by default) unchanged.
+        """
+        rendered = _BaseEntry.__str__(self, wrapwidth)
+        return rendered
+
+# }}}
+# class _POFileParser {{{
+
+class _POFileParser(object):
+    """
+    A finite state machine to parse the po file format.
+
+    Every non-blank line is mapped to a two-letter symbol; the pair
+    (symbol, current state) selects a ``handle_xx`` method and the state
+    the machine enters when that handler returns a true value.
+    """
+
+    # States that may be followed by continuation lines, mapped to the
+    # entry attribute the continuation text is appended to.  The 'MX'
+    # state is handled separately because it targets msgstr_plural.
+    _CONTINUED_FIELDS = {
+        'CT': 'msgctxt',
+        'MI': 'msgid',
+        'MP': 'msgid_plural',
+        'MS': 'msgstr',
+        'PP': 'previous_msgid_plural',
+        'PM': 'previous_msgid',
+        'PC': 'previous_msgctxt',
+    }
+
+    def __init__(self, fpath, *args, **kwargs):
+        """
+        Constructor.
+
+        **Arguments**:
+          - *fpath*: string, path to the po file
+          - *encoding*: string, the encoding to use, defaults to
+            "default_encoding" global variable (optional),
+          - *check_for_duplicates*: whether to check for duplicate entries
+            when adding entries to the file, default: False (optional).
+        """
+        enc = kwargs.get('encoding', default_encoding)
+        check_dup = kwargs.get('check_for_duplicates', False)
+        try:
+            self.fhandle = codecs.open(fpath, 'rU', enc)
+        except LookupError:
+            # unknown codec name: fall back to the module default encoding
+            enc = default_encoding
+            self.fhandle = codecs.open(fpath, 'rU', enc)
+        self.instance = POFile(
+            fpath=fpath,
+            encoding=enc,
+            check_for_duplicates=check_dup
+        )
+        self.transitions = {}
+        self.current_entry = POEntry()
+        self.current_state = 'ST'
+        self.current_token = None
+        # two memo flags used in handlers
+        self.msgstr_index = 0
+        self.entry_obsolete = 0
+        # Configure the state machine, by adding transitions.
+        # Meaning of the symbols:
+        #     * ST: Beginning of the file (start)
+        #     * HE: Header
+        #     * TC: a translation comment
+        #     * GC: a generated comment
+        #     * OC: a file/line occurrence
+        #     * FL: a flags line
+        #     * CT: a message context
+        #     * PC: a previous msgctxt
+        #     * PM: a previous msgid
+        #     * PP: a previous msgid_plural
+        #     * MI: a msgid
+        #     * MP: a msgid plural
+        #     * MS: a msgstr
+        #     * MX: a msgstr plural
+        #     * MC: a msgid or msgstr continuation line
+        all_states = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP',
+                      'TC', 'MS', 'MP', 'MX', 'MI']
+
+        self.add('TC', ['ST', 'HE'], 'HE')
+        self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS',
+                        'MP', 'MX', 'MI'], 'TC')
+        self.add('GC', all_states, 'GC')
+        self.add('OC', all_states, 'OC')
+        self.add('FL', all_states, 'FL')
+        self.add('PC', all_states, 'PC')
+        self.add('PM', all_states, 'PM')
+        self.add('PP', all_states, 'PP')
+        self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM',
+                        'PP', 'MS', 'MX'], 'CT')
+        self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC',
+                        'PM', 'PP', 'MS', 'MX'], 'MI')
+        self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP')
+        self.add('MS', ['MI', 'MP', 'TC'], 'MS')
+        self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX')
+        self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC')
+
+    def parse(self):
+        """
+        Run the state machine, parse the file line by line and call process()
+        with the current matched symbol.
+
+        Returns the populated POFile instance.  The IOError raised by
+        process() on a syntax error is propagated to the caller; the file
+        handle is closed in both cases.
+        """
+        try:
+            linenum = 0
+            for line in self.fhandle:
+                linenum = linenum + 1
+                line = line.strip()
+                if line == '':
+                    continue
+                if line[:3] == '#~ ':
+                    # obsolete entries are parsed like regular ones once
+                    # the "#~ " marker has been removed
+                    line = line[3:]
+                    self.entry_obsolete = 1
+                else:
+                    self.entry_obsolete = 0
+                self.current_token = line
+                symbol = self._classify(line)
+                # lines that match no known pattern are ignored
+                if symbol is not None:
+                    self.process(symbol, linenum)
+            self._finish()
+        finally:
+            # close opened file, even when a syntax error was raised
+            self.fhandle.close()
+        return self.instance
+
+    def _classify(self, line):
+        """
+        Return the two-letter symbol matching *line*, or None.
+
+        The order of the tests matters: e.g. '#| msgid_plural' has to be
+        tested before '#| msgid'.
+        """
+        if line[:2] == '#:':
+            # we are on a occurrences line
+            return 'OC'
+        if line[:9] == 'msgctxt "':
+            # we are on a msgctxt
+            return 'CT'
+        if line[:7] == 'msgid "':
+            # we are on a msgid
+            return 'MI'
+        if line[:8] == 'msgstr "':
+            # we are on a msgstr
+            return 'MS'
+        if line[:1] == '"' or line[:4] == '#| "':
+            # we are on a continuation line or some metadata
+            return 'MC'
+        if line[:14] == 'msgid_plural "':
+            # we are on a msgid plural
+            return 'MP'
+        if line[:7] == 'msgstr[':
+            # we are on a msgstr plural
+            return 'MX'
+        if line[:3] == '#, ':
+            # we are on a flags line
+            return 'FL'
+        if line[:2] == '# ' or line == '#':
+            # we are on a translator comment line
+            return 'TC'
+        if line[:2] == '#.':
+            # we are on a generated comment line
+            return 'GC'
+        if line[:15] == '#| msgid_plural':
+            # we are on a previous msgid_plural
+            return 'PP'
+        if line[:8] == '#| msgid':
+            # we are on a previous msgid
+            return 'PM'
+        if line[:10] == '#| msgctxt':
+            # we are on a previous msgctxt
+            return 'PC'
+        return None
+
+    def _finish(self):
+        """
+        Store the last parsed entry and extract the metadata, if any.
+        """
+        if self.current_entry:
+            # since entries are added when another entry is found, we must
+            # add the last entry here (only if there are lines)
+            self.instance.append(self.current_entry)
+        # before returning the instance, check if there's metadata and if
+        # so extract it in a dict
+        firstentry = self.instance[0]
+        if firstentry.msgid != '':
+            return
+        # metadata found: remove the entry
+        firstentry = self.instance.pop(0)
+        self.instance.metadata_is_fuzzy = firstentry.flags
+        key = None
+        for msg in firstentry.msgstr.splitlines():
+            if ':' in msg:
+                key, val = msg.split(':', 1)
+                self.instance.metadata[key] = val.strip()
+            elif key is not None:
+                # no "key: value" pair on this line, it continues the
+                # value of the previous key
+                self.instance.metadata[key] += '\n' + msg.strip()
+
+    def add(self, symbol, states, next_state):
+        """
+        Add a transition to the state machine.
+        Keywords arguments:
+
+        symbol     -- string, the matched token (two chars symbol)
+        states     -- list, a list of states (two chars symbols)
+        next_state -- the next state the fsm will have after the action
+        """
+        # the handler only depends on the target state
+        action = getattr(self, 'handle_%s' % next_state.lower())
+        for state in states:
+            self.transitions[(symbol, state)] = (action, next_state)
+
+    def process(self, symbol, linenum):
+        """
+        Process the transition corresponding to the current state and the
+        symbol provided.
+
+        Keywords arguments:
+        symbol  -- string, the matched token (two chars symbol)
+        linenum -- integer, the current line number of the parsed file
+
+        Raises IOError when no transition exists for (symbol, current
+        state) or when the handler fails.
+        """
+        try:
+            (action, state) = self.transitions[(symbol, self.current_state)]
+            if action():
+                self.current_state = state
+        except Exception:
+            # a missing transition as well as any failure inside a handler
+            # is reported as a syntax error at the current line
+            raise IOError('Syntax error in po file (line %s)' % linenum)
+
+    # state handlers
+
+    def _flush_entry(self):
+        """
+        Store the current entry and start a new one when the previous
+        line belonged to the msgstr part of an entry.
+        """
+        if self.current_state in ['MC', 'MS', 'MX']:
+            self.instance.append(self.current_entry)
+            self.current_entry = POEntry()
+
+    def handle_he(self):
+        """Handle a header comment."""
+        if self.instance.header != '':
+            self.instance.header += '\n'
+        self.instance.header += self.current_token[2:]
+        return True
+
+    def handle_tc(self):
+        """Handle a translator comment."""
+        self._flush_entry()
+        if self.current_entry.tcomment != '':
+            self.current_entry.tcomment += '\n'
+        self.current_entry.tcomment += self.current_token[2:]
+        return True
+
+    def handle_gc(self):
+        """Handle a generated comment."""
+        self._flush_entry()
+        if self.current_entry.comment != '':
+            self.current_entry.comment += '\n'
+        self.current_entry.comment += self.current_token[3:]
+        return True
+
+    def handle_oc(self):
+        """
+        Handle a file:num occurrence.
+
+        Each whitespace separated reference is stored as a (file, line)
+        tuple.  The line number is what follows the last colon, provided
+        it is made of digits only; otherwise the whole reference is kept
+        as the file name and the line is the empty string.
+        """
+        self._flush_entry()
+        for occurrence in self.current_token[3:].split():
+            fil, line = occurrence, ''
+            if ':' in occurrence:
+                # split on the last colon only, the file name itself may
+                # contain colons (e.g. a drive letter)
+                head, tail = occurrence.rsplit(':', 1)
+                if tail.isdigit():
+                    fil, line = head, tail
+            self.current_entry.occurrences.append((fil, line))
+        return True
+
+    def handle_fl(self):
+        """Handle a flags line."""
+        self._flush_entry()
+        self.current_entry.flags += self.current_token[3:].split(', ')
+        return True
+
+    def handle_pp(self):
+        """Handle a previous msgid_plural line."""
+        self._flush_entry()
+        # [17:-1] drops the leading '#| msgid_plural "' and the final quote
+        self.current_entry.previous_msgid_plural = \
+            unescape(self.current_token[17:-1])
+        return True
+
+    def handle_pm(self):
+        """Handle a previous msgid line."""
+        self._flush_entry()
+        # [10:-1] drops the leading '#| msgid "' and the final quote
+        self.current_entry.previous_msgid = \
+            unescape(self.current_token[10:-1])
+        return True
+
+    def handle_pc(self):
+        """Handle a previous msgctxt line."""
+        self._flush_entry()
+        # [12:-1] drops the leading '#| msgctxt "' and the final quote
+        self.current_entry.previous_msgctxt = \
+            unescape(self.current_token[12:-1])
+        return True
+
+    def handle_ct(self):
+        """Handle a msgctxt."""
+        self._flush_entry()
+        self.current_entry.msgctxt = unescape(self.current_token[9:-1])
+        return True
+
+    def handle_mi(self):
+        """Handle a msgid."""
+        self._flush_entry()
+        self.current_entry.obsolete = self.entry_obsolete
+        self.current_entry.msgid = unescape(self.current_token[7:-1])
+        return True
+
+    def handle_mp(self):
+        """Handle a msgid plural."""
+        self.current_entry.msgid_plural = unescape(self.current_token[14:-1])
+        return True
+
+    def handle_ms(self):
+        """Handle a msgstr."""
+        self.current_entry.msgstr = unescape(self.current_token[8:-1])
+        return True
+
+    def handle_mx(self):
+        """
+        Handle a msgstr plural.
+
+        The index is the text found between the brackets of 'msgstr[N]'
+        (kept as a string) and the value is what stands between the first
+        double quote following the closing bracket and the last character
+        of the line.
+        """
+        token = self.current_token
+        closing = token.index(']')
+        index = token[7:closing]
+        value = token[token.index('"', closing) + 1:-1]
+        self.current_entry.msgstr_plural[index] = unescape(value)
+        self.msgstr_index = index
+        return True
+
+    def handle_mc(self):
+        """
+        Handle a msgid or msgstr continuation line.
+
+        The text is appended to the field designated by the current state
+        and also recorded in the entry's _multiline_str dict.  Returns
+        False: a continuation line never changes the current state.
+        """
+        token = unescape(self.current_token[1:-1])
+        state = self.current_state
+        entry = self.current_entry
+        if state == 'MX':
+            typ = 'msgstr[%s]' % self.msgstr_index
+            entry.msgstr_plural[self.msgstr_index] += token
+        else:
+            # an unexpected state raises KeyError, which process() reports
+            # as a syntax error
+            typ = self._CONTINUED_FIELDS[state]
+            if state in ('PP', 'PM', 'PC'):
+                # the line started with '#| "': '#' and the final quote
+                # are already removed, now drop the remaining '| "'
+                token = token[3:]
+            setattr(entry, typ, getattr(entry, typ) + token)
+        if typ not in entry._multiline_str:
+            entry._multiline_str[typ] = token
+        else:
+            entry._multiline_str[typ] += "__POLIB__NL__" + token
+        # don't change the current state
+        return False
+
+# }}}
+# class _MOFileParser {{{
+
+class _MOFileParser(object):
+    """
+    A class to parse binary mo files.
+    """
+    # value obtained when the 4 magic bytes are unpacked as a little-endian
+    # unsigned int, for a file written in big/little endian respectively
+    BIG_ENDIAN    = 0xde120495
+    LITTLE_ENDIAN = 0x950412de
+
+    def __init__(self, fpath, *args, **kwargs):
+        """
+        Constructor.
+
+        **Arguments**:
+          - *fpath*: string, path to the mo file
+          - *encoding*: string, the encoding to use, defaults to
+            "default_encoding" global variable (optional),
+          - *check_for_duplicates*: whether to check for duplicate entries
+            when adding entries to the file, default: False (optional).
+        """
+        enc = kwargs.get('encoding', default_encoding)
+        check_dup = kwargs.get('check_for_duplicates', False)
+        self.fhandle = open(fpath, 'rb')
+        self.instance = MOFile(
+            fpath=fpath,
+            encoding=enc,
+            check_for_duplicates=check_dup
+        )
+
+    def parse_magicnumber(self):
+        """
+        Placeholder kept for API compatibility: it does nothing, the magic
+        number is read and validated by parse() itself.
+        """
+
+    def parse(self):
+        """
+        Build the instance with the file handle provided in the
+        constructor.
+
+        Returns the populated instance.  Raises IOError when the magic
+        number is not a valid one.  The file handle is closed whether the
+        parsing succeeds or not.
+        """
+        try:
+            self._parse()
+        finally:
+            # close opened file
+            self.fhandle.close()
+        return self.instance
+
+    def _parse(self):
+        """
+        Read the header, the two index tables and then every entry.
+        """
+        magic_number = self._readbinary('<I', 4)
+        if magic_number == self.LITTLE_ENDIAN:
+            ii = '<II'
+        elif magic_number == self.BIG_ENDIAN:
+            ii = '>II'
+        else:
+            raise IOError('Invalid mo file, magic number is incorrect !')
+        self.instance.magic_number = magic_number
+        # parse the version number and the number of strings
+        self.instance.version, numofstrings = self._readbinary(ii, 8)
+        # original strings and translation strings hash table offset
+        msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
+        # move to msgid hash table and read length and offset of msgids
+        self.fhandle.seek(msgids_hash_offset)
+        msgids_index = [self._readbinary(ii, 8) for i in range(numofstrings)]
+        # move to msgstr hash table and read length and offset of msgstrs
+        self.fhandle.seek(msgstrs_hash_offset)
+        msgstrs_index = [self._readbinary(ii, 8) for i in range(numofstrings)]
+        # build entries; the strings are used as read from the file
+        for i in range(numofstrings):
+            msgid_len, msgid_offset = msgids_index[i]
+            self.fhandle.seek(msgid_offset)
+            msgid = self.fhandle.read(msgid_len)
+            msgstr_len, msgstr_offset = msgstrs_index[i]
+            self.fhandle.seek(msgstr_offset)
+            msgstr = self.fhandle.read(msgstr_len)
+            if i == 0: # metadata
+                self.instance.metadata = self._parse_metadata(msgstr)
+                continue
+            # test if we have a plural entry
+            msgid_tokens = msgid.split('\0')
+            if len(msgid_tokens) > 1:
+                entry = MOEntry(
+                    msgid=msgid_tokens[0],
+                    msgid_plural=msgid_tokens[1],
+                    msgstr_plural=dict(enumerate(msgstr.split('\0')))
+                )
+            else:
+                entry = MOEntry(msgid=msgid, msgstr=msgstr)
+            self.instance.append(entry)
+
+    def _parse_metadata(self, msgstr):
+        """
+        Turn the "key: value" lines of the first msgstr into a dict; a
+        line without colon yields an empty value, a line with an empty
+        key is skipped.
+        """
+        metadata = {}
+        for line in msgstr.split('\n'):
+            tokens = line.split(':', 1)
+            if tokens[0] == '':
+                continue
+            if len(tokens) > 1:
+                metadata[tokens[0]] = tokens[1].strip()
+            else:
+                metadata[tokens[0]] = ''
+        return metadata
+
+    def _readbinary(self, fmt, numbytes):
+        """
+        Private method that unpack n bytes of data using format <fmt>.
+        It returns a tuple or a mixed value if the tuple length is 1.
+        """
+        data = self.fhandle.read(numbytes)
+        tup = struct.unpack(fmt, data)
+        if len(tup) == 1:
+            return tup[0]
+        return tup
+
+# }}}
+# __main__ {{{
+
+if __name__ == '__main__':
+    """
+    **Main function**::
+      - to **test** the module just run: *python polib.py [-v]*
+      - to **profile** the module: *python polib.py -p <some_pofile.po>*
+    """
+    import sys
+    if len(sys.argv) > 2 and sys.argv[1] == '-p':
+        def test(f):
+            # load the file with the parser matching its extension, then
+            # render it so that the output code is profiled as well
+            if f.endswith('po'):
+                p = pofile(f)
+            else:
+                p = mofile(f)
+            unicode(p)
+        import profile
+        # %r yields a properly quoted and escaped literal, so that paths
+        # containing backslashes or quotes reach test() unchanged
+        profile.run('test(%r)' % (sys.argv[2],))
+    else:
+        import doctest
+        doctest.testmod()
+
+# }}}