changeset 40185:19fc5a986669

polib: update to latest release 1.0.7 (upstream rev d75ce6dbbc2a) # no-check-commit third-party code that doesn't match our style Differential Revision: https://phab.mercurial-scm.org/D5001
author Augie Fackler <augie@google.com>
date Fri, 12 Oct 2018 11:44:27 -0400
parents c3b7d9c54edd
children 9cbc2579f5be
files i18n/check-translation.py i18n/polib.py
diffstat 2 files changed, 551 insertions(+), 275 deletions(-) [+]
line wrap: on
line diff
--- a/i18n/check-translation.py	Fri Oct 12 10:01:09 2018 -0400
+++ b/i18n/check-translation.py	Fri Oct 12 11:44:27 2018 -0400
@@ -224,14 +224,6 @@
         failures, tests = doctest.testmod()
         sys.exit(failures and 1 or 0)
 
-    # replace polib._POFileParser to show linenum of problematic msgstr
-    class ExtPOFileParser(polib._POFileParser):
-        def process(self, symbol, linenum):
-            super(ExtPOFileParser, self).process(symbol, linenum)
-            if symbol == 'MS': # msgstr
-                self.current_entry.linenum = linenum
-    polib._POFileParser = ExtPOFileParser
-
     detected = []
     warning = options.warning
     for f in args:
--- a/i18n/polib.py	Fri Oct 12 10:01:09 2018 -0400
+++ b/i18n/polib.py	Fri Oct 12 11:44:27 2018 -0400
@@ -1,5 +1,5 @@
-# -*- coding: utf-8 -*-
 # no-check-code
+# -*- coding: utf-8 -*-
 #
 # License: MIT (see LICENSE file provided)
 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
@@ -15,10 +15,10 @@
 
 from __future__ import absolute_import
 
-__author__    = 'David Jean Louis <izimobil@gmail.com>'
-__version__   = '0.6.4'
-__all__       = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
-                 'detect_encoding', 'escape', 'unescape', 'detect_encoding',]
+__author__ = 'David Jean Louis <izimobil@gmail.com>'
+__version__ = '1.0.7'
+__all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
+           'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
 
 import array
 import codecs
@@ -27,14 +27,47 @@
 import struct
 import sys
 import textwrap
-import types
+
+try:
+    import io
+except ImportError:
+    # replacement of io.open() for python < 2.6
+    # we use codecs instead
+    class io(object):
+        @staticmethod
+        def open(fpath, mode='r', encoding=None):
+            return codecs.open(fpath, mode, encoding)
 
 
 # the default encoding to use when encoding cannot be detected
 default_encoding = 'utf-8'
 
+# python 2/3 compatibility helpers {{{
+
+
+if sys.version_info[:2] < (3, 0):
+    PY3 = False
+    text_type = unicode
+
+    def b(s):
+        return s
+
+    def u(s):
+        return unicode(s, "unicode_escape")
+
+else:
+    PY3 = True
+    text_type = str
+
+    def b(s):
+        return s.encode("latin-1")
+
+    def u(s):
+        return s
+# }}}
 # _pofile_or_mofile {{{
 
+
 def _pofile_or_mofile(f, type, **kwargs):
     """
     Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
@@ -50,15 +83,34 @@
     parser = kls(
         f,
         encoding=enc,
-        check_for_duplicates=kwargs.get('check_for_duplicates', False)
+        check_for_duplicates=kwargs.get('check_for_duplicates', False),
+        klass=kwargs.get('klass')
     )
     instance = parser.parse()
     instance.wrapwidth = kwargs.get('wrapwidth', 78)
     return instance
+# }}}
+# _is_file {{{
 
+
+def _is_file(filename_or_contents):
+    """
+    Safely returns the value of os.path.exists(filename_or_contents).
+
+    Arguments:
+
+    ``filename_or_contents``
+        either a filename, or a string holding the contents of some file.
+        In the latter case, this function will always return False.
+    """
+    try:
+        return os.path.exists(filename_or_contents)
+    except (ValueError, UnicodeEncodeError):
+        return False
 # }}}
 # function pofile() {{{
 
+
 def pofile(pofile, **kwargs):
     """
     Convenience function that parses the po or pot file ``pofile`` and returns
@@ -80,12 +132,17 @@
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         file (optional, default: ``False``).
+
+    ``klass``
+        class which is used to instantiate the return value (optional,
+        default: ``None``, the return value will be a :class:`~polib.POFile`
+        instance).
     """
     return _pofile_or_mofile(pofile, 'pofile', **kwargs)
-
 # }}}
 # function mofile() {{{
 
+
 def mofile(mofile, **kwargs):
     """
     Convenience function that parses the mo file ``mofile`` and returns a
@@ -108,12 +165,17 @@
     ``check_for_duplicates``
         whether to check for duplicate entries when adding entries to the
         file (optional, default: ``False``).
+
+    ``klass``
+        class which is used to instantiate the return value (optional,
+        default: ``None``, the return value will be a :class:`~polib.POFile`
+        instance).
     """
     return _pofile_or_mofile(mofile, 'mofile', **kwargs)
-
 # }}}
 # function detect_encoding() {{{
 
+
 def detect_encoding(file, binary_mode=False):
     """
     Try to detect the encoding used by the ``file``. The ``file`` argument can
@@ -129,7 +191,9 @@
     ``binary_mode``
         boolean, set this to True if ``file`` is a mo file.
     """
-    rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)')
+    PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
+    rxt = re.compile(u(PATTERN))
+    rxb = re.compile(b(PATTERN))
 
     def charset_exists(charset):
         """Check whether ``charset`` is valid or not."""
@@ -139,31 +203,36 @@
             return False
         return True
 
-    if not os.path.exists(file):
-        match = rx.search(file)
+    if not _is_file(file):
+        match = rxt.search(file)
         if match:
             enc = match.group(1).strip()
             if charset_exists(enc):
                 return enc
     else:
-        if binary_mode:
+        # For PY3, always treat as binary
+        if binary_mode or PY3:
             mode = 'rb'
+            rx = rxb
         else:
             mode = 'r'
+            rx = rxt
         f = open(file, mode)
         for l in f.readlines():
             match = rx.search(l)
             if match:
                 f.close()
                 enc = match.group(1).strip()
+                if not isinstance(enc, text_type):
+                    enc = enc.decode('utf-8')
                 if charset_exists(enc):
                     return enc
         f.close()
     return default_encoding
-
 # }}}
 # function escape() {{{
 
+
 def escape(st):
     """
     Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
@@ -174,10 +243,10 @@
              .replace('\r', r'\r')\
              .replace('\n', r'\n')\
              .replace('\"', r'\"')
-
 # }}}
 # function unescape() {{{
 
+
 def unescape(st):
     """
     Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
@@ -193,12 +262,12 @@
             return '\r'
         if m == '\\':
             return '\\'
-        return m # handles escaped double quote
+        return m  # handles escaped double quote
     return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
-
 # }}}
 # class _BaseFile {{{
 
+
 class _BaseFile(list):
     """
     Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
@@ -227,7 +296,7 @@
         list.__init__(self)
         # the opened file handle
         pofile = kwargs.get('pofile', None)
-        if pofile and os.path.exists(pofile):
+        if pofile and _is_file(pofile):
             self.fpath = pofile
         else:
             self.fpath = kwargs.get('fpath')
@@ -254,38 +323,45 @@
             ret.append(entry.__unicode__(self.wrapwidth))
         for entry in self.obsolete_entries():
             ret.append(entry.__unicode__(self.wrapwidth))
-        ret = '\n'.join(ret)
+        ret = u('\n').join(ret)
 
-        if type(ret) != types.UnicodeType:
-            return unicode(ret, self.encoding)
+        assert isinstance(ret, text_type)
+        #if type(ret) != text_type:
+        #    return unicode(ret, self.encoding)
         return ret
 
-    def __str__(self):
-        """
-        Returns the string representation of the file.
-        """
-        return unicode(self).encode(self.encoding)
+    if PY3:
+        def __str__(self):
+            return self.__unicode__()
+    else:
+        def __str__(self):
+            """
+            Returns the string representation of the file.
+            """
+            return unicode(self).encode(self.encoding)
 
     def __contains__(self, entry):
         """
-        Overriden ``list`` method to implement the membership test (in and
+        Overridden ``list`` method to implement the membership test (in and
         not in).
         The method considers that an entry is in the file if it finds an entry
-        that has the same msgid (the test is **case sensitive**).
+        that has the same msgid (the test is **case sensitive**) and the same
+        msgctxt (or none for both entries).
 
         Argument:
 
         ``entry``
             an instance of :class:`~polib._BaseEntry`.
         """
-        return self.find(entry.msgid, by='msgid') is not None
+        return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
+            is not None
 
     def __eq__(self, other):
-        return unicode(self) == unicode(other)
+        return str(self) == str(other)
 
     def append(self, entry):
         """
-        Overriden method to check for duplicates entries, if a user tries to
+        Overridden method to check for duplicate entries, if a user tries to
         add an entry that is already in the file, the method will raise a
         ``ValueError`` exception.
 
@@ -300,7 +376,7 @@
 
     def insert(self, index, entry):
         """
-        Overriden method to check for duplicates entries, if a user tries to
+        Overridden method to check for duplicate entries, if a user tries to
         add an entry that is already in the file, the method will raise a
         ``ValueError`` exception.
 
@@ -332,7 +408,7 @@
             e.flags.append('fuzzy')
         return e
 
-    def save(self, fpath=None, repr_method='__str__'):
+    def save(self, fpath=None, repr_method='__unicode__'):
         """
         Saves the po file to ``fpath``.
         If it is an existing file and no ``fpath`` is provided, then the
@@ -354,8 +430,8 @@
         if repr_method == 'to_binary':
             fhandle = open(fpath, 'wb')
         else:
-            fhandle = codecs.open(fpath, 'w', self.encoding)
-            if type(contents) != types.UnicodeType:
+            fhandle = io.open(fpath, 'w', encoding=self.encoding)
+            if not isinstance(contents, text_type):
                 contents = contents.decode(self.encoding)
         fhandle.write(contents)
         fhandle.close()
@@ -381,7 +457,7 @@
             boolean, whether to also search in entries that are obsolete.
 
         ``msgctxt``
-            string, allows to specify a specific message context for the
+            string, allows specifying a specific message context for the
             search.
         """
         if include_obsolete_entries:
@@ -390,7 +466,7 @@
             entries = [e for e in self if not e.obsolete]
         for e in entries:
             if getattr(e, by) == st:
-                if msgctxt and e.msgctxt != msgctxt:
+                if msgctxt is not False and e.msgctxt != msgctxt:
                     continue
                 return e
         return None
@@ -412,7 +488,9 @@
             'Language-Team',
             'MIME-Version',
             'Content-Type',
-            'Content-Transfer-Encoding'
+            'Content-Transfer-Encoding',
+            'Language',
+            'Plural-Forms'
         ]
         ordered_data = []
         for data in data_order:
@@ -423,9 +501,7 @@
                 pass
         # the rest of the metadata will be alphabetically ordered since there
         # are no specs for this AFAIK
-        keys = metadata.keys()
-        keys.sort()
-        for data in keys:
+        for data in sorted(metadata.keys()):
             value = metadata[data]
             ordered_data.append((data, value))
         return ordered_data
@@ -436,18 +512,12 @@
         """
         offsets = []
         entries = self.translated_entries()
+
         # the keys are sorted in the .mo file
         def cmp(_self, other):
             # msgfmt compares entries with msgctxt if it exists
-            if _self.msgctxt:
-                self_msgid = _self.msgctxt
-            else:
-                self_msgid = _self.msgid
-
-            if other.msgctxt:
-                other_msgid = other.msgctxt
-            else:
-                other_msgid = other.msgid
+            self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
+            other_msgid = other.msgctxt and other.msgctxt or other.msgid
             if self_msgid > other_msgid:
                 return 1
             elif self_msgid < other_msgid:
@@ -455,25 +525,23 @@
             else:
                 return 0
         # add metadata entry
-        entries.sort(cmp)
+        entries.sort(key=lambda o: o.msgctxt or o.msgid)
         mentry = self.metadata_as_entry()
         #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
         entries = [mentry] + entries
         entries_len = len(entries)
-        ids, strs = '', ''
+        ids, strs = b(''), b('')
         for e in entries:
             # For each string, we need size and file offset.  Each string is
             # NUL terminated; the NUL does not count into the size.
-            msgid = ''
+            msgid = b('')
             if e.msgctxt:
                 # Contexts are stored by storing the concatenation of the
                 # context, a <EOT> byte, and the original string
                 msgid = self._encode(e.msgctxt + '\4')
             if e.msgid_plural:
-                indexes = e.msgstr_plural.keys()
-                indexes.sort()
                 msgstr = []
-                for index in indexes:
+                for index in sorted(e.msgstr_plural.keys()):
                     msgstr.append(e.msgstr_plural[index])
                 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
                 msgstr = self._encode('\0'.join(msgstr))
@@ -481,11 +549,11 @@
                 msgid += self._encode(e.msgid)
                 msgstr = self._encode(e.msgstr)
             offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
-            ids  += msgid  + '\0'
-            strs += msgstr + '\0'
+            ids += msgid + b('\0')
+            strs += msgstr + b('\0')
 
         # The header is 7 32-bit unsigned integers.
-        keystart = 7*4+16*entries_len
+        keystart = 7 * 4 + 16 * entries_len
         # and the values start after the keys
         valuestart = keystart + len(ids)
         koffsets = []
@@ -493,26 +561,30 @@
         # The string table first has the list of keys, then the list of values.
         # Each entry has first the size of the string, then the file offset.
         for o1, l1, o2, l2 in offsets:
-            koffsets += [l1, o1+keystart]
-            voffsets += [l2, o2+valuestart]
+            koffsets += [l1, o1 + keystart]
+            voffsets += [l2, o2 + valuestart]
         offsets = koffsets + voffsets
-        # check endianness for magic number
-        if struct.pack('@h', 1) == struct.pack('<h', 1):
-            magic_number = MOFile.LITTLE_ENDIAN
-        else:
-            magic_number = MOFile.BIG_ENDIAN
 
         output = struct.pack(
             "Iiiiiii",
-            magic_number,      # Magic number
-            0,                 # Version
-            entries_len,       # # of entries
-            7*4,               # start of key index
-            7*4+entries_len*8, # start of value index
-            0, keystart        # size and offset of hash table
-                               # Important: we don't use hash tables
+            # Magic number
+            MOFile.MAGIC,
+            # Version
+            0,
+            # number of entries
+            entries_len,
+            # start of key index
+            7 * 4,
+            # start of value index
+            7 * 4 + entries_len * 8,
+            # size and offset of hash table, we don't use hash tables
+            0, keystart
+
         )
-        output += array.array("i", offsets).tostring()
+        if PY3 and sys.version_info.minor > 1:  # python 3.2 or superior
+            output += array.array("i", offsets).tobytes()
+        else:
+            output += array.array("i", offsets).tostring()
         output += ids
         output += strs
         return output
@@ -522,13 +594,13 @@
         Encodes the given ``mixed`` argument with the file encoding if and
         only if it's an unicode string and returns the encoded string.
         """
-        if type(mixed) == types.UnicodeType:
-            return mixed.encode(self.encoding)
+        if isinstance(mixed, text_type):
+            mixed = mixed.encode(self.encoding)
         return mixed
-
 # }}}
 # class POFile {{{
 
+
 class POFile(_BaseFile):
     """
     Po (or Pot) file reader/writer.
@@ -542,13 +614,15 @@
         """
         ret, headers = '', self.header.split('\n')
         for header in headers:
-            if header[:1] in [',', ':']:
+            if not len(header):
+                ret += "#\n"
+            elif header[:1] in [',', ':']:
                 ret += '#%s\n' % header
             else:
                 ret += '# %s\n' % header
 
-        if type(ret) != types.UnicodeType:
-            ret = unicode(ret, self.encoding)
+        if not isinstance(ret, text_type):
+            ret = ret.decode(self.encoding)
 
         return ret + _BaseFile.__unicode__(self)
 
@@ -572,7 +646,7 @@
         if total == 0:
             return 100
         translated = len(self.translated_entries())
-        return int((100.00 / float(total)) * translated)
+        return int(translated * 100 / float(total))
 
     def translated_entries(self):
         """
@@ -584,7 +658,7 @@
         """
         Convenience method that returns the list of untranslated entries.
         """
-        return [e for e in self if not e.translated() and not e.obsolete \
+        return [e for e in self if not e.translated() and not e.obsolete
                 and not 'fuzzy' in e.flags]
 
     def fuzzy_entries(self):
@@ -615,28 +689,32 @@
         ``refpot``
             object POFile, the reference catalog.
         """
+        # Store entries in dict/set for faster access
+        self_entries = dict((entry.msgid, entry) for entry in self)
+        refpot_msgids = set(entry.msgid for entry in refpot)
+        # Merge entries that are in the refpot
         for entry in refpot:
-            e = self.find(entry.msgid, include_obsolete_entries=True)
+            e = self_entries.get(entry.msgid)
             if e is None:
                 e = POEntry()
                 self.append(e)
             e.merge(entry)
         # ok, now we must "obsolete" entries that are not in the refpot anymore
         for entry in self:
-            if refpot.find(entry.msgid) is None:
+            if entry.msgid not in refpot_msgids:
                 entry.obsolete = True
-
 # }}}
 # class MOFile {{{
 
+
 class MOFile(_BaseFile):
     """
     Mo file reader/writer.
     This class inherits the :class:`~polib._BaseFile` class and, by
     extension, the python ``list`` type.
     """
-    BIG_ENDIAN    = 0xde120495
-    LITTLE_ENDIAN = 0x950412de
+    MAGIC = 0x950412de
+    MAGIC_SWAPPED = 0xde120495
 
     def __init__(self, *args, **kwargs):
         """
@@ -698,10 +776,10 @@
         Convenience method to keep the same interface with POFile instances.
         """
         return []
-
 # }}}
 # class _BaseEntry {{{
 
+
 class _BaseEntry(object):
     """
     Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
@@ -753,12 +831,14 @@
         ret = []
         # write the msgctxt if any
         if self.msgctxt is not None:
-            ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth)
+            ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
+                                   wrapwidth)
         # write the msgid
         ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
         # write the msgid_plural if any
         if self.msgid_plural:
-            ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth)
+            ret += self._str_field("msgid_plural", delflag, "",
+                                   self.msgid_plural, wrapwidth)
         if self.msgstr_plural:
             # write the msgstr_plural if any
             msgstrs = self.msgstr_plural
@@ -767,30 +847,34 @@
             for index in keys:
                 msgstr = msgstrs[index]
                 plural_index = '[%s]' % index
-                ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth)
+                ret += self._str_field("msgstr", delflag, plural_index, msgstr,
+                                       wrapwidth)
         else:
             # otherwise write the msgstr
-            ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth)
+            ret += self._str_field("msgstr", delflag, "", self.msgstr,
+                                   wrapwidth)
         ret.append('')
-        ret = '\n'.join(ret)
-
-        if type(ret) != types.UnicodeType:
-            return unicode(ret, self.encoding)
+        ret = u('\n').join(ret)
         return ret
 
-    def __str__(self):
-        """
-        Returns the string representation of the entry.
-        """
-        return unicode(self).encode(self.encoding)
+    if PY3:
+        def __str__(self):
+            return self.__unicode__()
+    else:
+        def __str__(self):
+            """
+            Returns the string representation of the entry.
+            """
+            return unicode(self).encode(self.encoding)
 
     def __eq__(self, other):
-        return unicode(self) == unicode(other)
+        return str(self) == str(other)
 
-    def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78):
+    def _str_field(self, fieldname, delflag, plural_index, field,
+                   wrapwidth=78):
         lines = field.splitlines(True)
         if len(lines) > 1:
-            lines = [''] + lines # start with initial empty line
+            lines = [''] + lines  # start with initial empty line
         else:
             escaped_field = escape(field)
             specialchars_count = 0
@@ -804,9 +888,9 @@
             real_wrapwidth = wrapwidth - flength + specialchars_count
             if wrapwidth > 0 and len(field) > real_wrapwidth:
                 # Wrap the line but take field name into account
-                lines = [''] + [unescape(item) for item in textwrap.wrap(
+                lines = [''] + [unescape(item) for item in wrap(
                     escaped_field,
-                    wrapwidth - 2, # 2 for quotes ""
+                    wrapwidth - 2,  # 2 for quotes ""
                     drop_whitespace=False,
                     break_long_words=False
                 )]
@@ -818,13 +902,13 @@
 
         ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
                                 escape(lines.pop(0)))]
-        for mstr in lines:
-            ret.append('%s"%s"' % (delflag, escape(mstr)))
+        for line in lines:
+            ret.append('%s"%s"' % (delflag, escape(line)))
         return ret
-
 # }}}
 # class POEntry {{{
 
+
 class POEntry(_BaseEntry):
     """
     Represents a po file entry.
@@ -854,6 +938,9 @@
 
         ``previous_msgid_plural``
             string, the entry previous msgid_plural.
+
+        ``linenum``
+            integer, the line number of the entry
         """
         _BaseEntry.__init__(self, *args, **kwargs)
         self.comment = kwargs.get('comment', '')
@@ -863,6 +950,7 @@
         self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
         self.previous_msgid = kwargs.get('previous_msgid', None)
         self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
+        self.linenum = kwargs.get('linenum', None)
 
     def __unicode__(self, wrapwidth=78):
         """
@@ -879,7 +967,7 @@
             if val:
                 for comment in val.split('\n'):
                     if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
-                        ret += textwrap.wrap(
+                        ret += wrap(
                             comment,
                             wrapwidth,
                             initial_indent=c[1],
@@ -903,7 +991,7 @@
                 # what we want for filenames, so the dirty hack is to
                 # temporally replace hyphens with a char that a file cannot
                 # contain, like "*"
-                ret += [l.replace('*', '-') for l in textwrap.wrap(
+                ret += [l.replace('*', '-') for l in wrap(
                     filestr.replace('-', '*'),
                     wrapwidth,
                     initial_indent='#: ',
@@ -918,32 +1006,25 @@
             ret.append('#, %s' % ', '.join(self.flags))
 
         # previous context and previous msgid/msgid_plural
-        fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural']
+        fields = ['previous_msgctxt', 'previous_msgid',
+                  'previous_msgid_plural']
         for f in fields:
             val = getattr(self, f)
             if val:
                 ret += self._str_field(f, "#| ", "", val, wrapwidth)
 
         ret.append(_BaseEntry.__unicode__(self, wrapwidth))
-        ret = '\n'.join(ret)
+        ret = u('\n').join(ret)
 
-        if type(ret) != types.UnicodeType:
-            return unicode(ret, self.encoding)
+        assert isinstance(ret, text_type)
+        #if type(ret) != types.UnicodeType:
+        #    return unicode(ret, self.encoding)
         return ret
 
     def __cmp__(self, other):
         """
         Called by comparison operations if rich comparison is not defined.
         """
-        def compare_occurrences(a, b):
-            """
-            Compare an entry occurrence with another one.
-            """
-            if a[0] != b[0]:
-                return a[0] < b[0]
-            if a[1] != b[1]:
-                return a[1] < b[1]
-            return 0
 
         # First: Obsolete test
         if self.obsolete != other.obsolete:
@@ -952,12 +1033,8 @@
             else:
                 return 1
         # Work on a copy to protect original
-        occ1 = self.occurrences[:]
-        occ2 = other.occurrences[:]
-        # Sorting using compare method
-        occ1.sort(compare_occurrences)
-        occ2.sort(compare_occurrences)
-        # Comparing sorted occurrences
+        occ1 = sorted(self.occurrences[:])
+        occ2 = sorted(other.occurrences[:])
         pos = 0
         for entry1 in occ1:
             try:
@@ -975,9 +1052,41 @@
                     return 1
                 else:
                     return -1
+        # Compare msgid_plural if set
+        if self.msgid_plural:
+            if not other.msgid_plural:
+                return 1
+            for pos in self.msgid_plural:
+                if pos not in other.msgid_plural:
+                    return 1
+                if self.msgid_plural[pos] > other.msgid_plural[pos]:
+                    return 1
+                if self.msgid_plural[pos] < other.msgid_plural[pos]:
+                    return -1
         # Finally: Compare message ID
-        if self.msgid > other.msgid: return 1
-        else: return -1
+        if self.msgid > other.msgid:
+            return 1
+        elif self.msgid < other.msgid:
+            return -1
+        return 0
+
+    def __gt__(self, other):
+        return self.__cmp__(other) > 0
+
+    def __lt__(self, other):
+        return self.__cmp__(other) < 0
+
+    def __ge__(self, other):
+        return self.__cmp__(other) >= 0
+
+    def __le__(self, other):
+        return self.__cmp__(other) <= 0
+
+    def __eq__(self, other):
+        return self.__cmp__(other) == 0
+
+    def __ne__(self, other):
+        return self.__cmp__(other) != 0
 
     def translated(self):
         """
@@ -1020,18 +1129,49 @@
                 except KeyError:
                     self.msgstr_plural[pos] = ''
 
+    def __hash__(self):
+        return hash((self.msgid, self.msgstr))
 # }}}
 # class MOEntry {{{
 
+
 class MOEntry(_BaseEntry):
     """
     Represents a mo file entry.
     """
-    pass
+    def __init__(self, *args, **kwargs):
+        """
+        Constructor, accepts the following keyword arguments,
+        for consistency with :class:`~polib.POEntry`:
+
+        ``comment``
+        ``tcomment``
+        ``occurrences``
+        ``flags``
+        ``previous_msgctxt``
+        ``previous_msgid``
+        ``previous_msgid_plural``
+
+        Note: even though these keyword arguments are accepted,
+        they hold no real meaning in the context of MO files
+        and are simply ignored.
+        """
+        _BaseEntry.__init__(self, *args, **kwargs)
+        self.comment = ''
+        self.tcomment = ''
+        self.occurrences = []
+        self.flags = []
+        self.previous_msgctxt = None
+        self.previous_msgid = None
+        self.previous_msgid_plural = None
+
+    def __hash__(self):
+        return hash((self.msgid, self.msgstr))
 
 # }}}
 # class _POFileParser {{{
 
+
 class _POFileParser(object):
     """
     A finite state machine to parse efficiently and correctly po
@@ -1056,23 +1196,27 @@
             file (optional, default: ``False``).
         """
         enc = kwargs.get('encoding', default_encoding)
-        if os.path.exists(pofile):
+        if _is_file(pofile):
             try:
-                self.fhandle = codecs.open(pofile, 'rU', enc)
+                self.fhandle = io.open(pofile, 'rt', encoding=enc)
             except LookupError:
                 enc = default_encoding
-                self.fhandle = codecs.open(pofile, 'rU', enc)
+                self.fhandle = io.open(pofile, 'rt', encoding=enc)
         else:
             self.fhandle = pofile.splitlines()
 
-        self.instance = POFile(
+        klass = kwargs.get('klass')
+        if klass is None:
+            klass = POFile
+        self.instance = klass(
             pofile=pofile,
             encoding=enc,
             check_for_duplicates=kwargs.get('check_for_duplicates', False)
         )
         self.transitions = {}
-        self.current_entry = POEntry()
-        self.current_state = 'ST'
+        self.current_line = 0
+        self.current_entry = POEntry(linenum=self.current_line)
+        self.current_state = 'st'
         self.current_token = None
         # two memo flags used in handlers
         self.msgstr_index = 0
@@ -1083,7 +1227,7 @@
         #     * HE: Header
         #     * TC: a translation comment
         #     * GC: a generated comment
-        #     * OC: a file/line occurence
+        #     * OC: a file/line occurrence
         #     * FL: a flags line
         #     * CT: a message context
         #     * PC: a previous msgctxt
@@ -1094,48 +1238,47 @@
         #     * MS: a msgstr
         #     * MX: a msgstr plural
         #     * MC: a msgid or msgstr continuation line
-        all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC',
-               'MS', 'MP', 'MX', 'MI']
+        all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
+               'ms', 'mp', 'mx', 'mi']
 
-        self.add('TC', ['ST', 'HE'],                                     'HE')
-        self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS',
-                        'MP', 'MX', 'MI'],                               'TC')
-        self.add('GC', all,                                              'GC')
-        self.add('OC', all,                                              'OC')
-        self.add('FL', all,                                              'FL')
-        self.add('PC', all,                                              'PC')
-        self.add('PM', all,                                              'PM')
-        self.add('PP', all,                                              'PP')
-        self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM',
-                        'PP', 'MS', 'MX'],                               'CT')
-        self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC',
-                 'PM', 'PP', 'MS', 'MX'],                                'MI')
-        self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'],             'MP')
-        self.add('MS', ['MI', 'MP', 'TC'],                               'MS')
-        self.add('MX', ['MI', 'MX', 'MP', 'TC'],                         'MX')
-        self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC')
+        self.add('tc', ['st', 'he'],                                     'he')
+        self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
+                        'mp', 'mx', 'mi'],                               'tc')
+        self.add('gc', all,                                              'gc')
+        self.add('oc', all,                                              'oc')
+        self.add('fl', all,                                              'fl')
+        self.add('pc', all,                                              'pc')
+        self.add('pm', all,                                              'pm')
+        self.add('pp', all,                                              'pp')
+        self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
+                        'pp', 'ms', 'mx'],                               'ct')
+        self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
+                 'pm', 'pp', 'ms', 'mx'],                                'mi')
+        self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'],             'mp')
+        self.add('ms', ['mi', 'mp', 'tc'],                               'ms')
+        self.add('mx', ['mi', 'mx', 'mp', 'tc'],                         'mx')
+        self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
 
     def parse(self):
         """
         Run the state machine, parse the file line by line and call process()
         with the current matched symbol.
         """
-        i = 0
 
         keywords = {
-            'msgctxt': 'CT',
-            'msgid': 'MI',
-            'msgstr': 'MS',
-            'msgid_plural': 'MP',
+            'msgctxt': 'ct',
+            'msgid': 'mi',
+            'msgstr': 'ms',
+            'msgid_plural': 'mp',
         }
         prev_keywords = {
-            'msgid_plural': 'PP',
-            'msgid': 'PM',
-            'msgctxt': 'PC',
+            'msgid_plural': 'pp',
+            'msgid': 'pm',
+            'msgctxt': 'pc',
         }
-
+        tokens = []
         for line in self.fhandle:
-            i += 1
+            self.current_line += 1
             line = line.strip()
             if line == '':
                 continue
@@ -1143,6 +1286,9 @@
             tokens = line.split(None, 2)
             nb_tokens = len(tokens)
 
+            if tokens[0] == '#~|':
+                continue
+
             if tokens[0] == '#~' and nb_tokens > 1:
                 line = line[3:].strip()
                 tokens = tokens[1:]
@@ -1155,41 +1301,56 @@
             # msgid, msgid_plural, msgctxt & msgstr.
             if tokens[0] in keywords and nb_tokens > 1:
                 line = line[len(tokens[0]):].lstrip()
+                if re.search(r'([^\\]|^)"', line[1:-1]):
+                    raise IOError('Syntax error in po file %s (line %s): '
+                                  'unescaped double quote found' %
+                                  (self.instance.fpath, self.current_line))
                 self.current_token = line
-                self.process(keywords[tokens[0]], i)
+                self.process(keywords[tokens[0]])
                 continue
 
             self.current_token = line
 
-            if tokens[0] == '#:' and nb_tokens > 1:
+            if tokens[0] == '#:':
+                if nb_tokens <= 1:
+                    continue
                 # we are on a occurrences line
-                self.process('OC', i)
+                self.process('oc')
 
             elif line[:1] == '"':
                 # we are on a continuation line
-                self.process('MC', i)
+                if re.search(r'([^\\]|^)"', line[1:-1]):
+                    raise IOError('Syntax error in po file %s (line %s): '
+                                  'unescaped double quote found' %
+                                  (self.instance.fpath, self.current_line))
+                self.process('mc')
 
             elif line[:7] == 'msgstr[':
                 # we are on a msgstr plural
-                self.process('MX', i)
+                self.process('mx')
 
-            elif tokens[0] == '#,' and nb_tokens > 1:
+            elif tokens[0] == '#,':
+                if nb_tokens <= 1:
+                    continue
                 # we are on a flags line
-                self.process('FL', i)
+                self.process('fl')
 
-            elif tokens[0] == '#':
-                if line == '#': line += ' '
+            elif tokens[0] == '#' or tokens[0].startswith('##'):
+                if line == '#':
+                    line += ' '
                 # we are on a translator comment line
-                self.process('TC', i)
+                self.process('tc')
 
-            elif tokens[0] == '#.' and nb_tokens > 1:
+            elif tokens[0] == '#.':
+                if nb_tokens <= 1:
+                    continue
                 # we are on a generated comment line
-                self.process('GC', i)
+                self.process('gc')
 
             elif tokens[0] == '#|':
-                if nb_tokens < 2:
-                    self.process('??', i)
-                    continue
+                if nb_tokens <= 1:
+                    raise IOError('Syntax error in po file %s (line %s)' %
+                                  (self.instance.fpath, self.current_line))
 
                 # Remove the marker and any whitespace right after that.
                 line = line[2:].lstrip()
@@ -1197,48 +1358,57 @@
 
                 if tokens[1].startswith('"'):
                     # Continuation of previous metadata.
-                    self.process('MC', i)
+                    self.process('mc')
                     continue
 
                 if nb_tokens == 2:
                     # Invalid continuation line.
-                    self.process('??', i)
+                    raise IOError('Syntax error in po file %s (line %s): '
+                                  'invalid continuation line' %
+                                  (self.instance.fpath, self.current_line))
 
                 # we are on a "previous translation" comment line,
                 if tokens[1] not in prev_keywords:
                     # Unknown keyword in previous translation comment.
-                    self.process('??', i)
+                    raise IOError('Syntax error in po file %s (line %s): '
+                                  'unknown keyword %s' %
+                                  (self.instance.fpath, self.current_line,
+                                   tokens[1]))
 
                 # Remove the keyword and any whitespace
                 # between it and the starting quote.
                 line = line[len(tokens[1]):].lstrip()
                 self.current_token = line
-                self.process(prev_keywords[tokens[1]], i)
+                self.process(prev_keywords[tokens[1]])
 
             else:
-                self.process('??', i)
+                raise IOError('Syntax error in po file %s (line %s)' %
+                              (self.instance.fpath, self.current_line))
 
-        if self.current_entry:
+        if self.current_entry and len(tokens) > 0 and \
+           not tokens[0].startswith('#'):
             # since entries are added when another entry is found, we must add
-            # the last entry here (only if there are lines)
+            # the last entry here (only if there are lines). Trailing comments
+            # are ignored
             self.instance.append(self.current_entry)
+
         # before returning the instance, check if there's metadata and if
         # so extract it in a dict
-        firstentry = self.instance[0]
-        if firstentry.msgid == '': # metadata found
+        metadataentry = self.instance.find('')
+        if metadataentry:  # metadata found
             # remove the entry
-            firstentry = self.instance.pop(0)
-            self.instance.metadata_is_fuzzy = firstentry.flags
+            self.instance.remove(metadataentry)
+            self.instance.metadata_is_fuzzy = metadataentry.flags
             key = None
-            for msg in firstentry.msgstr.splitlines():
+            for msg in metadataentry.msgstr.splitlines():
                 try:
                     key, val = msg.split(':', 1)
                     self.instance.metadata[key] = val.strip()
-                except:
+                except (ValueError, KeyError):
                     if key is not None:
-                        self.instance.metadata[key] += '\n'+ msg.strip()
+                        self.instance.metadata[key] += '\n' + msg.strip()
         # close opened file
-        if isinstance(self.fhandle, file):
+        if not isinstance(self.fhandle, list):  # must be file
             self.fhandle.close()
         return self.instance
 
@@ -1258,10 +1428,10 @@
             the next state the fsm will have after the action.
         """
         for state in states:
-            action = getattr(self, 'handle_%s' % next_state.lower())
+            action = getattr(self, 'handle_%s' % next_state)
             self.transitions[(symbol, state)] = (action, next_state)
 
-    def process(self, symbol, linenum):
+    def process(self, symbol):
         """
         Process the transition corresponding to the current state and the
         symbol provided.
@@ -1278,8 +1448,9 @@
             (action, state) = self.transitions[(symbol, self.current_state)]
             if action():
                 self.current_state = state
-        except Exception as exc:
-            raise IOError('Syntax error in po file (line %s)' % linenum)
+        except Exception:
+            raise IOError('Syntax error in po file (line %s)' %
+                          self.current_line)
 
     # state handlers
 
@@ -1292,90 +1463,94 @@
 
     def handle_tc(self):
         """Handle a translator comment."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         if self.current_entry.tcomment != '':
             self.current_entry.tcomment += '\n'
-        self.current_entry.tcomment += self.current_token[2:]
+        tcomment = self.current_token.lstrip('#')
+        if tcomment.startswith(' '):
+            tcomment = tcomment[1:]
+        self.current_entry.tcomment += tcomment
         return True
 
     def handle_gc(self):
         """Handle a generated comment."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         if self.current_entry.comment != '':
             self.current_entry.comment += '\n'
         self.current_entry.comment += self.current_token[3:]
         return True
 
     def handle_oc(self):
-        """Handle a file:num occurence."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        """Handle a file:num occurrence."""
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         occurrences = self.current_token[3:].split()
         for occurrence in occurrences:
             if occurrence != '':
                 try:
                     fil, line = occurrence.split(':')
                     if not line.isdigit():
-                        fil  = fil + line
+                        fil = fil + line
                         line = ''
                     self.current_entry.occurrences.append((fil, line))
-                except:
+                except (ValueError, AttributeError):
                     self.current_entry.occurrences.append((occurrence, ''))
         return True
 
     def handle_fl(self):
         """Handle a flags line."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
-        self.current_entry.flags += self.current_token[3:].split(', ')
+            self.current_entry = POEntry(linenum=self.current_line)
+        self.current_entry.flags += [c.strip() for c in
+                                     self.current_token[3:].split(',')]
         return True
 
     def handle_pp(self):
         """Handle a previous msgid_plural line."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         self.current_entry.previous_msgid_plural = \
             unescape(self.current_token[1:-1])
         return True
 
     def handle_pm(self):
         """Handle a previous msgid line."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         self.current_entry.previous_msgid = \
             unescape(self.current_token[1:-1])
         return True
 
     def handle_pc(self):
         """Handle a previous msgctxt line."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         self.current_entry.previous_msgctxt = \
             unescape(self.current_token[1:-1])
         return True
 
     def handle_ct(self):
         """Handle a msgctxt."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         self.current_entry.msgctxt = unescape(self.current_token[1:-1])
         return True
 
     def handle_mi(self):
         """Handle a msgid."""
-        if self.current_state in ['MC', 'MS', 'MX']:
+        if self.current_state in ['mc', 'ms', 'mx']:
             self.instance.append(self.current_entry)
-            self.current_entry = POEntry()
+            self.current_entry = POEntry(linenum=self.current_line)
         self.current_entry.obsolete = self.entry_obsolete
         self.current_entry.msgid = unescape(self.current_token[1:-1])
         return True
@@ -1392,47 +1567,37 @@
 
     def handle_mx(self):
         """Handle a msgstr plural."""
-        index, value = self.current_token[7], self.current_token[11:-1]
-        self.current_entry.msgstr_plural[index] = unescape(value)
-        self.msgstr_index = index
+        index = self.current_token[7]
+        value = self.current_token[self.current_token.find('"') + 1:-1]
+        self.current_entry.msgstr_plural[int(index)] = unescape(value)
+        self.msgstr_index = int(index)
         return True
 
     def handle_mc(self):
         """Handle a msgid or msgstr continuation line."""
         token = unescape(self.current_token[1:-1])
-        if self.current_state == 'CT':
-            typ = 'msgctxt'
+        if self.current_state == 'ct':
             self.current_entry.msgctxt += token
-        elif self.current_state == 'MI':
-            typ = 'msgid'
+        elif self.current_state == 'mi':
             self.current_entry.msgid += token
-        elif self.current_state == 'MP':
-            typ = 'msgid_plural'
+        elif self.current_state == 'mp':
             self.current_entry.msgid_plural += token
-        elif self.current_state == 'MS':
-            typ = 'msgstr'
+        elif self.current_state == 'ms':
             self.current_entry.msgstr += token
-        elif self.current_state == 'MX':
-            typ = 'msgstr[%s]' % self.msgstr_index
+        elif self.current_state == 'mx':
             self.current_entry.msgstr_plural[self.msgstr_index] += token
-        elif self.current_state == 'PP':
-            typ = 'previous_msgid_plural'
-            token = token[3:]
+        elif self.current_state == 'pp':
             self.current_entry.previous_msgid_plural += token
-        elif self.current_state == 'PM':
-            typ = 'previous_msgid'
-            token = token[3:]
+        elif self.current_state == 'pm':
             self.current_entry.previous_msgid += token
-        elif self.current_state == 'PC':
-            typ = 'previous_msgctxt'
-            token = token[3:]
+        elif self.current_state == 'pc':
             self.current_entry.previous_msgctxt += token
         # don't change the current state
         return False
-
 # }}}
 # class _MOFileParser {{{
 
+
 class _MOFileParser(object):
     """
     A class to parse binary mo files.
@@ -1456,12 +1621,24 @@
             file (optional, default: ``False``).
         """
         self.fhandle = open(mofile, 'rb')
-        self.instance = MOFile(
+
+        klass = kwargs.get('klass')
+        if klass is None:
+            klass = MOFile
+        self.instance = klass(
             fpath=mofile,
             encoding=kwargs.get('encoding', default_encoding),
             check_for_duplicates=kwargs.get('check_for_duplicates', False)
         )
 
+    def __del__(self):
+        """
+        Make sure the file is closed; this prevents unclosed-file warnings
+        when running tests with python >= 3.2.
+        """
+        if self.fhandle:
+            self.fhandle.close()
+
     def parse(self):
         """
         Build the instance with the file handle provided in the
@@ -1469,15 +1646,20 @@
         """
         # parse magic number
         magic_number = self._readbinary('<I', 4)
-        if magic_number == MOFile.LITTLE_ENDIAN:
+        if magic_number == MOFile.MAGIC:
             ii = '<II'
-        elif magic_number == MOFile.BIG_ENDIAN:
+        elif magic_number == MOFile.MAGIC_SWAPPED:
             ii = '>II'
         else:
             raise IOError('Invalid mo file, magic number is incorrect !')
         self.instance.magic_number = magic_number
         # parse the version number and the number of strings
-        self.instance.version, numofstrings = self._readbinary(ii, 8)
+        version, numofstrings = self._readbinary(ii, 8)
+        # from MO file format specs: "A program seeing an unexpected major
+        # revision number should stop reading the MO file entirely"
+        if version not in (0, 1):
+            raise IOError('Invalid mo file, unexpected major revision number')
+        self.instance.version = version
         # original strings and translation strings hash table offset
         msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
         # move to msgid hash table and read length and offset of msgids
@@ -1491,29 +1673,34 @@
         for i in range(numofstrings):
             msgstrs_index.append(self._readbinary(ii, 8))
         # build entries
+        encoding = self.instance.encoding
         for i in range(numofstrings):
             self.fhandle.seek(msgids_index[i][1])
             msgid = self.fhandle.read(msgids_index[i][0])
+
             self.fhandle.seek(msgstrs_index[i][1])
             msgstr = self.fhandle.read(msgstrs_index[i][0])
-            if i == 0: # metadata
-                raw_metadata, metadata = msgstr.split('\n'), {}
+            if i == 0 and not msgid:  # metadata
+                raw_metadata, metadata = msgstr.split(b('\n')), {}
                 for line in raw_metadata:
-                    tokens = line.split(':', 1)
-                    if tokens[0] != '':
+                    tokens = line.split(b(':'), 1)
+                    if tokens[0] != b(''):
                         try:
-                            metadata[tokens[0]] = tokens[1].strip()
+                            k = tokens[0].decode(encoding)
+                            v = tokens[1].decode(encoding)
+                            metadata[k] = v.strip()
                         except IndexError:
-                            metadata[tokens[0]] = ''
+                            metadata[k] = u('')
                 self.instance.metadata = metadata
                 continue
             # test if we have a plural entry
-            msgid_tokens = msgid.split('\0')
+            msgid_tokens = msgid.split(b('\0'))
             if len(msgid_tokens) > 1:
                 entry = self._build_entry(
                     msgid=msgid_tokens[0],
                     msgid_plural=msgid_tokens[1],
-                    msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0')))
+                    msgstr_plural=dict((k, v) for k, v in
+                                       enumerate(msgstr.split(b('\0'))))
                 )
             else:
                 entry = self._build_entry(msgid=msgid, msgstr=msgstr)
@@ -1524,19 +1711,22 @@
 
     def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                      msgstr_plural=None):
-        msgctxt_msgid = msgid.split('\x04')
+        msgctxt_msgid = msgid.split(b('\x04'))
+        encoding = self.instance.encoding
         if len(msgctxt_msgid) > 1:
             kwargs = {
-                'msgctxt': msgctxt_msgid[0],
-                'msgid'  : msgctxt_msgid[1],
+                'msgctxt': msgctxt_msgid[0].decode(encoding),
+                'msgid': msgctxt_msgid[1].decode(encoding),
             }
         else:
-            kwargs = {'msgid': msgid}
+            kwargs = {'msgid': msgid.decode(encoding)}
         if msgstr:
-            kwargs['msgstr'] = msgstr
+            kwargs['msgstr'] = msgstr.decode(encoding)
         if msgid_plural:
-            kwargs['msgid_plural'] = msgid_plural
+            kwargs['msgid_plural'] = msgid_plural.decode(encoding)
         if msgstr_plural:
+            for k in msgstr_plural:
+                msgstr_plural[k] = msgstr_plural[k].decode(encoding)
             kwargs['msgstr_plural'] = msgstr_plural
         return MOEntry(**kwargs)
 
@@ -1550,5 +1740,99 @@
         if len(tup) == 1:
             return tup[0]
         return tup
+# }}}
+# class TextWrapper {{{
+
+
+class TextWrapper(textwrap.TextWrapper):
+    """
+    Subclass of textwrap.TextWrapper that backports the
+    drop_whitespace option.
+    """
+    def __init__(self, *args, **kwargs):
+        drop_whitespace = kwargs.pop('drop_whitespace', True)
+        textwrap.TextWrapper.__init__(self, *args, **kwargs)
+        self.drop_whitespace = drop_whitespace
+
+    def _wrap_chunks(self, chunks):
+        """_wrap_chunks(chunks : [string]) -> [string]
+
+        Wrap a sequence of text chunks and return a list of lines of
+        length 'self.width' or less.  (If 'break_long_words' is false,
+        some lines may be longer than this.)  Chunks correspond roughly
+        to words and the whitespace between them: each chunk is
+        indivisible (modulo 'break_long_words'), but a line break can
+        come between any two chunks.  Chunks should not have internal
+        whitespace; ie. a chunk is either all whitespace or a "word".
+        Whitespace chunks will be removed from the beginning and end of
+        lines, but apart from that whitespace is preserved.
+        """
+        lines = []
+        if self.width <= 0:
+            raise ValueError("invalid width %r (must be > 0)" % self.width)
+
+        # Arrange in reverse order so items can be efficiently popped
+        # from a stack of chunks.
+        chunks.reverse()
+
+        while chunks:
+
+            # Start the list of chunks that will make up the current line.
+            # cur_len is just the length of all the chunks in cur_line.
+            cur_line = []
+            cur_len = 0
+
+            # Figure out which static string will prefix this line.
+            if lines:
+                indent = self.subsequent_indent
+            else:
+                indent = self.initial_indent
+
+            # Maximum width for this line.
+            width = self.width - len(indent)
+
+            # First chunk on line is whitespace -- drop it, unless this
+            # is the very beginning of the text (ie. no lines started yet).
+            if self.drop_whitespace and chunks[-1].strip() == '' and lines:
+                del chunks[-1]
+
+            while chunks:
+                l = len(chunks[-1])
+
+                # Can at least squeeze this chunk onto the current line.
+                if cur_len + l <= width:
+                    cur_line.append(chunks.pop())
+                    cur_len += l
+
+                # Nope, this line is full.
+                else:
+                    break
+
+            # The current line is full, and the next chunk is too big to
+            # fit on *any* line (not just this one).
+            if chunks and len(chunks[-1]) > width:
+                self._handle_long_word(chunks, cur_line, cur_len, width)
+
+            # If the last chunk on this line is all whitespace, drop it.
+            if self.drop_whitespace and cur_line and not cur_line[-1].strip():
+                del cur_line[-1]
+
+            # Convert current line back to a string and store it in list
+            # of all lines (return value).
+            if cur_line:
+                lines.append(indent + ''.join(cur_line))
+
+        return lines
+# }}}
+# function wrap() {{{
+
+
+def wrap(text, width=70, **kwargs):
+    """
+    Wrap a single paragraph of text, returning a list of wrapped lines.
+    """
+    if sys.version_info < (2, 6):
+        return TextWrapper(width=width, **kwargs).wrap(text)
+    return textwrap.wrap(text, width=width, **kwargs)
 
 # }}}