comparison i18n/polib.py @ 40185:19fc5a986669

polib: update to latest release 1.0.7 (upstream rev d75ce6dbbc2a) # no-check-commit third-party code that doesn't match our style Differential Revision: https://phab.mercurial-scm.org/D5001
author Augie Fackler <augie@google.com>
date Fri, 12 Oct 2018 11:44:27 -0400
parents a7310a477966
children 2372284d9457
comparison
equal deleted inserted replaced
40184:c3b7d9c54edd 40185:19fc5a986669
1 # -*- coding: utf-8 -*-
2 # no-check-code 1 # no-check-code
2 # -* coding: utf-8 -*-
3 # 3 #
4 # License: MIT (see LICENSE file provided) 4 # License: MIT (see LICENSE file provided)
5 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4: 5 # vim: set expandtab tabstop=4 shiftwidth=4 softtabstop=4:
6 6
7 """ 7 """
13 :func:`~polib.mofile` convenience functions. 13 :func:`~polib.mofile` convenience functions.
14 """ 14 """
15 15
16 from __future__ import absolute_import 16 from __future__ import absolute_import
17 17
18 __author__ = 'David Jean Louis <izimobil@gmail.com>' 18 __author__ = 'David Jean Louis <izimobil@gmail.com>'
19 __version__ = '0.6.4' 19 __version__ = '1.0.7'
20 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry', 20 __all__ = ['pofile', 'POFile', 'POEntry', 'mofile', 'MOFile', 'MOEntry',
21 'detect_encoding', 'escape', 'unescape', 'detect_encoding',] 21 'default_encoding', 'escape', 'unescape', 'detect_encoding', ]
22 22
23 import array 23 import array
24 import codecs 24 import codecs
25 import os 25 import os
26 import re 26 import re
27 import struct 27 import struct
28 import sys 28 import sys
29 import textwrap 29 import textwrap
30 import types 30
31 try:
32 import io
33 except ImportError:
34 # replacement of io.open() for python < 2.6
35 # we use codecs instead
36 class io(object):
37 @staticmethod
38 def open(fpath, mode='r', encoding=None):
39 return codecs.open(fpath, mode, encoding)
31 40
32 41
33 # the default encoding to use when encoding cannot be detected 42 # the default encoding to use when encoding cannot be detected
34 default_encoding = 'utf-8' 43 default_encoding = 'utf-8'
35 44
45 # python 2/3 compatibility helpers {{{
46
47
48 if sys.version_info[:2] < (3, 0):
49 PY3 = False
50 text_type = unicode
51
52 def b(s):
53 return s
54
55 def u(s):
56 return unicode(s, "unicode_escape")
57
58 else:
59 PY3 = True
60 text_type = str
61
62 def b(s):
63 return s.encode("latin-1")
64
65 def u(s):
66 return s
67 # }}}
36 # _pofile_or_mofile {{{ 68 # _pofile_or_mofile {{{
69
37 70
38 def _pofile_or_mofile(f, type, **kwargs): 71 def _pofile_or_mofile(f, type, **kwargs):
39 """ 72 """
40 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to 73 Internal function used by :func:`polib.pofile` and :func:`polib.mofile` to
41 honor the DRY concept. 74 honor the DRY concept.
48 # parse the file 81 # parse the file
49 kls = type == 'pofile' and _POFileParser or _MOFileParser 82 kls = type == 'pofile' and _POFileParser or _MOFileParser
50 parser = kls( 83 parser = kls(
51 f, 84 f,
52 encoding=enc, 85 encoding=enc,
53 check_for_duplicates=kwargs.get('check_for_duplicates', False) 86 check_for_duplicates=kwargs.get('check_for_duplicates', False),
87 klass=kwargs.get('klass')
54 ) 88 )
55 instance = parser.parse() 89 instance = parser.parse()
56 instance.wrapwidth = kwargs.get('wrapwidth', 78) 90 instance.wrapwidth = kwargs.get('wrapwidth', 78)
57 return instance 91 return instance
58 92 # }}}
93 # _is_file {{{
94
95
96 def _is_file(filename_or_contents):
97 """
98 Safely returns the value of os.path.exists(filename_or_contents).
99
100 Arguments:
101
102 ``filename_or_contents``
103 either a filename, or a string holding the contents of some file.
104 In the latter case, this function will always return False.
105 """
106 try:
107 return os.path.exists(filename_or_contents)
108 except (ValueError, UnicodeEncodeError):
109 return False
59 # }}} 110 # }}}
60 # function pofile() {{{ 111 # function pofile() {{{
112
61 113
62 def pofile(pofile, **kwargs): 114 def pofile(pofile, **kwargs):
63 """ 115 """
64 Convenience function that parses the po or pot file ``pofile`` and returns 116 Convenience function that parses the po or pot file ``pofile`` and returns
65 a :class:`~polib.POFile` instance. 117 a :class:`~polib.POFile` instance.
78 encoding will be auto-detected). 130 encoding will be auto-detected).
79 131
80 ``check_for_duplicates`` 132 ``check_for_duplicates``
81 whether to check for duplicate entries when adding entries to the 133 whether to check for duplicate entries when adding entries to the
82 file (optional, default: ``False``). 134 file (optional, default: ``False``).
135
136 ``klass``
137 class which is used to instantiate the return value (optional,
138 default: ``None``, the return value will be a :class:`~polib.POFile`
139 instance).
83 """ 140 """
84 return _pofile_or_mofile(pofile, 'pofile', **kwargs) 141 return _pofile_or_mofile(pofile, 'pofile', **kwargs)
85
86 # }}} 142 # }}}
87 # function mofile() {{{ 143 # function mofile() {{{
144
88 145
89 def mofile(mofile, **kwargs): 146 def mofile(mofile, **kwargs):
90 """ 147 """
91 Convenience function that parses the mo file ``mofile`` and returns a 148 Convenience function that parses the mo file ``mofile`` and returns a
92 :class:`~polib.MOFile` instance. 149 :class:`~polib.MOFile` instance.
106 encoding will be auto-detected). 163 encoding will be auto-detected).
107 164
108 ``check_for_duplicates`` 165 ``check_for_duplicates``
109 whether to check for duplicate entries when adding entries to the 166 whether to check for duplicate entries when adding entries to the
110 file (optional, default: ``False``). 167 file (optional, default: ``False``).
168
169 ``klass``
170 class which is used to instantiate the return value (optional,
171 default: ``None``, the return value will be a :class:`~polib.MOFile`
172 instance).
111 """ 173 """
112 return _pofile_or_mofile(mofile, 'mofile', **kwargs) 174 return _pofile_or_mofile(mofile, 'mofile', **kwargs)
113
114 # }}} 175 # }}}
115 # function detect_encoding() {{{ 176 # function detect_encoding() {{{
177
116 178
117 def detect_encoding(file, binary_mode=False): 179 def detect_encoding(file, binary_mode=False):
118 """ 180 """
119 Try to detect the encoding used by the ``file``. The ``file`` argument can 181 Try to detect the encoding used by the ``file``. The ``file`` argument can
120 be a PO or MO file path or a string containing the contents of the file. 182 be a PO or MO file path or a string containing the contents of the file.
127 string, full or relative path to the po/mo file or its content. 189 string, full or relative path to the po/mo file or its content.
128 190
129 ``binary_mode`` 191 ``binary_mode``
130 boolean, set this to True if ``file`` is a mo file. 192 boolean, set this to True if ``file`` is a mo file.
131 """ 193 """
132 rx = re.compile(r'"?Content-Type:.+? charset=([\w_\-:\.]+)') 194 PATTERN = r'"?Content-Type:.+? charset=([\w_\-:\.]+)'
195 rxt = re.compile(u(PATTERN))
196 rxb = re.compile(b(PATTERN))
133 197
134 def charset_exists(charset): 198 def charset_exists(charset):
135 """Check whether ``charset`` is valid or not.""" 199 """Check whether ``charset`` is valid or not."""
136 try: 200 try:
137 codecs.lookup(charset) 201 codecs.lookup(charset)
138 except LookupError: 202 except LookupError:
139 return False 203 return False
140 return True 204 return True
141 205
142 if not os.path.exists(file): 206 if not _is_file(file):
143 match = rx.search(file) 207 match = rxt.search(file)
144 if match: 208 if match:
145 enc = match.group(1).strip() 209 enc = match.group(1).strip()
146 if charset_exists(enc): 210 if charset_exists(enc):
147 return enc 211 return enc
148 else: 212 else:
149 if binary_mode: 213 # For PY3, always treat as binary
214 if binary_mode or PY3:
150 mode = 'rb' 215 mode = 'rb'
216 rx = rxb
151 else: 217 else:
152 mode = 'r' 218 mode = 'r'
219 rx = rxt
153 f = open(file, mode) 220 f = open(file, mode)
154 for l in f.readlines(): 221 for l in f.readlines():
155 match = rx.search(l) 222 match = rx.search(l)
156 if match: 223 if match:
157 f.close() 224 f.close()
158 enc = match.group(1).strip() 225 enc = match.group(1).strip()
226 if not isinstance(enc, text_type):
227 enc = enc.decode('utf-8')
159 if charset_exists(enc): 228 if charset_exists(enc):
160 return enc 229 return enc
161 f.close() 230 f.close()
162 return default_encoding 231 return default_encoding
163
164 # }}} 232 # }}}
165 # function escape() {{{ 233 # function escape() {{{
234
166 235
167 def escape(st): 236 def escape(st):
168 """ 237 """
169 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 238 Escapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
170 the given string ``st`` and returns it. 239 the given string ``st`` and returns it.
172 return st.replace('\\', r'\\')\ 241 return st.replace('\\', r'\\')\
173 .replace('\t', r'\t')\ 242 .replace('\t', r'\t')\
174 .replace('\r', r'\r')\ 243 .replace('\r', r'\r')\
175 .replace('\n', r'\n')\ 244 .replace('\n', r'\n')\
176 .replace('\"', r'\"') 245 .replace('\"', r'\"')
177
178 # }}} 246 # }}}
179 # function unescape() {{{ 247 # function unescape() {{{
248
180 249
181 def unescape(st): 250 def unescape(st):
182 """ 251 """
183 Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in 252 Unescapes the characters ``\\\\``, ``\\t``, ``\\n``, ``\\r`` and ``"`` in
184 the given string ``st`` and returns it. 253 the given string ``st`` and returns it.
191 return '\t' 260 return '\t'
192 if m == 'r': 261 if m == 'r':
193 return '\r' 262 return '\r'
194 if m == '\\': 263 if m == '\\':
195 return '\\' 264 return '\\'
196 return m # handles escaped double quote 265 return m # handles escaped double quote
197 return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st) 266 return re.sub(r'\\(\\|n|t|r|")', unescape_repl, st)
198
199 # }}} 267 # }}}
200 # class _BaseFile {{{ 268 # class _BaseFile {{{
269
201 270
202 class _BaseFile(list): 271 class _BaseFile(list):
203 """ 272 """
204 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile` 273 Common base class for the :class:`~polib.POFile` and :class:`~polib.MOFile`
205 classes. This class should **not** be instantiated directly. 274 classes. This class should **not** be instantiated directly.
225 file, (optional, default: ``False``). 294 file, (optional, default: ``False``).
226 """ 295 """
227 list.__init__(self) 296 list.__init__(self)
228 # the opened file handle 297 # the opened file handle
229 pofile = kwargs.get('pofile', None) 298 pofile = kwargs.get('pofile', None)
230 if pofile and os.path.exists(pofile): 299 if pofile and _is_file(pofile):
231 self.fpath = pofile 300 self.fpath = pofile
232 else: 301 else:
233 self.fpath = kwargs.get('fpath') 302 self.fpath = kwargs.get('fpath')
234 # the width at which lines should be wrapped 303 # the width at which lines should be wrapped
235 self.wrapwidth = kwargs.get('wrapwidth', 78) 304 self.wrapwidth = kwargs.get('wrapwidth', 78)
252 [e for e in self if not e.obsolete] 321 [e for e in self if not e.obsolete]
253 for entry in entries: 322 for entry in entries:
254 ret.append(entry.__unicode__(self.wrapwidth)) 323 ret.append(entry.__unicode__(self.wrapwidth))
255 for entry in self.obsolete_entries(): 324 for entry in self.obsolete_entries():
256 ret.append(entry.__unicode__(self.wrapwidth)) 325 ret.append(entry.__unicode__(self.wrapwidth))
257 ret = '\n'.join(ret) 326 ret = u('\n').join(ret)
258 327
259 if type(ret) != types.UnicodeType: 328 assert isinstance(ret, text_type)
260 return unicode(ret, self.encoding) 329 #if type(ret) != text_type:
330 # return unicode(ret, self.encoding)
261 return ret 331 return ret
262 332
263 def __str__(self): 333 if PY3:
264 """ 334 def __str__(self):
265 Returns the string representation of the file. 335 return self.__unicode__()
266 """ 336 else:
267 return unicode(self).encode(self.encoding) 337 def __str__(self):
338 """
339 Returns the string representation of the file.
340 """
341 return unicode(self).encode(self.encoding)
268 342
269 def __contains__(self, entry): 343 def __contains__(self, entry):
270 """ 344 """
271 Overriden ``list`` method to implement the membership test (in and 345 Overridden ``list`` method to implement the membership test (in and
272 not in). 346 not in).
273 The method considers that an entry is in the file if it finds an entry 347 The method considers that an entry is in the file if it finds an entry
274 that has the same msgid (the test is **case sensitive**). 348 that has the same msgid (the test is **case sensitive**) and the same
349 msgctxt (or none for both entries).
275 350
276 Argument: 351 Argument:
277 352
278 ``entry`` 353 ``entry``
279 an instance of :class:`~polib._BaseEntry`. 354 an instance of :class:`~polib._BaseEntry`.
280 """ 355 """
281 return self.find(entry.msgid, by='msgid') is not None 356 return self.find(entry.msgid, by='msgid', msgctxt=entry.msgctxt) \
357 is not None
282 358
283 def __eq__(self, other): 359 def __eq__(self, other):
284 return unicode(self) == unicode(other) 360 return str(self) == str(other)
285 361
286 def append(self, entry): 362 def append(self, entry):
287 """ 363 """
288 Overriden method to check for duplicates entries, if a user tries to 364 Overridden method to check for duplicates entries, if a user tries to
289 add an entry that is already in the file, the method will raise a 365 add an entry that is already in the file, the method will raise a
290 ``ValueError`` exception. 366 ``ValueError`` exception.
291 367
292 Argument: 368 Argument:
293 369
298 raise ValueError('Entry "%s" already exists' % entry.msgid) 374 raise ValueError('Entry "%s" already exists' % entry.msgid)
299 super(_BaseFile, self).append(entry) 375 super(_BaseFile, self).append(entry)
300 376
301 def insert(self, index, entry): 377 def insert(self, index, entry):
302 """ 378 """
303 Overriden method to check for duplicates entries, if a user tries to 379 Overridden method to check for duplicates entries, if a user tries to
304 add an entry that is already in the file, the method will raise a 380 add an entry that is already in the file, the method will raise a
305 ``ValueError`` exception. 381 ``ValueError`` exception.
306 382
307 Arguments: 383 Arguments:
308 384
330 e.msgstr = '\n'.join(strs) + '\n' 406 e.msgstr = '\n'.join(strs) + '\n'
331 if self.metadata_is_fuzzy: 407 if self.metadata_is_fuzzy:
332 e.flags.append('fuzzy') 408 e.flags.append('fuzzy')
333 return e 409 return e
334 410
335 def save(self, fpath=None, repr_method='__str__'): 411 def save(self, fpath=None, repr_method='__unicode__'):
336 """ 412 """
337 Saves the po file to ``fpath``. 413 Saves the po file to ``fpath``.
338 If it is an existing file and no ``fpath`` is provided, then the 414 If it is an existing file and no ``fpath`` is provided, then the
339 existing file is rewritten with the modified data. 415 existing file is rewritten with the modified data.
340 416
352 if fpath is None: 428 if fpath is None:
353 fpath = self.fpath 429 fpath = self.fpath
354 if repr_method == 'to_binary': 430 if repr_method == 'to_binary':
355 fhandle = open(fpath, 'wb') 431 fhandle = open(fpath, 'wb')
356 else: 432 else:
357 fhandle = codecs.open(fpath, 'w', self.encoding) 433 fhandle = io.open(fpath, 'w', encoding=self.encoding)
358 if type(contents) != types.UnicodeType: 434 if not isinstance(contents, text_type):
359 contents = contents.decode(self.encoding) 435 contents = contents.decode(self.encoding)
360 fhandle.write(contents) 436 fhandle.write(contents)
361 fhandle.close() 437 fhandle.close()
362 # set the file path if not set 438 # set the file path if not set
363 if self.fpath is None and fpath: 439 if self.fpath is None and fpath:
379 455
380 ``include_obsolete_entries`` 456 ``include_obsolete_entries``
381 boolean, whether to also search in entries that are obsolete. 457 boolean, whether to also search in entries that are obsolete.
382 458
383 ``msgctxt`` 459 ``msgctxt``
384 string, allows to specify a specific message context for the 460 string, allows specifying a specific message context for the
385 search. 461 search.
386 """ 462 """
387 if include_obsolete_entries: 463 if include_obsolete_entries:
388 entries = self[:] 464 entries = self[:]
389 else: 465 else:
390 entries = [e for e in self if not e.obsolete] 466 entries = [e for e in self if not e.obsolete]
391 for e in entries: 467 for e in entries:
392 if getattr(e, by) == st: 468 if getattr(e, by) == st:
393 if msgctxt and e.msgctxt != msgctxt: 469 if msgctxt is not False and e.msgctxt != msgctxt:
394 continue 470 continue
395 return e 471 return e
396 return None 472 return None
397 473
398 def ordered_metadata(self): 474 def ordered_metadata(self):
410 'PO-Revision-Date', 486 'PO-Revision-Date',
411 'Last-Translator', 487 'Last-Translator',
412 'Language-Team', 488 'Language-Team',
413 'MIME-Version', 489 'MIME-Version',
414 'Content-Type', 490 'Content-Type',
415 'Content-Transfer-Encoding' 491 'Content-Transfer-Encoding',
492 'Language',
493 'Plural-Forms'
416 ] 494 ]
417 ordered_data = [] 495 ordered_data = []
418 for data in data_order: 496 for data in data_order:
419 try: 497 try:
420 value = metadata.pop(data) 498 value = metadata.pop(data)
421 ordered_data.append((data, value)) 499 ordered_data.append((data, value))
422 except KeyError: 500 except KeyError:
423 pass 501 pass
424 # the rest of the metadata will be alphabetically ordered since there 502 # the rest of the metadata will be alphabetically ordered since there
425 # are no specs for this AFAIK 503 # are no specs for this AFAIK
426 keys = metadata.keys() 504 for data in sorted(metadata.keys()):
427 keys.sort()
428 for data in keys:
429 value = metadata[data] 505 value = metadata[data]
430 ordered_data.append((data, value)) 506 ordered_data.append((data, value))
431 return ordered_data 507 return ordered_data
432 508
433 def to_binary(self): 509 def to_binary(self):
434 """ 510 """
435 Return the binary representation of the file. 511 Return the binary representation of the file.
436 """ 512 """
437 offsets = [] 513 offsets = []
438 entries = self.translated_entries() 514 entries = self.translated_entries()
515
439 # the keys are sorted in the .mo file 516 # the keys are sorted in the .mo file
440 def cmp(_self, other): 517 def cmp(_self, other):
441 # msgfmt compares entries with msgctxt if it exists 518 # msgfmt compares entries with msgctxt if it exists
442 if _self.msgctxt: 519 self_msgid = _self.msgctxt and _self.msgctxt or _self.msgid
443 self_msgid = _self.msgctxt 520 other_msgid = other.msgctxt and other.msgctxt or other.msgid
444 else:
445 self_msgid = _self.msgid
446
447 if other.msgctxt:
448 other_msgid = other.msgctxt
449 else:
450 other_msgid = other.msgid
451 if self_msgid > other_msgid: 521 if self_msgid > other_msgid:
452 return 1 522 return 1
453 elif self_msgid < other_msgid: 523 elif self_msgid < other_msgid:
454 return -1 524 return -1
455 else: 525 else:
456 return 0 526 return 0
457 # add metadata entry 527 # add metadata entry
458 entries.sort(cmp) 528 entries.sort(key=lambda o: o.msgctxt or o.msgid)
459 mentry = self.metadata_as_entry() 529 mentry = self.metadata_as_entry()
460 #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip() 530 #mentry.msgstr = mentry.msgstr.replace('\\n', '').lstrip()
461 entries = [mentry] + entries 531 entries = [mentry] + entries
462 entries_len = len(entries) 532 entries_len = len(entries)
463 ids, strs = '', '' 533 ids, strs = b(''), b('')
464 for e in entries: 534 for e in entries:
465 # For each string, we need size and file offset. Each string is 535 # For each string, we need size and file offset. Each string is
466 # NUL terminated; the NUL does not count into the size. 536 # NUL terminated; the NUL does not count into the size.
467 msgid = '' 537 msgid = b('')
468 if e.msgctxt: 538 if e.msgctxt:
469 # Contexts are stored by storing the concatenation of the 539 # Contexts are stored by storing the concatenation of the
470 # context, a <EOT> byte, and the original string 540 # context, a <EOT> byte, and the original string
471 msgid = self._encode(e.msgctxt + '\4') 541 msgid = self._encode(e.msgctxt + '\4')
472 if e.msgid_plural: 542 if e.msgid_plural:
473 indexes = e.msgstr_plural.keys()
474 indexes.sort()
475 msgstr = [] 543 msgstr = []
476 for index in indexes: 544 for index in sorted(e.msgstr_plural.keys()):
477 msgstr.append(e.msgstr_plural[index]) 545 msgstr.append(e.msgstr_plural[index])
478 msgid += self._encode(e.msgid + '\0' + e.msgid_plural) 546 msgid += self._encode(e.msgid + '\0' + e.msgid_plural)
479 msgstr = self._encode('\0'.join(msgstr)) 547 msgstr = self._encode('\0'.join(msgstr))
480 else: 548 else:
481 msgid += self._encode(e.msgid) 549 msgid += self._encode(e.msgid)
482 msgstr = self._encode(e.msgstr) 550 msgstr = self._encode(e.msgstr)
483 offsets.append((len(ids), len(msgid), len(strs), len(msgstr))) 551 offsets.append((len(ids), len(msgid), len(strs), len(msgstr)))
484 ids += msgid + '\0' 552 ids += msgid + b('\0')
485 strs += msgstr + '\0' 553 strs += msgstr + b('\0')
486 554
487 # The header is 7 32-bit unsigned integers. 555 # The header is 7 32-bit unsigned integers.
488 keystart = 7*4+16*entries_len 556 keystart = 7 * 4 + 16 * entries_len
489 # and the values start after the keys 557 # and the values start after the keys
490 valuestart = keystart + len(ids) 558 valuestart = keystart + len(ids)
491 koffsets = [] 559 koffsets = []
492 voffsets = [] 560 voffsets = []
493 # The string table first has the list of keys, then the list of values. 561 # The string table first has the list of keys, then the list of values.
494 # Each entry has first the size of the string, then the file offset. 562 # Each entry has first the size of the string, then the file offset.
495 for o1, l1, o2, l2 in offsets: 563 for o1, l1, o2, l2 in offsets:
496 koffsets += [l1, o1+keystart] 564 koffsets += [l1, o1 + keystart]
497 voffsets += [l2, o2+valuestart] 565 voffsets += [l2, o2 + valuestart]
498 offsets = koffsets + voffsets 566 offsets = koffsets + voffsets
499 # check endianness for magic number
500 if struct.pack('@h', 1) == struct.pack('<h', 1):
501 magic_number = MOFile.LITTLE_ENDIAN
502 else:
503 magic_number = MOFile.BIG_ENDIAN
504 567
505 output = struct.pack( 568 output = struct.pack(
506 "Iiiiiii", 569 "Iiiiiii",
507 magic_number, # Magic number 570 # Magic number
508 0, # Version 571 MOFile.MAGIC,
509 entries_len, # # of entries 572 # Version
510 7*4, # start of key index 573 0,
511 7*4+entries_len*8, # start of value index 574 # number of entries
512 0, keystart # size and offset of hash table 575 entries_len,
513 # Important: we don't use hash tables 576 # start of key index
577 7 * 4,
578 # start of value index
579 7 * 4 + entries_len * 8,
580 # size and offset of hash table, we don't use hash tables
581 0, keystart
582
514 ) 583 )
515 output += array.array("i", offsets).tostring() 584 if PY3 and sys.version_info.minor > 1: # python 3.2 or superior
585 output += array.array("i", offsets).tobytes()
586 else:
587 output += array.array("i", offsets).tostring()
516 output += ids 588 output += ids
517 output += strs 589 output += strs
518 return output 590 return output
519 591
520 def _encode(self, mixed): 592 def _encode(self, mixed):
521 """ 593 """
522 Encodes the given ``mixed`` argument with the file encoding if and 594 Encodes the given ``mixed`` argument with the file encoding if and
523 only if it's a unicode string and returns the encoded string. 595 only if it's a unicode string and returns the encoded string.
524 """ 596 """
525 if type(mixed) == types.UnicodeType: 597 if isinstance(mixed, text_type):
526 return mixed.encode(self.encoding) 598 mixed = mixed.encode(self.encoding)
527 return mixed 599 return mixed
528
529 # }}} 600 # }}}
530 # class POFile {{{ 601 # class POFile {{{
602
531 603
532 class POFile(_BaseFile): 604 class POFile(_BaseFile):
533 """ 605 """
534 Po (or Pot) file reader/writer. 606 Po (or Pot) file reader/writer.
535 This class inherits the :class:`~polib._BaseFile` class and, by extension, 607 This class inherits the :class:`~polib._BaseFile` class and, by extension,
540 """ 612 """
541 Returns the unicode representation of the po file. 613 Returns the unicode representation of the po file.
542 """ 614 """
543 ret, headers = '', self.header.split('\n') 615 ret, headers = '', self.header.split('\n')
544 for header in headers: 616 for header in headers:
545 if header[:1] in [',', ':']: 617 if not len(header):
618 ret += "#\n"
619 elif header[:1] in [',', ':']:
546 ret += '#%s\n' % header 620 ret += '#%s\n' % header
547 else: 621 else:
548 ret += '# %s\n' % header 622 ret += '# %s\n' % header
549 623
550 if type(ret) != types.UnicodeType: 624 if not isinstance(ret, text_type):
551 ret = unicode(ret, self.encoding) 625 ret = ret.decode(self.encoding)
552 626
553 return ret + _BaseFile.__unicode__(self) 627 return ret + _BaseFile.__unicode__(self)
554 628
555 def save_as_mofile(self, fpath): 629 def save_as_mofile(self, fpath):
556 """ 630 """
570 """ 644 """
571 total = len([e for e in self if not e.obsolete]) 645 total = len([e for e in self if not e.obsolete])
572 if total == 0: 646 if total == 0:
573 return 100 647 return 100
574 translated = len(self.translated_entries()) 648 translated = len(self.translated_entries())
575 return int((100.00 / float(total)) * translated) 649 return int(translated * 100 / float(total))
576 650
577 def translated_entries(self): 651 def translated_entries(self):
578 """ 652 """
579 Convenience method that returns the list of translated entries. 653 Convenience method that returns the list of translated entries.
580 """ 654 """
582 656
583 def untranslated_entries(self): 657 def untranslated_entries(self):
584 """ 658 """
585 Convenience method that returns the list of untranslated entries. 659 Convenience method that returns the list of untranslated entries.
586 """ 660 """
587 return [e for e in self if not e.translated() and not e.obsolete \ 661 return [e for e in self if not e.translated() and not e.obsolete
588 and not 'fuzzy' in e.flags] 662 and not 'fuzzy' in e.flags]
589 663
590 def fuzzy_entries(self): 664 def fuzzy_entries(self):
591 """ 665 """
592 Convenience method that returns the list of fuzzy entries. 666 Convenience method that returns the list of fuzzy entries.
613 Keyword argument: 687 Keyword argument:
614 688
615 ``refpot`` 689 ``refpot``
616 object POFile, the reference catalog. 690 object POFile, the reference catalog.
617 """ 691 """
692 # Store entries in dict/set for faster access
693 self_entries = dict((entry.msgid, entry) for entry in self)
694 refpot_msgids = set(entry.msgid for entry in refpot)
695 # Merge entries that are in the refpot
618 for entry in refpot: 696 for entry in refpot:
619 e = self.find(entry.msgid, include_obsolete_entries=True) 697 e = self_entries.get(entry.msgid)
620 if e is None: 698 if e is None:
621 e = POEntry() 699 e = POEntry()
622 self.append(e) 700 self.append(e)
623 e.merge(entry) 701 e.merge(entry)
624 # ok, now we must "obsolete" entries that are not in the refpot anymore 702 # ok, now we must "obsolete" entries that are not in the refpot anymore
625 for entry in self: 703 for entry in self:
626 if refpot.find(entry.msgid) is None: 704 if entry.msgid not in refpot_msgids:
627 entry.obsolete = True 705 entry.obsolete = True
628
629 # }}} 706 # }}}
630 # class MOFile {{{ 707 # class MOFile {{{
708
631 709
632 class MOFile(_BaseFile): 710 class MOFile(_BaseFile):
633 """ 711 """
634 Mo file reader/writer. 712 Mo file reader/writer.
635 This class inherits the :class:`~polib._BaseFile` class and, by 713 This class inherits the :class:`~polib._BaseFile` class and, by
636 extension, the python ``list`` type. 714 extension, the python ``list`` type.
637 """ 715 """
638 BIG_ENDIAN = 0xde120495 716 MAGIC = 0x950412de
639 LITTLE_ENDIAN = 0x950412de 717 MAGIC_SWAPPED = 0xde120495
640 718
641 def __init__(self, *args, **kwargs): 719 def __init__(self, *args, **kwargs):
642 """ 720 """
643 Constructor, accepts all keywords arguments accepted by 721 Constructor, accepts all keywords arguments accepted by
644 :class:`~polib._BaseFile` class. 722 :class:`~polib._BaseFile` class.
696 def obsolete_entries(self): 774 def obsolete_entries(self):
697 """ 775 """
698 Convenience method to keep the same interface with POFile instances. 776 Convenience method to keep the same interface with POFile instances.
699 """ 777 """
700 return [] 778 return []
701
702 # }}} 779 # }}}
703 # class _BaseEntry {{{ 780 # class _BaseEntry {{{
781
704 782
705 class _BaseEntry(object): 783 class _BaseEntry(object):
706 """ 784 """
707 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes. 785 Base class for :class:`~polib.POEntry` and :class:`~polib.MOEntry` classes.
708 This class should **not** be instantiated directly. 786 This class should **not** be instantiated directly.
751 else: 829 else:
752 delflag = '' 830 delflag = ''
753 ret = [] 831 ret = []
754 # write the msgctxt if any 832 # write the msgctxt if any
755 if self.msgctxt is not None: 833 if self.msgctxt is not None:
756 ret += self._str_field("msgctxt", delflag, "", self.msgctxt, wrapwidth) 834 ret += self._str_field("msgctxt", delflag, "", self.msgctxt,
835 wrapwidth)
757 # write the msgid 836 # write the msgid
758 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth) 837 ret += self._str_field("msgid", delflag, "", self.msgid, wrapwidth)
759 # write the msgid_plural if any 838 # write the msgid_plural if any
760 if self.msgid_plural: 839 if self.msgid_plural:
761 ret += self._str_field("msgid_plural", delflag, "", self.msgid_plural, wrapwidth) 840 ret += self._str_field("msgid_plural", delflag, "",
841 self.msgid_plural, wrapwidth)
762 if self.msgstr_plural: 842 if self.msgstr_plural:
763 # write the msgstr_plural if any 843 # write the msgstr_plural if any
764 msgstrs = self.msgstr_plural 844 msgstrs = self.msgstr_plural
765 keys = list(msgstrs) 845 keys = list(msgstrs)
766 keys.sort() 846 keys.sort()
767 for index in keys: 847 for index in keys:
768 msgstr = msgstrs[index] 848 msgstr = msgstrs[index]
769 plural_index = '[%s]' % index 849 plural_index = '[%s]' % index
770 ret += self._str_field("msgstr", delflag, plural_index, msgstr, wrapwidth) 850 ret += self._str_field("msgstr", delflag, plural_index, msgstr,
851 wrapwidth)
771 else: 852 else:
772 # otherwise write the msgstr 853 # otherwise write the msgstr
773 ret += self._str_field("msgstr", delflag, "", self.msgstr, wrapwidth) 854 ret += self._str_field("msgstr", delflag, "", self.msgstr,
855 wrapwidth)
774 ret.append('') 856 ret.append('')
775 ret = '\n'.join(ret) 857 ret = u('\n').join(ret)
776
777 if type(ret) != types.UnicodeType:
778 return unicode(ret, self.encoding)
779 return ret 858 return ret
780 859
781 def __str__(self): 860 if PY3:
782 """ 861 def __str__(self):
783 Returns the string representation of the entry. 862 return self.__unicode__()
784 """ 863 else:
785 return unicode(self).encode(self.encoding) 864 def __str__(self):
865 """
866 Returns the string representation of the entry.
867 """
868 return unicode(self).encode(self.encoding)
786 869
787 def __eq__(self, other): 870 def __eq__(self, other):
788 return unicode(self) == unicode(other) 871 return str(self) == str(other)
789 872
790 def _str_field(self, fieldname, delflag, plural_index, field, wrapwidth=78): 873 def _str_field(self, fieldname, delflag, plural_index, field,
874 wrapwidth=78):
791 lines = field.splitlines(True) 875 lines = field.splitlines(True)
792 if len(lines) > 1: 876 if len(lines) > 1:
793 lines = [''] + lines # start with initial empty line 877 lines = [''] + lines # start with initial empty line
794 else: 878 else:
795 escaped_field = escape(field) 879 escaped_field = escape(field)
796 specialchars_count = 0 880 specialchars_count = 0
797 for c in ['\\', '\n', '\r', '\t', '"']: 881 for c in ['\\', '\n', '\r', '\t', '"']:
798 specialchars_count += field.count(c) 882 specialchars_count += field.count(c)
802 if plural_index: 886 if plural_index:
803 flength += len(plural_index) 887 flength += len(plural_index)
804 real_wrapwidth = wrapwidth - flength + specialchars_count 888 real_wrapwidth = wrapwidth - flength + specialchars_count
805 if wrapwidth > 0 and len(field) > real_wrapwidth: 889 if wrapwidth > 0 and len(field) > real_wrapwidth:
806 # Wrap the line but take field name into account 890 # Wrap the line but take field name into account
807 lines = [''] + [unescape(item) for item in textwrap.wrap( 891 lines = [''] + [unescape(item) for item in wrap(
808 escaped_field, 892 escaped_field,
809 wrapwidth - 2, # 2 for quotes "" 893 wrapwidth - 2, # 2 for quotes ""
810 drop_whitespace=False, 894 drop_whitespace=False,
811 break_long_words=False 895 break_long_words=False
812 )] 896 )]
813 else: 897 else:
814 lines = [field] 898 lines = [field]
816 # quick and dirty trick to get the real field name 900 # quick and dirty trick to get the real field name
817 fieldname = fieldname[9:] 901 fieldname = fieldname[9:]
818 902
819 ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index, 903 ret = ['%s%s%s "%s"' % (delflag, fieldname, plural_index,
820 escape(lines.pop(0)))] 904 escape(lines.pop(0)))]
821 for mstr in lines: 905 for line in lines:
822 ret.append('%s"%s"' % (delflag, escape(mstr))) 906 ret.append('%s"%s"' % (delflag, escape(line)))
823 return ret 907 return ret
824
825 # }}} 908 # }}}
826 # class POEntry {{{ 909 # class POEntry {{{
827 910
911
828 class POEntry(_BaseEntry): 912 class POEntry(_BaseEntry):
829 """ 913 """
830 Represents a po file entry. 914 Represents a po file entry.
831 """ 915 """
832 916
852 ``previous_msgid`` 936 ``previous_msgid``
853 string, the entry previous msgid. 937 string, the entry previous msgid.
854 938
855 ``previous_msgid_plural`` 939 ``previous_msgid_plural``
856 string, the entry previous msgid_plural. 940 string, the entry previous msgid_plural.
941
942 ``linenum``
943 integer, the line number of the entry
857 """ 944 """
858 _BaseEntry.__init__(self, *args, **kwargs) 945 _BaseEntry.__init__(self, *args, **kwargs)
859 self.comment = kwargs.get('comment', '') 946 self.comment = kwargs.get('comment', '')
860 self.tcomment = kwargs.get('tcomment', '') 947 self.tcomment = kwargs.get('tcomment', '')
861 self.occurrences = kwargs.get('occurrences', []) 948 self.occurrences = kwargs.get('occurrences', [])
862 self.flags = kwargs.get('flags', []) 949 self.flags = kwargs.get('flags', [])
863 self.previous_msgctxt = kwargs.get('previous_msgctxt', None) 950 self.previous_msgctxt = kwargs.get('previous_msgctxt', None)
864 self.previous_msgid = kwargs.get('previous_msgid', None) 951 self.previous_msgid = kwargs.get('previous_msgid', None)
865 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None) 952 self.previous_msgid_plural = kwargs.get('previous_msgid_plural', None)
953 self.linenum = kwargs.get('linenum', None)
866 954
867 def __unicode__(self, wrapwidth=78): 955 def __unicode__(self, wrapwidth=78):
868 """ 956 """
869 Returns the unicode representation of the entry. 957 Returns the unicode representation of the entry.
870 """ 958 """
877 for c in comments: 965 for c in comments:
878 val = getattr(self, c[0]) 966 val = getattr(self, c[0])
879 if val: 967 if val:
880 for comment in val.split('\n'): 968 for comment in val.split('\n'):
881 if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth: 969 if wrapwidth > 0 and len(comment) + len(c[1]) > wrapwidth:
882 ret += textwrap.wrap( 970 ret += wrap(
883 comment, 971 comment,
884 wrapwidth, 972 wrapwidth,
885 initial_indent=c[1], 973 initial_indent=c[1],
886 subsequent_indent=c[1], 974 subsequent_indent=c[1],
887 break_long_words=False 975 break_long_words=False
901 if wrapwidth > 0 and len(filestr) + 3 > wrapwidth: 989 if wrapwidth > 0 and len(filestr) + 3 > wrapwidth:
902 # textwrap split words that contain hyphen, this is not 990 # textwrap split words that contain hyphen, this is not
903 # what we want for filenames, so the dirty hack is to 991 # what we want for filenames, so the dirty hack is to
904 # temporally replace hyphens with a char that a file cannot 992 # temporally replace hyphens with a char that a file cannot
905 # contain, like "*" 993 # contain, like "*"
906 ret += [l.replace('*', '-') for l in textwrap.wrap( 994 ret += [l.replace('*', '-') for l in wrap(
907 filestr.replace('-', '*'), 995 filestr.replace('-', '*'),
908 wrapwidth, 996 wrapwidth,
909 initial_indent='#: ', 997 initial_indent='#: ',
910 subsequent_indent='#: ', 998 subsequent_indent='#: ',
911 break_long_words=False 999 break_long_words=False
916 # flags (TODO: wrapping ?) 1004 # flags (TODO: wrapping ?)
917 if self.flags: 1005 if self.flags:
918 ret.append('#, %s' % ', '.join(self.flags)) 1006 ret.append('#, %s' % ', '.join(self.flags))
919 1007
920 # previous context and previous msgid/msgid_plural 1008 # previous context and previous msgid/msgid_plural
921 fields = ['previous_msgctxt', 'previous_msgid', 'previous_msgid_plural'] 1009 fields = ['previous_msgctxt', 'previous_msgid',
1010 'previous_msgid_plural']
922 for f in fields: 1011 for f in fields:
923 val = getattr(self, f) 1012 val = getattr(self, f)
924 if val: 1013 if val:
925 ret += self._str_field(f, "#| ", "", val, wrapwidth) 1014 ret += self._str_field(f, "#| ", "", val, wrapwidth)
926 1015
927 ret.append(_BaseEntry.__unicode__(self, wrapwidth)) 1016 ret.append(_BaseEntry.__unicode__(self, wrapwidth))
928 ret = '\n'.join(ret) 1017 ret = u('\n').join(ret)
929 1018
930 if type(ret) != types.UnicodeType: 1019 assert isinstance(ret, text_type)
931 return unicode(ret, self.encoding) 1020 #if type(ret) != types.UnicodeType:
1021 # return unicode(ret, self.encoding)
932 return ret 1022 return ret
933 1023
934 def __cmp__(self, other): 1024 def __cmp__(self, other):
935 """ 1025 """
936 Called by comparison operations if rich comparison is not defined. 1026 Called by comparison operations if rich comparison is not defined.
937 """ 1027 """
938 def compare_occurrences(a, b):
939 """
940 Compare an entry occurrence with another one.
941 """
942 if a[0] != b[0]:
943 return a[0] < b[0]
944 if a[1] != b[1]:
945 return a[1] < b[1]
946 return 0
947 1028
948 # First: Obsolete test 1029 # First: Obsolete test
949 if self.obsolete != other.obsolete: 1030 if self.obsolete != other.obsolete:
950 if self.obsolete: 1031 if self.obsolete:
951 return -1 1032 return -1
952 else: 1033 else:
953 return 1 1034 return 1
954 # Work on a copy to protect original 1035 # Work on a copy to protect original
955 occ1 = self.occurrences[:] 1036 occ1 = sorted(self.occurrences[:])
956 occ2 = other.occurrences[:] 1037 occ2 = sorted(other.occurrences[:])
957 # Sorting using compare method
958 occ1.sort(compare_occurrences)
959 occ2.sort(compare_occurrences)
960 # Comparing sorted occurrences
961 pos = 0 1038 pos = 0
962 for entry1 in occ1: 1039 for entry1 in occ1:
963 try: 1040 try:
964 entry2 = occ2[pos] 1041 entry2 = occ2[pos]
965 except IndexError: 1042 except IndexError:
973 if entry1[1] != entry2[1]: 1050 if entry1[1] != entry2[1]:
974 if entry1[1] > entry2[1]: 1051 if entry1[1] > entry2[1]:
975 return 1 1052 return 1
976 else: 1053 else:
977 return -1 1054 return -1
1055 # Compare msgid_plural if set
1056 if self.msgid_plural:
1057 if not other.msgid_plural:
1058 return 1
1059 for pos in self.msgid_plural:
1060 if pos not in other.msgid_plural:
1061 return 1
1062 if self.msgid_plural[pos] > other.msgid_plural[pos]:
1063 return 1
1064 if self.msgid_plural[pos] < other.msgid_plural[pos]:
1065 return -1
978 # Finally: Compare message ID 1066 # Finally: Compare message ID
979 if self.msgid > other.msgid: return 1 1067 if self.msgid > other.msgid:
980 else: return -1 1068 return 1
1069 elif self.msgid < other.msgid:
1070 return -1
1071 return 0
1072
1073 def __gt__(self, other):
1074 return self.__cmp__(other) > 0
1075
1076 def __lt__(self, other):
1077 return self.__cmp__(other) < 0
1078
1079 def __ge__(self, other):
1080 return self.__cmp__(other) >= 0
1081
1082 def __le__(self, other):
1083 return self.__cmp__(other) <= 0
1084
1085 def __eq__(self, other):
1086 return self.__cmp__(other) == 0
1087
1088 def __ne__(self, other):
1089 return self.__cmp__(other) != 0
981 1090
982 def translated(self): 1091 def translated(self):
983 """ 1092 """
984 Returns ``True`` if the entry has been translated or ``False`` 1093 Returns ``True`` if the entry has been translated or ``False``
985 otherwise. 1094 otherwise.
1018 # keep existing translation at pos if any 1127 # keep existing translation at pos if any
1019 self.msgstr_plural[pos] 1128 self.msgstr_plural[pos]
1020 except KeyError: 1129 except KeyError:
1021 self.msgstr_plural[pos] = '' 1130 self.msgstr_plural[pos] = ''
1022 1131
1132 def __hash__(self):
1133 return hash((self.msgid, self.msgstr))
1023 # }}} 1134 # }}}
1024 # class MOEntry {{{ 1135 # class MOEntry {{{
1025 1136
1137
1026 class MOEntry(_BaseEntry): 1138 class MOEntry(_BaseEntry):
1027 """ 1139 """
1028 Represents a mo file entry. 1140 Represents a mo file entry.
1029 """ 1141 """
1030 pass 1142 def __init__(self, *args, **kwargs):
1143 """
1144 Constructor, accepts the following keyword arguments,
1145 for consistency with :class:`~polib.POEntry`:
1146
1147 ``comment``
1148 ``tcomment``
1149 ``occurrences``
1150 ``flags``
1151 ``previous_msgctxt``
1152 ``previous_msgid``
1153 ``previous_msgid_plural``
1154
1155 Note: even though these keyword arguments are accepted,
1156 they hold no real meaning in the context of MO files
1157 and are simply ignored.
1158 """
1159 _BaseEntry.__init__(self, *args, **kwargs)
1160 self.comment = ''
1161 self.tcomment = ''
1162 self.occurrences = []
1163 self.flags = []
1164 self.previous_msgctxt = None
1165 self.previous_msgid = None
1166 self.previous_msgid_plural = None
1167
1168 def __hash__(self):
1169 return hash((self.msgid, self.msgstr))
1031 1170
1032 # }}} 1171 # }}}
1033 # class _POFileParser {{{ 1172 # class _POFileParser {{{
1173
1034 1174
1035 class _POFileParser(object): 1175 class _POFileParser(object):
1036 """ 1176 """
1037 A finite state machine to parse efficiently and correctly po 1177 A finite state machine to parse efficiently and correctly po
1038 file format. 1178 file format.
1054 ``check_for_duplicates`` 1194 ``check_for_duplicates``
1055 whether to check for duplicate entries when adding entries to the 1195 whether to check for duplicate entries when adding entries to the
1056 file (optional, default: ``False``). 1196 file (optional, default: ``False``).
1057 """ 1197 """
1058 enc = kwargs.get('encoding', default_encoding) 1198 enc = kwargs.get('encoding', default_encoding)
1059 if os.path.exists(pofile): 1199 if _is_file(pofile):
1060 try: 1200 try:
1061 self.fhandle = codecs.open(pofile, 'rU', enc) 1201 self.fhandle = io.open(pofile, 'rt', encoding=enc)
1062 except LookupError: 1202 except LookupError:
1063 enc = default_encoding 1203 enc = default_encoding
1064 self.fhandle = codecs.open(pofile, 'rU', enc) 1204 self.fhandle = io.open(pofile, 'rt', encoding=enc)
1065 else: 1205 else:
1066 self.fhandle = pofile.splitlines() 1206 self.fhandle = pofile.splitlines()
1067 1207
1068 self.instance = POFile( 1208 klass = kwargs.get('klass')
1209 if klass is None:
1210 klass = POFile
1211 self.instance = klass(
1069 pofile=pofile, 1212 pofile=pofile,
1070 encoding=enc, 1213 encoding=enc,
1071 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1214 check_for_duplicates=kwargs.get('check_for_duplicates', False)
1072 ) 1215 )
1073 self.transitions = {} 1216 self.transitions = {}
1074 self.current_entry = POEntry() 1217 self.current_line = 0
1075 self.current_state = 'ST' 1218 self.current_entry = POEntry(linenum=self.current_line)
1219 self.current_state = 'st'
1076 self.current_token = None 1220 self.current_token = None
1077 # two memo flags used in handlers 1221 # two memo flags used in handlers
1078 self.msgstr_index = 0 1222 self.msgstr_index = 0
1079 self.entry_obsolete = 0 1223 self.entry_obsolete = 0
1080 # Configure the state machine, by adding transitions. 1224 # Configure the state machine, by adding transitions.
1081 # Signification of symbols: 1225 # Signification of symbols:
1082 # * ST: Beginning of the file (start) 1226 # * ST: Beginning of the file (start)
1083 # * HE: Header 1227 # * HE: Header
1084 # * TC: a translation comment 1228 # * TC: a translation comment
1085 # * GC: a generated comment 1229 # * GC: a generated comment
1086 # * OC: a file/line occurence 1230 # * OC: a file/line occurrence
1087 # * FL: a flags line 1231 # * FL: a flags line
1088 # * CT: a message context 1232 # * CT: a message context
1089 # * PC: a previous msgctxt 1233 # * PC: a previous msgctxt
1090 # * PM: a previous msgid 1234 # * PM: a previous msgid
1091 # * PP: a previous msgid_plural 1235 # * PP: a previous msgid_plural
1092 # * MI: a msgid 1236 # * MI: a msgid
1093 # * MP: a msgid plural 1237 # * MP: a msgid plural
1094 # * MS: a msgstr 1238 # * MS: a msgstr
1095 # * MX: a msgstr plural 1239 # * MX: a msgstr plural
1096 # * MC: a msgid or msgstr continuation line 1240 # * MC: a msgid or msgstr continuation line
1097 all = ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'PC', 'PM', 'PP', 'TC', 1241 all = ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'pc', 'pm', 'pp', 'tc',
1098 'MS', 'MP', 'MX', 'MI'] 1242 'ms', 'mp', 'mx', 'mi']
1099 1243
1100 self.add('TC', ['ST', 'HE'], 'HE') 1244 self.add('tc', ['st', 'he'], 'he')
1101 self.add('TC', ['GC', 'OC', 'FL', 'TC', 'PC', 'PM', 'PP', 'MS', 1245 self.add('tc', ['gc', 'oc', 'fl', 'tc', 'pc', 'pm', 'pp', 'ms',
1102 'MP', 'MX', 'MI'], 'TC') 1246 'mp', 'mx', 'mi'], 'tc')
1103 self.add('GC', all, 'GC') 1247 self.add('gc', all, 'gc')
1104 self.add('OC', all, 'OC') 1248 self.add('oc', all, 'oc')
1105 self.add('FL', all, 'FL') 1249 self.add('fl', all, 'fl')
1106 self.add('PC', all, 'PC') 1250 self.add('pc', all, 'pc')
1107 self.add('PM', all, 'PM') 1251 self.add('pm', all, 'pm')
1108 self.add('PP', all, 'PP') 1252 self.add('pp', all, 'pp')
1109 self.add('CT', ['ST', 'HE', 'GC', 'OC', 'FL', 'TC', 'PC', 'PM', 1253 self.add('ct', ['st', 'he', 'gc', 'oc', 'fl', 'tc', 'pc', 'pm',
1110 'PP', 'MS', 'MX'], 'CT') 1254 'pp', 'ms', 'mx'], 'ct')
1111 self.add('MI', ['ST', 'HE', 'GC', 'OC', 'FL', 'CT', 'TC', 'PC', 1255 self.add('mi', ['st', 'he', 'gc', 'oc', 'fl', 'ct', 'tc', 'pc',
1112 'PM', 'PP', 'MS', 'MX'], 'MI') 1256 'pm', 'pp', 'ms', 'mx'], 'mi')
1113 self.add('MP', ['TC', 'GC', 'PC', 'PM', 'PP', 'MI'], 'MP') 1257 self.add('mp', ['tc', 'gc', 'pc', 'pm', 'pp', 'mi'], 'mp')
1114 self.add('MS', ['MI', 'MP', 'TC'], 'MS') 1258 self.add('ms', ['mi', 'mp', 'tc'], 'ms')
1115 self.add('MX', ['MI', 'MX', 'MP', 'TC'], 'MX') 1259 self.add('mx', ['mi', 'mx', 'mp', 'tc'], 'mx')
1116 self.add('MC', ['CT', 'MI', 'MP', 'MS', 'MX', 'PM', 'PP', 'PC'], 'MC') 1260 self.add('mc', ['ct', 'mi', 'mp', 'ms', 'mx', 'pm', 'pp', 'pc'], 'mc')
1117 1261
1118 def parse(self): 1262 def parse(self):
1119 """ 1263 """
1120 Run the state machine, parse the file line by line and call process() 1264 Run the state machine, parse the file line by line and call process()
1121 with the current matched symbol. 1265 with the current matched symbol.
1122 """ 1266 """
1123 i = 0
1124 1267
1125 keywords = { 1268 keywords = {
1126 'msgctxt': 'CT', 1269 'msgctxt': 'ct',
1127 'msgid': 'MI', 1270 'msgid': 'mi',
1128 'msgstr': 'MS', 1271 'msgstr': 'ms',
1129 'msgid_plural': 'MP', 1272 'msgid_plural': 'mp',
1130 } 1273 }
1131 prev_keywords = { 1274 prev_keywords = {
1132 'msgid_plural': 'PP', 1275 'msgid_plural': 'pp',
1133 'msgid': 'PM', 1276 'msgid': 'pm',
1134 'msgctxt': 'PC', 1277 'msgctxt': 'pc',
1135 } 1278 }
1136 1279 tokens = []
1137 for line in self.fhandle: 1280 for line in self.fhandle:
1138 i += 1 1281 self.current_line += 1
1139 line = line.strip() 1282 line = line.strip()
1140 if line == '': 1283 if line == '':
1141 continue 1284 continue
1142 1285
1143 tokens = line.split(None, 2) 1286 tokens = line.split(None, 2)
1144 nb_tokens = len(tokens) 1287 nb_tokens = len(tokens)
1288
1289 if tokens[0] == '#~|':
1290 continue
1145 1291
1146 if tokens[0] == '#~' and nb_tokens > 1: 1292 if tokens[0] == '#~' and nb_tokens > 1:
1147 line = line[3:].strip() 1293 line = line[3:].strip()
1148 tokens = tokens[1:] 1294 tokens = tokens[1:]
1149 nb_tokens -= 1 1295 nb_tokens -= 1
1153 1299
1154 # Take care of keywords like 1300 # Take care of keywords like
1155 # msgid, msgid_plural, msgctxt & msgstr. 1301 # msgid, msgid_plural, msgctxt & msgstr.
1156 if tokens[0] in keywords and nb_tokens > 1: 1302 if tokens[0] in keywords and nb_tokens > 1:
1157 line = line[len(tokens[0]):].lstrip() 1303 line = line[len(tokens[0]):].lstrip()
1304 if re.search(r'([^\\]|^)"', line[1:-1]):
1305 raise IOError('Syntax error in po file %s (line %s): '
1306 'unescaped double quote found' %
1307 (self.instance.fpath, self.current_line))
1158 self.current_token = line 1308 self.current_token = line
1159 self.process(keywords[tokens[0]], i) 1309 self.process(keywords[tokens[0]])
1160 continue 1310 continue
1161 1311
1162 self.current_token = line 1312 self.current_token = line
1163 1313
1164 if tokens[0] == '#:' and nb_tokens > 1: 1314 if tokens[0] == '#:':
1315 if nb_tokens <= 1:
1316 continue
1165 # we are on a occurrences line 1317 # we are on a occurrences line
1166 self.process('OC', i) 1318 self.process('oc')
1167 1319
1168 elif line[:1] == '"': 1320 elif line[:1] == '"':
1169 # we are on a continuation line 1321 # we are on a continuation line
1170 self.process('MC', i) 1322 if re.search(r'([^\\]|^)"', line[1:-1]):
1323 raise IOError('Syntax error in po file %s (line %s): '
1324 'unescaped double quote found' %
1325 (self.instance.fpath, self.current_line))
1326 self.process('mc')
1171 1327
1172 elif line[:7] == 'msgstr[': 1328 elif line[:7] == 'msgstr[':
1173 # we are on a msgstr plural 1329 # we are on a msgstr plural
1174 self.process('MX', i) 1330 self.process('mx')
1175 1331
1176 elif tokens[0] == '#,' and nb_tokens > 1: 1332 elif tokens[0] == '#,':
1333 if nb_tokens <= 1:
1334 continue
1177 # we are on a flags line 1335 # we are on a flags line
1178 self.process('FL', i) 1336 self.process('fl')
1179 1337
1180 elif tokens[0] == '#': 1338 elif tokens[0] == '#' or tokens[0].startswith('##'):
1181 if line == '#': line += ' ' 1339 if line == '#':
1340 line += ' '
1182 # we are on a translator comment line 1341 # we are on a translator comment line
1183 self.process('TC', i) 1342 self.process('tc')
1184 1343
1185 elif tokens[0] == '#.' and nb_tokens > 1: 1344 elif tokens[0] == '#.':
1345 if nb_tokens <= 1:
1346 continue
1186 # we are on a generated comment line 1347 # we are on a generated comment line
1187 self.process('GC', i) 1348 self.process('gc')
1188 1349
1189 elif tokens[0] == '#|': 1350 elif tokens[0] == '#|':
1190 if nb_tokens < 2: 1351 if nb_tokens <= 1:
1191 self.process('??', i) 1352 raise IOError('Syntax error in po file %s (line %s)' %
1192 continue 1353 (self.instance.fpath, self.current_line))
1193 1354
1194 # Remove the marker and any whitespace right after that. 1355 # Remove the marker and any whitespace right after that.
1195 line = line[2:].lstrip() 1356 line = line[2:].lstrip()
1196 self.current_token = line 1357 self.current_token = line
1197 1358
1198 if tokens[1].startswith('"'): 1359 if tokens[1].startswith('"'):
1199 # Continuation of previous metadata. 1360 # Continuation of previous metadata.
1200 self.process('MC', i) 1361 self.process('mc')
1201 continue 1362 continue
1202 1363
1203 if nb_tokens == 2: 1364 if nb_tokens == 2:
1204 # Invalid continuation line. 1365 # Invalid continuation line.
1205 self.process('??', i) 1366 raise IOError('Syntax error in po file %s (line %s): '
1367 'invalid continuation line' %
1368 (self.instance.fpath, self.current_line))
1206 1369
1207 # we are on a "previous translation" comment line, 1370 # we are on a "previous translation" comment line,
1208 if tokens[1] not in prev_keywords: 1371 if tokens[1] not in prev_keywords:
1209 # Unknown keyword in previous translation comment. 1372 # Unknown keyword in previous translation comment.
1210 self.process('??', i) 1373 raise IOError('Syntax error in po file %s (line %s): '
1374 'unknown keyword %s' %
1375 (self.instance.fpath, self.current_line,
1376 tokens[1]))
1211 1377
1212 # Remove the keyword and any whitespace 1378 # Remove the keyword and any whitespace
1213 # between it and the starting quote. 1379 # between it and the starting quote.
1214 line = line[len(tokens[1]):].lstrip() 1380 line = line[len(tokens[1]):].lstrip()
1215 self.current_token = line 1381 self.current_token = line
1216 self.process(prev_keywords[tokens[1]], i) 1382 self.process(prev_keywords[tokens[1]])
1217 1383
1218 else: 1384 else:
1219 self.process('??', i) 1385 raise IOError('Syntax error in po file %s (line %s)' %
1220 1386 (self.instance.fpath, self.current_line))
1221 if self.current_entry: 1387
1388 if self.current_entry and len(tokens) > 0 and \
1389 not tokens[0].startswith('#'):
1222 # since entries are added when another entry is found, we must add 1390 # since entries are added when another entry is found, we must add
1223 # the last entry here (only if there are lines) 1391 # the last entry here (only if there are lines). Trailing comments
1392 # are ignored
1224 self.instance.append(self.current_entry) 1393 self.instance.append(self.current_entry)
1394
1225 # before returning the instance, check if there's metadata and if 1395 # before returning the instance, check if there's metadata and if
1226 # so extract it in a dict 1396 # so extract it in a dict
1227 firstentry = self.instance[0] 1397 metadataentry = self.instance.find('')
1228 if firstentry.msgid == '': # metadata found 1398 if metadataentry: # metadata found
1229 # remove the entry 1399 # remove the entry
1230 firstentry = self.instance.pop(0) 1400 self.instance.remove(metadataentry)
1231 self.instance.metadata_is_fuzzy = firstentry.flags 1401 self.instance.metadata_is_fuzzy = metadataentry.flags
1232 key = None 1402 key = None
1233 for msg in firstentry.msgstr.splitlines(): 1403 for msg in metadataentry.msgstr.splitlines():
1234 try: 1404 try:
1235 key, val = msg.split(':', 1) 1405 key, val = msg.split(':', 1)
1236 self.instance.metadata[key] = val.strip() 1406 self.instance.metadata[key] = val.strip()
1237 except: 1407 except (ValueError, KeyError):
1238 if key is not None: 1408 if key is not None:
1239 self.instance.metadata[key] += '\n'+ msg.strip() 1409 self.instance.metadata[key] += '\n' + msg.strip()
1240 # close opened file 1410 # close opened file
1241 if isinstance(self.fhandle, file): 1411 if not isinstance(self.fhandle, list): # must be file
1242 self.fhandle.close() 1412 self.fhandle.close()
1243 return self.instance 1413 return self.instance
1244 1414
1245 def add(self, symbol, states, next_state): 1415 def add(self, symbol, states, next_state):
1246 """ 1416 """
1256 1426
1257 ``next_state`` 1427 ``next_state``
1258 the next state the fsm will have after the action. 1428 the next state the fsm will have after the action.
1259 """ 1429 """
1260 for state in states: 1430 for state in states:
1261 action = getattr(self, 'handle_%s' % next_state.lower()) 1431 action = getattr(self, 'handle_%s' % next_state)
1262 self.transitions[(symbol, state)] = (action, next_state) 1432 self.transitions[(symbol, state)] = (action, next_state)
1263 1433
1264 def process(self, symbol, linenum): 1434 def process(self, symbol):
1265 """ 1435 """
1266 Process the transition corresponding to the current state and the 1436 Process the transition corresponding to the current state and the
1267 symbol provided. 1437 symbol provided.
1268 1438
1269 Keywords arguments: 1439 Keywords arguments:
1276 """ 1446 """
1277 try: 1447 try:
1278 (action, state) = self.transitions[(symbol, self.current_state)] 1448 (action, state) = self.transitions[(symbol, self.current_state)]
1279 if action(): 1449 if action():
1280 self.current_state = state 1450 self.current_state = state
1281 except Exception as exc: 1451 except Exception:
1282 raise IOError('Syntax error in po file (line %s)' % linenum) 1452 raise IOError('Syntax error in po file (line %s)' %
1453 self.current_line)
1283 1454
1284 # state handlers 1455 # state handlers
1285 1456
1286 def handle_he(self): 1457 def handle_he(self):
1287 """Handle a header comment.""" 1458 """Handle a header comment."""
1290 self.instance.header += self.current_token[2:] 1461 self.instance.header += self.current_token[2:]
1291 return 1 1462 return 1
1292 1463
1293 def handle_tc(self): 1464 def handle_tc(self):
1294 """Handle a translator comment.""" 1465 """Handle a translator comment."""
1295 if self.current_state in ['MC', 'MS', 'MX']: 1466 if self.current_state in ['mc', 'ms', 'mx']:
1296 self.instance.append(self.current_entry) 1467 self.instance.append(self.current_entry)
1297 self.current_entry = POEntry() 1468 self.current_entry = POEntry(linenum=self.current_line)
1298 if self.current_entry.tcomment != '': 1469 if self.current_entry.tcomment != '':
1299 self.current_entry.tcomment += '\n' 1470 self.current_entry.tcomment += '\n'
1300 self.current_entry.tcomment += self.current_token[2:] 1471 tcomment = self.current_token.lstrip('#')
1472 if tcomment.startswith(' '):
1473 tcomment = tcomment[1:]
1474 self.current_entry.tcomment += tcomment
1301 return True 1475 return True
1302 1476
1303 def handle_gc(self): 1477 def handle_gc(self):
1304 """Handle a generated comment.""" 1478 """Handle a generated comment."""
1305 if self.current_state in ['MC', 'MS', 'MX']: 1479 if self.current_state in ['mc', 'ms', 'mx']:
1306 self.instance.append(self.current_entry) 1480 self.instance.append(self.current_entry)
1307 self.current_entry = POEntry() 1481 self.current_entry = POEntry(linenum=self.current_line)
1308 if self.current_entry.comment != '': 1482 if self.current_entry.comment != '':
1309 self.current_entry.comment += '\n' 1483 self.current_entry.comment += '\n'
1310 self.current_entry.comment += self.current_token[3:] 1484 self.current_entry.comment += self.current_token[3:]
1311 return True 1485 return True
1312 1486
1313 def handle_oc(self): 1487 def handle_oc(self):
1314 """Handle a file:num occurence.""" 1488 """Handle a file:num occurrence."""
1315 if self.current_state in ['MC', 'MS', 'MX']: 1489 if self.current_state in ['mc', 'ms', 'mx']:
1316 self.instance.append(self.current_entry) 1490 self.instance.append(self.current_entry)
1317 self.current_entry = POEntry() 1491 self.current_entry = POEntry(linenum=self.current_line)
1318 occurrences = self.current_token[3:].split() 1492 occurrences = self.current_token[3:].split()
1319 for occurrence in occurrences: 1493 for occurrence in occurrences:
1320 if occurrence != '': 1494 if occurrence != '':
1321 try: 1495 try:
1322 fil, line = occurrence.split(':') 1496 fil, line = occurrence.split(':')
1323 if not line.isdigit(): 1497 if not line.isdigit():
1324 fil = fil + line 1498 fil = fil + line
1325 line = '' 1499 line = ''
1326 self.current_entry.occurrences.append((fil, line)) 1500 self.current_entry.occurrences.append((fil, line))
1327 except: 1501 except (ValueError, AttributeError):
1328 self.current_entry.occurrences.append((occurrence, '')) 1502 self.current_entry.occurrences.append((occurrence, ''))
1329 return True 1503 return True
1330 1504
1331 def handle_fl(self): 1505 def handle_fl(self):
1332 """Handle a flags line.""" 1506 """Handle a flags line."""
1333 if self.current_state in ['MC', 'MS', 'MX']: 1507 if self.current_state in ['mc', 'ms', 'mx']:
1334 self.instance.append(self.current_entry) 1508 self.instance.append(self.current_entry)
1335 self.current_entry = POEntry() 1509 self.current_entry = POEntry(linenum=self.current_line)
1336 self.current_entry.flags += self.current_token[3:].split(', ') 1510 self.current_entry.flags += [c.strip() for c in
1511 self.current_token[3:].split(',')]
1337 return True 1512 return True
1338 1513
1339 def handle_pp(self): 1514 def handle_pp(self):
1340 """Handle a previous msgid_plural line.""" 1515 """Handle a previous msgid_plural line."""
1341 if self.current_state in ['MC', 'MS', 'MX']: 1516 if self.current_state in ['mc', 'ms', 'mx']:
1342 self.instance.append(self.current_entry) 1517 self.instance.append(self.current_entry)
1343 self.current_entry = POEntry() 1518 self.current_entry = POEntry(linenum=self.current_line)
1344 self.current_entry.previous_msgid_plural = \ 1519 self.current_entry.previous_msgid_plural = \
1345 unescape(self.current_token[1:-1]) 1520 unescape(self.current_token[1:-1])
1346 return True 1521 return True
1347 1522
1348 def handle_pm(self): 1523 def handle_pm(self):
1349 """Handle a previous msgid line.""" 1524 """Handle a previous msgid line."""
1350 if self.current_state in ['MC', 'MS', 'MX']: 1525 if self.current_state in ['mc', 'ms', 'mx']:
1351 self.instance.append(self.current_entry) 1526 self.instance.append(self.current_entry)
1352 self.current_entry = POEntry() 1527 self.current_entry = POEntry(linenum=self.current_line)
1353 self.current_entry.previous_msgid = \ 1528 self.current_entry.previous_msgid = \
1354 unescape(self.current_token[1:-1]) 1529 unescape(self.current_token[1:-1])
1355 return True 1530 return True
1356 1531
1357 def handle_pc(self): 1532 def handle_pc(self):
1358 """Handle a previous msgctxt line.""" 1533 """Handle a previous msgctxt line."""
1359 if self.current_state in ['MC', 'MS', 'MX']: 1534 if self.current_state in ['mc', 'ms', 'mx']:
1360 self.instance.append(self.current_entry) 1535 self.instance.append(self.current_entry)
1361 self.current_entry = POEntry() 1536 self.current_entry = POEntry(linenum=self.current_line)
1362 self.current_entry.previous_msgctxt = \ 1537 self.current_entry.previous_msgctxt = \
1363 unescape(self.current_token[1:-1]) 1538 unescape(self.current_token[1:-1])
1364 return True 1539 return True
1365 1540
1366 def handle_ct(self): 1541 def handle_ct(self):
1367 """Handle a msgctxt.""" 1542 """Handle a msgctxt."""
1368 if self.current_state in ['MC', 'MS', 'MX']: 1543 if self.current_state in ['mc', 'ms', 'mx']:
1369 self.instance.append(self.current_entry) 1544 self.instance.append(self.current_entry)
1370 self.current_entry = POEntry() 1545 self.current_entry = POEntry(linenum=self.current_line)
1371 self.current_entry.msgctxt = unescape(self.current_token[1:-1]) 1546 self.current_entry.msgctxt = unescape(self.current_token[1:-1])
1372 return True 1547 return True
1373 1548
1374 def handle_mi(self): 1549 def handle_mi(self):
1375 """Handle a msgid.""" 1550 """Handle a msgid."""
1376 if self.current_state in ['MC', 'MS', 'MX']: 1551 if self.current_state in ['mc', 'ms', 'mx']:
1377 self.instance.append(self.current_entry) 1552 self.instance.append(self.current_entry)
1378 self.current_entry = POEntry() 1553 self.current_entry = POEntry(linenum=self.current_line)
1379 self.current_entry.obsolete = self.entry_obsolete 1554 self.current_entry.obsolete = self.entry_obsolete
1380 self.current_entry.msgid = unescape(self.current_token[1:-1]) 1555 self.current_entry.msgid = unescape(self.current_token[1:-1])
1381 return True 1556 return True
1382 1557
1383 def handle_mp(self): 1558 def handle_mp(self):
1390 self.current_entry.msgstr = unescape(self.current_token[1:-1]) 1565 self.current_entry.msgstr = unescape(self.current_token[1:-1])
1391 return True 1566 return True
1392 1567
1393 def handle_mx(self): 1568 def handle_mx(self):
1394 """Handle a msgstr plural.""" 1569 """Handle a msgstr plural."""
1395 index, value = self.current_token[7], self.current_token[11:-1] 1570 index = self.current_token[7]
1396 self.current_entry.msgstr_plural[index] = unescape(value) 1571 value = self.current_token[self.current_token.find('"') + 1:-1]
1397 self.msgstr_index = index 1572 self.current_entry.msgstr_plural[int(index)] = unescape(value)
1573 self.msgstr_index = int(index)
1398 return True 1574 return True
1399 1575
def handle_mc(self):
    """
    Handle a msgid or msgstr continuation line.

    The unescaped token is appended to whichever field of the current
    entry matches the current parser state. Unknown states leave the
    entry untouched.

    Always returns False so that the current state is not changed.
    """
    token = unescape(self.current_token[1:-1])
    # parser state -> name of the entry attribute being continued
    fields_by_state = {
        'ct': 'msgctxt',
        'mi': 'msgid',
        'mp': 'msgid_plural',
        'ms': 'msgstr',
        'pp': 'previous_msgid_plural',
        'pm': 'previous_msgid',
        'pc': 'previous_msgctxt',
    }
    state = self.current_state
    if state == 'mx':
        # plural translations are keyed by the index of the last
        # msgstr[N] line seen
        self.current_entry.msgstr_plural[self.msgstr_index] += token
    elif state in fields_by_state:
        entry = self.current_entry
        field = fields_by_state[state]
        setattr(entry, field, getattr(entry, field) + token)
    # don't change the current state
    return False
1432
1433 # }}} 1597 # }}}
1434 # class _MOFileParser {{{ 1598 # class _MOFileParser {{{
1599
1435 1600
1436 class _MOFileParser(object): 1601 class _MOFileParser(object):
1437 """ 1602 """
1438 A class to parse binary mo files. 1603 A class to parse binary mo files.
1439 """ 1604 """
1454 ``check_for_duplicates`` 1619 ``check_for_duplicates``
1455 whether to check for duplicate entries when adding entries to the 1620 whether to check for duplicate entries when adding entries to the
1456 file (optional, default: ``False``). 1621 file (optional, default: ``False``).
1457 """ 1622 """
1458 self.fhandle = open(mofile, 'rb') 1623 self.fhandle = open(mofile, 'rb')
1459 self.instance = MOFile( 1624
1625 klass = kwargs.get('klass')
1626 if klass is None:
1627 klass = MOFile
1628 self.instance = klass(
1460 fpath=mofile, 1629 fpath=mofile,
1461 encoding=kwargs.get('encoding', default_encoding), 1630 encoding=kwargs.get('encoding', default_encoding),
1462 check_for_duplicates=kwargs.get('check_for_duplicates', False) 1631 check_for_duplicates=kwargs.get('check_for_duplicates', False)
1463 ) 1632 )
1464 1633
def __del__(self):
    """
    Make sure the file is closed, this prevents warnings on unclosed file
    when running tests with python >= 3.2.
    """
    # the attribute is missing when the constructor failed before
    # assigning it (e.g. open() raised); looking it up with getattr keeps
    # finalisation from raising AttributeError in that case
    fhandle = getattr(self, 'fhandle', None)
    if fhandle:
        fhandle.close()
1641
1465 def parse(self): 1642 def parse(self):
1466 """ 1643 """
1467 Build the instance with the file handle provided in the 1644 Build the instance with the file handle provided in the
1468 constructor. 1645 constructor.
1469 """ 1646 """
1470 # parse magic number 1647 # parse magic number
1471 magic_number = self._readbinary('<I', 4) 1648 magic_number = self._readbinary('<I', 4)
1472 if magic_number == MOFile.LITTLE_ENDIAN: 1649 if magic_number == MOFile.MAGIC:
1473 ii = '<II' 1650 ii = '<II'
1474 elif magic_number == MOFile.BIG_ENDIAN: 1651 elif magic_number == MOFile.MAGIC_SWAPPED:
1475 ii = '>II' 1652 ii = '>II'
1476 else: 1653 else:
1477 raise IOError('Invalid mo file, magic number is incorrect !') 1654 raise IOError('Invalid mo file, magic number is incorrect !')
1478 self.instance.magic_number = magic_number 1655 self.instance.magic_number = magic_number
1479 # parse the version number and the number of strings 1656 # parse the version number and the number of strings
1480 self.instance.version, numofstrings = self._readbinary(ii, 8) 1657 version, numofstrings = self._readbinary(ii, 8)
1658 # from MO file format specs: "A program seeing an unexpected major
1659 # revision number should stop reading the MO file entirely"
1660 if version not in (0, 1):
1661 raise IOError('Invalid mo file, unexpected major revision number')
1662 self.instance.version = version
1481 # original strings and translation strings hash table offset 1663 # original strings and translation strings hash table offset
1482 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8) 1664 msgids_hash_offset, msgstrs_hash_offset = self._readbinary(ii, 8)
1483 # move to msgid hash table and read length and offset of msgids 1665 # move to msgid hash table and read length and offset of msgids
1484 self.fhandle.seek(msgids_hash_offset) 1666 self.fhandle.seek(msgids_hash_offset)
1485 msgids_index = [] 1667 msgids_index = []
1489 self.fhandle.seek(msgstrs_hash_offset) 1671 self.fhandle.seek(msgstrs_hash_offset)
1490 msgstrs_index = [] 1672 msgstrs_index = []
1491 for i in range(numofstrings): 1673 for i in range(numofstrings):
1492 msgstrs_index.append(self._readbinary(ii, 8)) 1674 msgstrs_index.append(self._readbinary(ii, 8))
1493 # build entries 1675 # build entries
1676 encoding = self.instance.encoding
1494 for i in range(numofstrings): 1677 for i in range(numofstrings):
1495 self.fhandle.seek(msgids_index[i][1]) 1678 self.fhandle.seek(msgids_index[i][1])
1496 msgid = self.fhandle.read(msgids_index[i][0]) 1679 msgid = self.fhandle.read(msgids_index[i][0])
1680
1497 self.fhandle.seek(msgstrs_index[i][1]) 1681 self.fhandle.seek(msgstrs_index[i][1])
1498 msgstr = self.fhandle.read(msgstrs_index[i][0]) 1682 msgstr = self.fhandle.read(msgstrs_index[i][0])
1499 if i == 0: # metadata 1683 if i == 0 and not msgid: # metadata
1500 raw_metadata, metadata = msgstr.split('\n'), {} 1684 raw_metadata, metadata = msgstr.split(b('\n')), {}
1501 for line in raw_metadata: 1685 for line in raw_metadata:
1502 tokens = line.split(':', 1) 1686 tokens = line.split(b(':'), 1)
1503 if tokens[0] != '': 1687 if tokens[0] != b(''):
1504 try: 1688 try:
1505 metadata[tokens[0]] = tokens[1].strip() 1689 k = tokens[0].decode(encoding)
1690 v = tokens[1].decode(encoding)
1691 metadata[k] = v.strip()
1506 except IndexError: 1692 except IndexError:
1507 metadata[tokens[0]] = '' 1693 metadata[k] = u('')
1508 self.instance.metadata = metadata 1694 self.instance.metadata = metadata
1509 continue 1695 continue
1510 # test if we have a plural entry 1696 # test if we have a plural entry
1511 msgid_tokens = msgid.split('\0') 1697 msgid_tokens = msgid.split(b('\0'))
1512 if len(msgid_tokens) > 1: 1698 if len(msgid_tokens) > 1:
1513 entry = self._build_entry( 1699 entry = self._build_entry(
1514 msgid=msgid_tokens[0], 1700 msgid=msgid_tokens[0],
1515 msgid_plural=msgid_tokens[1], 1701 msgid_plural=msgid_tokens[1],
1516 msgstr_plural=dict((k,v) for k,v in enumerate(msgstr.split('\0'))) 1702 msgstr_plural=dict((k, v) for k, v in
1703 enumerate(msgstr.split(b('\0'))))
1517 ) 1704 )
1518 else: 1705 else:
1519 entry = self._build_entry(msgid=msgid, msgstr=msgstr) 1706 entry = self._build_entry(msgid=msgid, msgstr=msgstr)
1520 self.instance.append(entry) 1707 self.instance.append(entry)
1521 # close opened file 1708 # close opened file
1522 self.fhandle.close() 1709 self.fhandle.close()
1523 return self.instance 1710 return self.instance
1524 1711
def _build_entry(self, msgid, msgstr=None, msgid_plural=None,
                 msgstr_plural=None):
    """
    Build a ``MOEntry`` from raw (undecoded) strings.

    ``msgid``
        raw msgid; when it contains an EOT byte (``\\x04``) the part
        before the first EOT is used as msgctxt and the rest as msgid.

    ``msgstr``, ``msgid_plural``
        raw strings, ignored when empty or ``None``.

    ``msgstr_plural``
        dict mapping plural indexes to raw strings, ignored when empty
        or ``None``. The dict passed in is left untouched.

    Everything is decoded with the encoding of the file instance.
    """
    encoding = self.instance.encoding
    # split on the first EOT only, so that a msgid which itself contains
    # an EOT byte is kept whole instead of being truncated
    msgctxt_msgid = msgid.split(b('\x04'), 1)
    if len(msgctxt_msgid) > 1:
        kwargs = {
            'msgctxt': msgctxt_msgid[0].decode(encoding),
            'msgid': msgctxt_msgid[1].decode(encoding),
        }
    else:
        kwargs = {'msgid': msgid.decode(encoding)}
    if msgstr:
        kwargs['msgstr'] = msgstr.decode(encoding)
    if msgid_plural:
        kwargs['msgid_plural'] = msgid_plural.decode(encoding)
    if msgstr_plural:
        # decode into a new dict rather than overwriting the values of
        # the caller's dict
        kwargs['msgstr_plural'] = dict(
            (index, raw.decode(encoding))
            for index, raw in msgstr_plural.items()
        )
    return MOEntry(**kwargs)
1542 1732
1543 def _readbinary(self, fmt, numbytes): 1733 def _readbinary(self, fmt, numbytes):
1544 """ 1734 """
1548 bytes = self.fhandle.read(numbytes) 1738 bytes = self.fhandle.read(numbytes)
1549 tup = struct.unpack(fmt, bytes) 1739 tup = struct.unpack(fmt, bytes)
1550 if len(tup) == 1: 1740 if len(tup) == 1:
1551 return tup[0] 1741 return tup[0]
1552 return tup 1742 return tup
1553
1554 # }}} 1743 # }}}
1744 # class TextWrapper {{{
1745
1746
class TextWrapper(textwrap.TextWrapper):
    """
    Subclass of textwrap.TextWrapper that backports the
    drop_whitespace option.
    """
    def __init__(self, *args, **kwargs):
        # take the option out of kwargs before delegating to the base
        # constructor, then record it on the instance
        keep_dropping = kwargs.pop('drop_whitespace', True)
        textwrap.TextWrapper.__init__(self, *args, **kwargs)
        self.drop_whitespace = keep_dropping

    def _wrap_chunks(self, chunks):
        """_wrap_chunks(chunks : [string]) -> [string]

        Turn a sequence of text chunks into a list of lines that are at
        most 'self.width' characters long (lines may be longer when
        'break_long_words' is false). A chunk is either a "word" or a
        run of whitespace and is never split (modulo
        'break_long_words'); line breaks only happen between chunks.
        When 'drop_whitespace' is set, whitespace chunks are removed
        from the beginning and end of lines; other whitespace is kept.

        The 'chunks' list is consumed in place.
        """
        if self.width <= 0:
            raise ValueError("invalid width %r (must be > 0)" % self.width)

        wrapped = []

        # Work on the reversed list so that taking the next chunk is a
        # cheap pop from the end of a stack of chunks.
        chunks.reverse()

        while chunks:
            # The first line and the following ones may have different
            # prefixes; the prefix eats into the available width.
            prefix = self.subsequent_indent if wrapped else self.initial_indent
            room = self.width - len(prefix)

            # A whitespace chunk at the start of a line is dropped,
            # except at the very beginning of the text.
            if wrapped and self.drop_whitespace and chunks[-1].strip() == '':
                del chunks[-1]

            # Chunks collected for the current line and their total
            # length.
            pieces = []
            used = 0

            # Take chunks for as long as the next one still fits.
            while chunks and used + len(chunks[-1]) <= room:
                piece = chunks.pop()
                pieces.append(piece)
                used += len(piece)

            # The line is full and the next chunk would not fit on any
            # line at all, not only on this one.
            if chunks and len(chunks[-1]) > room:
                self._handle_long_word(chunks, pieces, used, room)

            # Trailing whitespace on the line is dropped as well.
            if self.drop_whitespace and pieces and pieces[-1].strip() == '':
                pieces.pop()

            # Glue the line back together and keep it, unless nothing
            # is left of it.
            if pieces:
                wrapped.append(prefix + ''.join(pieces))

        return wrapped
1826 # }}}
1827 # function wrap() {{{
1828
1829
def wrap(text, width=70, **kwargs):
    """
    Wrap a single paragraph of text and return the wrapped lines as a
    list.

    On python >= 2.6 this delegates to ``textwrap.wrap``; older versions
    go through the ``TextWrapper`` subclass defined in this module.
    Extra keyword arguments are passed through unchanged.
    """
    if sys.version_info >= (2, 6):
        return textwrap.wrap(text, width=width, **kwargs)
    return TextWrapper(width=width, **kwargs).wrap(text)
1837
1838 # }}}