diff hgext/convert/darcs.py @ 12717:89df79b3c011 stable

convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)

This is a followup to 4481f8a93c7a, which only fixed the conversion of patches with UTF-8 metadata. This patch allows a changelog to have any bytes with values 0x7F-0xFF. It parses the XML changelog as Latin-1 and uses converter_source.recode() to decode the data as UTF-8/Latin-1.

Caveats:
- Since the convert extension doesn't provide any way to specify the source encoding, users are still limited to UTF-8 and Latin-1.
- etree will still complain if the changelog has bytes with values 0x00-0x1F (other than tab, newline, and carriage return). XML does not allow those control characters.
author Brodie Rao <brodie@bitheap.org>
date Fri, 01 Oct 2010 10:15:04 -0500
parents 84ceedcfeb6a
children 4e5a36eeefd1
line wrap: on
line diff
--- a/hgext/convert/darcs.py	Tue Oct 05 11:34:13 2010 +0200
+++ b/hgext/convert/darcs.py	Fri Oct 01 10:15:04 2010 -0500
@@ -7,22 +7,22 @@
 
 from common import NoRepo, checktool, commandline, commit, converter_source
 from mercurial.i18n import _
-from mercurial import util
+from mercurial import encoding, util
 import os, shutil, tempfile, re
 
 # The naming drift of ElementTree is fun!
 
 try:
-    from xml.etree.cElementTree import ElementTree
+    from xml.etree.cElementTree import ElementTree, XMLParser
 except ImportError:
     try:
-        from xml.etree.ElementTree import ElementTree
+        from xml.etree.ElementTree import ElementTree, XMLParser
     except ImportError:
         try:
-            from elementtree.cElementTree import ElementTree
+            from elementtree.cElementTree import ElementTree, XMLParser
         except ImportError:
             try:
-                from elementtree.ElementTree import ElementTree
+                from elementtree.ElementTree import ElementTree, XMLParser
             except ImportError:
                 ElementTree = None
 
@@ -88,12 +88,24 @@
         self.ui.debug('cleaning up %s\n' % self.tmppath)
         shutil.rmtree(self.tmppath, ignore_errors=True)
 
+    def recode(self, s, encoding=None):
+        if isinstance(s, unicode):
+            # XMLParser returns unicode objects for anything it can't
+            # encode into ASCII. We convert them back to str to get
+            # recode's normal conversion behavior.
+            s = s.encode('latin-1')
+        return super(darcs_source, self).recode(s, encoding)
+
     def xml(self, cmd, **kwargs):
         # NOTE: darcs is currently encoding agnostic and will print
         # patch metadata byte-for-byte, even in the XML changelog.
         etree = ElementTree()
+        # While we are decoding the XML as latin-1 to be as liberal as
+        # possible, etree will still raise an exception if any
+        # non-printable characters are in the XML changelog.
+        parser = XMLParser(encoding='latin-1')
         fp = self._run(cmd, **kwargs)
-        etree.parse(fp)
+        etree.parse(fp, parser=parser)
         self.checkexit(fp.close())
         return etree.getroot()