changeset 12717:89df79b3c011 stable

convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)

This is a followup to 4481f8a93c7a, which only fixed the conversion of patches with UTF-8 metadata. This patch allows a changelog to have any bytes with values 0x7F-0xFF. It parses the XML changelog as Latin-1 and uses converter_source.recode() to decode the data as UTF-8/Latin-1.

Caveats:
- Since the convert extension doesn't provide any way to specify the source encoding, users are still limited to UTF-8 and Latin-1.
- etree will still complain if the changelog has control bytes with values 0x00-0x1F (other than tab, LF, and CR). XML only allows printable characters plus those three whitespace characters.
author Brodie Rao <brodie@bitheap.org>
date Fri, 01 Oct 2010 10:15:04 -0500
parents f314723f36f5
children 372abc799caa 7adb1274a4f9
files hgext/convert/darcs.py tests/test-convert-darcs tests/test-convert-darcs.out
diffstat 3 files changed, 40 insertions(+), 15 deletions(-) [+]
line wrap: on
line diff
--- a/hgext/convert/darcs.py	Tue Oct 05 11:34:13 2010 +0200
+++ b/hgext/convert/darcs.py	Fri Oct 01 10:15:04 2010 -0500
@@ -7,22 +7,22 @@
 
 from common import NoRepo, checktool, commandline, commit, converter_source
 from mercurial.i18n import _
-from mercurial import util
+from mercurial import encoding, util
 import os, shutil, tempfile, re
 
 # The naming drift of ElementTree is fun!
 
 try:
-    from xml.etree.cElementTree import ElementTree
+    from xml.etree.cElementTree import ElementTree, XMLParser
 except ImportError:
     try:
-        from xml.etree.ElementTree import ElementTree
+        from xml.etree.ElementTree import ElementTree, XMLParser
     except ImportError:
         try:
-            from elementtree.cElementTree import ElementTree
+            from elementtree.cElementTree import ElementTree, XMLParser
         except ImportError:
             try:
-                from elementtree.ElementTree import ElementTree
+                from elementtree.ElementTree import ElementTree, XMLParser
             except ImportError:
                 ElementTree = None
 
@@ -88,12 +88,24 @@
         self.ui.debug('cleaning up %s\n' % self.tmppath)
         shutil.rmtree(self.tmppath, ignore_errors=True)
 
+    def recode(self, s, encoding=None):
+        if isinstance(s, unicode):
+            # XMLParser returns unicode objects for anything it can't
+            # encode into ASCII. We convert them back to str to get
+            # recode's normal conversion behavior.
+            s = s.encode('latin-1')
+        return super(darcs_source, self).recode(s, encoding)
+
     def xml(self, cmd, **kwargs):
         # NOTE: darcs is currently encoding agnostic and will print
         # patch metadata byte-for-byte, even in the XML changelog.
         etree = ElementTree()
+        # While we are decoding the XML as latin-1 to be as liberal as
+        # possible, etree will still raise an exception if any
+        # non-printable characters are in the XML changelog.
+        parser = XMLParser(encoding='latin-1')
         fp = self._run(cmd, **kwargs)
-        etree.parse(fp)
+        etree.parse(fp, parser=parser)
         self.checkexit(fp.close())
         return etree.getroot()
 
--- a/tests/test-convert-darcs	Tue Oct 05 11:34:13 2010 +0200
+++ b/tests/test-convert-darcs	Fri Oct 01 10:15:04 2010 -0500
@@ -65,9 +65,15 @@
 # darcs is encoding agnostic, so it takes whatever bytes it's given
 darcs record -a -l -m 'p4: desc ñ' -A 'author ñ'
 
+echo % test latin-1 commit message
+echo h > h
+printf "p5: desc " > ../p5
+python -c 'print "".join([chr(i) for i in range(128, 256)])' >> ../p5
+darcs record -a -l --logfile ../p5
+
 glog()
 {
-    HGENCODING=utf-8 hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
+    hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
 }
 
 cd ..
@@ -78,6 +84,7 @@
 # Unfortunately, non-conflicting changes, like the addition of the
 # "c" file in p1.1 patch are reverted too.
 # Just to say that manifest not listing "c" here is a bug.
-glog -R darcs-repo-hg
+HGENCODING=latin-1 glog -R darcs-repo-hg -r 6 | "$TESTDIR"/printrepr.py
+HGENCODING=utf-8 glog -R darcs-repo-hg -r 0:5 | "$TESTDIR"/printrepr.py
 hg up -q -R darcs-repo-hg
 hg -R darcs-repo-hg manifest --debug
--- a/tests/test-convert-darcs.out	Tue Oct 05 11:34:13 2010 +0200
+++ b/tests/test-convert-darcs.out	Fri Oct 01 10:15:04 2010 -0500
@@ -16,17 +16,22 @@
 Finished recording patch 'p3'
 % test utf-8 commit message and author
 Finished recording patch 'p4: desc ñ'
+% test latin-1 commit message
+Finished recording patch 'p5: desc '
 initializing destination darcs-repo-hg repository
 scanning source...
 sorting...
 converting...
-5 p0
-4 p1.2
-3 p1.1
-2 p2
-1 p3
-0 p4: desc ?
-o  5 "p4: desc ñ" (author ñ) files: g
+6 p0
+5 p1.2
+4 p1.1
+3 p2
+2 p3
+1 p4: desc ?
+0 p5: desc ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
+o  6 "p5: desc \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" (test@example.org) files: h
+|
+o  5 "p4: desc \xc3\xb1" (author \xc3\xb1) files: g
 |
 o  4 "p3" (test@example.org) files: dir/d dir/d2 dir2/d f ff
 |
@@ -43,3 +48,4 @@
 37406831adc447ec2385014019599dfec953c806 644   dir2/d
 b783a337463792a5c7d548ad85a7d3253c16ba8c 644   ff
 0973eb1b2ecc4de7fafe7447ce1b7462108b4848 644   g
+fe6f8b4f507fe3eb524c527192a84920a4288dac 644   h