# HG changeset patch # User Brodie Rao # Date 1285946104 18000 # Node ID 89df79b3c011f7824cac6c03c59c1ca4252dbfc2 # Parent f314723f36f514aa05b1f275a5bef8f0c42a251d convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411) This is a followup to 4481f8a93c7a, which only fixed the conversion of patches with UTF-8 metadata. This patch allows a changelog to have any bytes with values 0x7F-0xFF. It parses the XML changelog as Latin-1 and uses converter_source.recode() to decode the data as UTF-8/Latin-1. Caveats: - Since the convert extension doesn't provide any way to specify the source encoding, users are still limited to UTF-8 and Latin-1. - etree will still complain if the changelog has bytes with values 0x00-0x1F (other than tab, LF and CR). XML does not allow those control characters. diff -r f314723f36f5 -r 89df79b3c011 hgext/convert/darcs.py --- a/hgext/convert/darcs.py Tue Oct 05 11:34:13 2010 +0200 +++ b/hgext/convert/darcs.py Fri Oct 01 10:15:04 2010 -0500 @@ -7,22 +7,22 @@ from common import NoRepo, checktool, commandline, commit, converter_source from mercurial.i18n import _ -from mercurial import util +from mercurial import encoding, util import os, shutil, tempfile, re # The naming drift of ElementTree is fun! 
try: - from xml.etree.cElementTree import ElementTree + from xml.etree.cElementTree import ElementTree, XMLParser except ImportError: try: - from xml.etree.ElementTree import ElementTree + from xml.etree.ElementTree import ElementTree, XMLParser except ImportError: try: - from elementtree.cElementTree import ElementTree + from elementtree.cElementTree import ElementTree, XMLParser except ImportError: try: - from elementtree.ElementTree import ElementTree + from elementtree.ElementTree import ElementTree, XMLParser except ImportError: ElementTree = None @@ -88,12 +88,24 @@ self.ui.debug('cleaning up %s\n' % self.tmppath) shutil.rmtree(self.tmppath, ignore_errors=True) + def recode(self, s, encoding=None): + if isinstance(s, unicode): + # XMLParser returns unicode objects for anything it can't + # encode into ASCII. We convert them back to str to get + # recode's normal conversion behavior. + s = s.encode('latin-1') + return super(darcs_source, self).recode(s, encoding) + def xml(self, cmd, **kwargs): # NOTE: darcs is currently encoding agnostic and will print # patch metadata byte-for-byte, even in the XML changelog. etree = ElementTree() + # While we are decoding the XML as latin-1 to be as liberal as + # possible, etree will still raise an exception if any + # non-printable characters are in the XML changelog. 
+ parser = XMLParser(encoding='latin-1') fp = self._run(cmd, **kwargs) - etree.parse(fp) + etree.parse(fp, parser=parser) self.checkexit(fp.close()) return etree.getroot() diff -r f314723f36f5 -r 89df79b3c011 tests/test-convert-darcs --- a/tests/test-convert-darcs Tue Oct 05 11:34:13 2010 +0200 +++ b/tests/test-convert-darcs Fri Oct 01 10:15:04 2010 -0500 @@ -65,9 +65,15 @@ # darcs is encoding agnostic, so it takes whatever bytes it's given darcs record -a -l -m 'p4: desc ñ' -A 'author ñ' +echo % test latin-1 commit message +echo h > h +printf "p5: desc " > ../p5 +python -c 'print "".join([chr(i) for i in range(128, 256)])' >> ../p5 +darcs record -a -l --logfile ../p5 + glog() { - HGENCODING=utf-8 hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@" + hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@" } cd .. @@ -78,6 +84,7 @@ # Unfortunately, non-conflicting changes, like the addition of the # "c" file in p1.1 patch are reverted too. # Just to say that manifest not listing "c" here is a bug. -glog -R darcs-repo-hg +HGENCODING=latin-1 glog -R darcs-repo-hg -r 6 | "$TESTDIR"/printrepr.py +HGENCODING=utf-8 glog -R darcs-repo-hg -r 0:5 | "$TESTDIR"/printrepr.py hg up -q -R darcs-repo-hg hg -R darcs-repo-hg manifest --debug diff -r f314723f36f5 -r 89df79b3c011 tests/test-convert-darcs.out --- a/tests/test-convert-darcs.out Tue Oct 05 11:34:13 2010 +0200 +++ b/tests/test-convert-darcs.out Fri Oct 01 10:15:04 2010 -0500 @@ -16,17 +16,22 @@ Finished recording patch 'p3' % test utf-8 commit message and author Finished recording patch 'p4: desc ñ' +% test latin-1 commit message +Finished recording patch 'p5: desc ' initializing destination darcs-repo-hg repository scanning source... sorting... converting... -5 p0 -4 p1.2 -3 p1.1 -2 p2 -1 p3 -0 p4: desc ? -o 5 "p4: desc ñ" (author ñ) files: g +6 p0 +5 p1.2 +4 p1.1 +3 p2 +2 p3 +1 p4: desc ? 
+0 p5: desc ???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? +o 6 "p5: desc \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" (test@example.org) files: h +| +o 5 "p4: desc \xc3\xb1" (author \xc3\xb1) files: g | o 4 "p3" (test@example.org) files: dir/d dir/d2 dir2/d f ff | @@ -43,3 +48,4 @@ 37406831adc447ec2385014019599dfec953c806 644 dir2/d b783a337463792a5c7d548ad85a7d3253c16ba8c 644 ff 0973eb1b2ecc4de7fafe7447ce1b7462108b4848 644 g +fe6f8b4f507fe3eb524c527192a84920a4288dac 644 h