convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)
This is a followup to 4481f8a93c7a, which only fixed the conversion of
patches with UTF-8 metadata.
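This patch allows a changelog to contain any bytes with values
0x7F-0xFF. It parses the XML changelog as Latin-1, so that each of
those bytes maps to a character instead of causing a UTF-8 decoding
error, and then uses converter_source.recode() to decode the data as
UTF-8/Latin-1.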
Caveats:
- Since the convert extension doesn't provide any way to specify the
source encoding, users are still limited to UTF-8 and Latin-1.
- etree will still complain if the changelog has bytes with values
0x00-0x1F other than tab (0x09), LF (0x0A), and CR (0x0D). XML 1.0
does not allow any other control characters in that range.
--- a/hgext/convert/darcs.py Tue Oct 05 11:34:13 2010 +0200
+++ b/hgext/convert/darcs.py Fri Oct 01 10:15:04 2010 -0500
@@ -7,22 +7,22 @@
from common import NoRepo, checktool, commandline, commit, converter_source
from mercurial.i18n import _
-from mercurial import util
+from mercurial import encoding, util
import os, shutil, tempfile, re
# The naming drift of ElementTree is fun!
try:
- from xml.etree.cElementTree import ElementTree
+ from xml.etree.cElementTree import ElementTree, XMLParser
except ImportError:
try:
- from xml.etree.ElementTree import ElementTree
+ from xml.etree.ElementTree import ElementTree, XMLParser
except ImportError:
try:
- from elementtree.cElementTree import ElementTree
+ from elementtree.cElementTree import ElementTree, XMLParser
except ImportError:
try:
- from elementtree.ElementTree import ElementTree
+ from elementtree.ElementTree import ElementTree, XMLParser
except ImportError:
ElementTree = None
@@ -88,12 +88,24 @@
self.ui.debug('cleaning up %s\n' % self.tmppath)
shutil.rmtree(self.tmppath, ignore_errors=True)
+ def recode(self, s, encoding=None):
+ if isinstance(s, unicode):
+ # XMLParser returns unicode objects for anything it can't
+ # encode into ASCII. We convert them back to str to get
+ # recode's normal conversion behavior.
+ s = s.encode('latin-1')
+ return super(darcs_source, self).recode(s, encoding)
+
def xml(self, cmd, **kwargs):
# NOTE: darcs is currently encoding agnostic and will print
# patch metadata byte-for-byte, even in the XML changelog.
etree = ElementTree()
+ # While we are decoding the XML as latin-1 to be as liberal as
+ # possible, etree will still raise an exception if any
+ # non-printable characters are in the XML changelog.
+ parser = XMLParser(encoding='latin-1')
fp = self._run(cmd, **kwargs)
- etree.parse(fp)
+ etree.parse(fp, parser=parser)
self.checkexit(fp.close())
return etree.getroot()
--- a/tests/test-convert-darcs Tue Oct 05 11:34:13 2010 +0200
+++ b/tests/test-convert-darcs Fri Oct 01 10:15:04 2010 -0500
@@ -65,9 +65,15 @@
# darcs is encoding agnostic, so it takes whatever bytes it's given
darcs record -a -l -m 'p4: desc ñ' -A 'author ñ'
+echo % test latin-1 commit message
+echo h > h
+printf "p5: desc " > ../p5
+python -c 'print "".join([chr(i) for i in range(128, 256)])' >> ../p5
+darcs record -a -l --logfile ../p5
+
glog()
{
- HGENCODING=utf-8 hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
+ hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
}
cd ..
@@ -78,6 +84,7 @@
# Unfortunately, non-conflicting changes, like the addition of the
# "c" file in p1.1 patch are reverted too.
# Just to say that manifest not listing "c" here is a bug.
-glog -R darcs-repo-hg
+HGENCODING=latin-1 glog -R darcs-repo-hg -r 6 | "$TESTDIR"/printrepr.py
+HGENCODING=utf-8 glog -R darcs-repo-hg -r 0:5 | "$TESTDIR"/printrepr.py
hg up -q -R darcs-repo-hg
hg -R darcs-repo-hg manifest --debug
--- a/tests/test-convert-darcs.out Tue Oct 05 11:34:13 2010 +0200
+++ b/tests/test-convert-darcs.out Fri Oct 01 10:15:04 2010 -0500
@@ -16,17 +16,22 @@
Finished recording patch 'p3'
% test utf-8 commit message and author
Finished recording patch 'p4: desc ñ'
+% test latin-1 commit message
+Finished recording patch 'p5: desc '
initializing destination darcs-repo-hg repository
scanning source...
sorting...
converting...
-5 p0
-4 p1.2
-3 p1.1
-2 p2
-1 p3
-0 p4: desc ?
-o 5 "p4: desc ñ" (author ñ) files: g
+6 p0
+5 p1.2
+4 p1.1
+3 p2
+2 p3
+1 p4: desc ?
+0 p5: desc ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
+o 6 "p5: desc \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" (test@example.org) files: h
+|
+o 5 "p4: desc \xc3\xb1" (author \xc3\xb1) files: g
|
o 4 "p3" (test@example.org) files: dir/d dir/d2 dir2/d f ff
|
@@ -43,3 +48,4 @@
37406831adc447ec2385014019599dfec953c806 644 dir2/d
b783a337463792a5c7d548ad85a7d3253c16ba8c 644 ff
0973eb1b2ecc4de7fafe7447ce1b7462108b4848 644 g
+fe6f8b4f507fe3eb524c527192a84920a4288dac 644 h