minirst: use unicode string as intermediate form for replacement
# this change redoes part of 521c8e0c93bf, which was backed out by 0ad0ebe67815
Some character encodings use ASCII characters other than
control/alphabet/digit as part of multi-byte characters, so directly
replacing such characters in strings in the local encoding can produce
invalid byte sequences.
[mpm: test changed to simple doctest]
--- a/mercurial/minirst.py Mon Oct 31 15:41:39 2011 -0500
+++ b/mercurial/minirst.py Mon Oct 31 21:06:18 2011 +0900
@@ -23,9 +23,27 @@
from i18n import _
def replace(text, substs):
+ '''
+ Apply a list of (find, replace) pairs to a text.
+
+ >>> replace("foo bar", [('f', 'F'), ('b', 'B')])
+ 'Foo Bar'
+ >>> encoding.encoding = 'latin1'
+ >>> replace('\\x81\\\\', [('\\\\', '/')])
+ '\\x81/'
+ >>> encoding.encoding = 'shiftjis'
+ >>> replace('\\x81\\\\', [('\\\\', '/')])
+ '\\x81\\\\'
+ '''
+
+ # some character encodings (cp932 for Japanese, at least) use
+ # ASCII characters other than control/alphabet/digit as part of
+ # multi-byte characters, so directly replacing such characters
+ # in strings in the local encoding causes invalid byte sequences.
+ utext = text.decode(encoding.encoding)
for f, t in substs:
- text = text.replace(f, t)
- return text
+ utext = utext.replace(f, t)
+ return utext.encode(encoding.encoding)
_blockre = re.compile(r"\n(?:\s*\n)+")
--- a/tests/test-doctest.py Mon Oct 31 15:41:39 2011 -0500
+++ b/tests/test-doctest.py Mon Oct 31 21:06:18 2011 +0900
@@ -36,3 +36,6 @@
import mercurial.revset
doctest.testmod(mercurial.revset)
+
+import mercurial.minirst
+doctest.testmod(mercurial.minirst)