comparison mercurial/util.py @ 11297:d320e70442a5

replace Python standard textwrap by MBCS sensitive one for i18n text. Mercurial has a problem with text wrapping/filling in MBCS encoding environments, because the standard 'textwrap' module of Python cannot handle such text correctly: it can split the byte sequence of a single character across two lines. According to the Unicode specification, "east asian width" classifies characters into: W(ide), N(arrow), F(ull-width), H(alf-width), A(mbiguous). W/N/F/H can always be recognized as 2/1/2/1 bytes in a byte sequence, but 'A' cannot. The size of 'A' depends on the language in which it is used. The Unicode specification says: If the context (= language) cannot be established reliably they should be treated as narrow characters by default. However, many class 'A' characters are full-width, at least in a Japanese environment. So, this patch always treats class 'A' characters as full-width, for safe wrapping. This patch focuses only on MBCS safety, not on wrapping that strictly follows the writing/printing rules of each language. The MBCS sensitive textwrap class was originally implemented by ITO Nobuaki <daydream.trippers@gmail.com>.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sun, 06 Jun 2010 17:20:10 +0900
parents 94b7b3a1ae1b
children c37f35d7f2f5
comparison
equal deleted inserted replaced
11296:0054a328b98f 11297:d320e70442a5
14 """ 14 """
15 15
16 from i18n import _ 16 from i18n import _
17 import error, osutil, encoding 17 import error, osutil, encoding
18 import cStringIO, errno, re, shutil, sys, tempfile, traceback 18 import cStringIO, errno, re, shutil, sys, tempfile, traceback
19 import os, stat, time, calendar, textwrap, signal 19 import os, stat, time, calendar, textwrap, unicodedata, signal
20 import imp 20 import imp
21 21
22 # Python compatibility 22 # Python compatibility
23 23
24 def sha1(s): 24 def sha1(s):
1255 1255
1256 def uirepr(s): 1256 def uirepr(s):
1257 # Avoid double backslash in Windows path repr() 1257 # Avoid double backslash in Windows path repr()
1258 return repr(s).replace('\\\\', '\\') 1258 return repr(s).replace('\\\\', '\\')
1259 1259
1260 def wrap(line, hangindent, width=None): 1260 #### naming convention of below implementation follows 'textwrap' module
1261
1262 class MBTextWrapper(textwrap.TextWrapper):
1263 def __init__(self, **kwargs):
1264 textwrap.TextWrapper.__init__(self, **kwargs)
1265
1266 def _cutdown(self, str, space_left):
1267 l = 0
1268 ucstr = unicode(str, encoding.encoding)
1269 w = unicodedata.east_asian_width
1270 for i in xrange(len(ucstr)):
1271 l += w(ucstr[i]) in 'WFA' and 2 or 1
1272 if space_left < l:
1273 return (ucstr[:i].encode(encoding.encoding),
1274 ucstr[i:].encode(encoding.encoding))
1275 return str, ''
1276
1277 # ----------------------------------------
1278 # overriding of base class
1279
1280 def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
1281 space_left = max(width - cur_len, 1)
1282
1283 if self.break_long_words:
1284 cut, res = self._cutdown(reversed_chunks[-1], space_left)
1285 cur_line.append(cut)
1286 reversed_chunks[-1] = res
1287 elif not cur_line:
1288 cur_line.append(reversed_chunks.pop())
1289
1290 #### naming convention of above implementation follows 'textwrap' module
1291
1292 def wrap(line, width=None, initindent='', hangindent=''):
1261 if width is None: 1293 if width is None:
1262 width = termwidth() - 2 1294 width = termwidth() - 2
1263 if width <= hangindent: 1295 maxindent = max(len(hangindent), len(initindent))
1296 if width <= maxindent:
1264 # adjust for weird terminal size 1297 # adjust for weird terminal size
1265 width = max(78, hangindent + 1) 1298 width = max(78, maxindent + 1)
1266 padding = '\n' + ' ' * hangindent 1299 wrapper = MBTextWrapper(width=width,
1267 # To avoid corrupting multi-byte characters in line, we must wrap 1300 initial_indent=initindent,
1268 # a Unicode string instead of a bytestring. 1301 subsequent_indent=hangindent)
1269 try: 1302 return wrapper.fill(line)
1270 u = line.decode(encoding.encoding)
1271 w = padding.join(textwrap.wrap(u, width=width - hangindent))
1272 return w.encode(encoding.encoding)
1273 except UnicodeDecodeError:
1274 return padding.join(textwrap.wrap(line, width=width - hangindent))
1275 1303
1276 def iterlines(iterator): 1304 def iterlines(iterator):
1277 for chunk in iterator: 1305 for chunk in iterator:
1278 for line in chunk.splitlines(): 1306 for line in chunk.splitlines():
1279 yield line 1307 yield line