comparison mercurial/util.py @ 15066:24efa83d81cb stable

i18n: calculate terminal columns by width information of each character. Neither the number of 'bytes' in any encoding nor the number of 'characters' is appropriate for calculating the terminal columns of a specified string. This patch modifies MBTextWrapper by: - overriding '_wrap_chunks()' to make it use 'encoding.colwidth()' instead of the built-in 'len()' for the columns of a string - fixing '_cutdown()' to make it use 'encoding.colwidth()' instead of a local, similar but incorrect implementation. This patch also modifies 'encoding.py' by: - dividing 'colwidth()' into 2 pieces: one for calculating the columns of a specified UNICODE string, and another for the remaining part of the original one. The former is used from MBTextWrapper in 'util.py'. - preventing 'colwidth()' from evaluating the HGENCODINGAMBIGUOUS configuration on each invocation: the 'unicodedata.east_asian_width' check is kept intact to reduce startup cost.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sat, 27 Aug 2011 04:56:12 +0900
parents 24a6c3f903bb
children cc16323e748d 64fbd0de9773
comparison
equal deleted inserted replaced
15065:24a6c3f903bb 15066:24efa83d81cb
14 """ 14 """
15 15
16 from i18n import _ 16 from i18n import _
17 import error, osutil, encoding 17 import error, osutil, encoding
18 import errno, re, shutil, sys, tempfile, traceback 18 import errno, re, shutil, sys, tempfile, traceback
19 import os, time, calendar, textwrap, unicodedata, signal 19 import os, time, calendar, textwrap, signal
20 import imp, socket, urllib 20 import imp, socket, urllib
21 21
22 # Python compatibility 22 # Python compatibility
23 23
24 def sha1(s): 24 def sha1(s):
1133 1133
1134 # delay import of textwrap 1134 # delay import of textwrap
1135 def MBTextWrapper(**kwargs): 1135 def MBTextWrapper(**kwargs):
1136 class tw(textwrap.TextWrapper): 1136 class tw(textwrap.TextWrapper):
1137 """ 1137 """
1138 Extend TextWrapper for double-width characters. 1138 Extend TextWrapper for width-awareness.
1139 1139
1140 Some Asian characters use two terminal columns instead of one. 1140 Neither number of 'bytes' in any encoding nor 'characters' is
1141 A good example of this behavior can be seen with u'\u65e5\u672c', 1141 appropriate to calculate terminal columns for specified string.
1142 the two Japanese characters for "Japan": 1142
1143 len() returns 2, but when printed to a terminal, they eat 4 columns. 1143 Original TextWrapper implementation uses built-in 'len()' directly,
1144 1144 so overriding is needed to use width information of each character.
1145 (Note that this has nothing to do whatsoever with unicode 1145
1146 representation, or encoding of the underlying string) 1146 In addition, characters classified into 'ambiguous' width are
1147 treated as wide in east asian area, but as narrow in other.
1148
1149 This requires user decision to determine width of such characters.
1147 """ 1150 """
1148 def __init__(self, **kwargs): 1151 def __init__(self, **kwargs):
1149 textwrap.TextWrapper.__init__(self, **kwargs) 1152 textwrap.TextWrapper.__init__(self, **kwargs)
1150 1153
1154 # for compatibility between 2.4 and 2.6
1155 if getattr(self, 'drop_whitespace', None) is None:
1156 self.drop_whitespace = kwargs.get('drop_whitespace', True)
1157
def _cutdown(self, ucstr, space_left):
    """Split 'ucstr' at the point where it stops fitting in 'space_left'.

    The width of each character is measured with 'encoding.ucolwidth()'
    and accumulated from the start of the string.

    Returns a '(head, tail)' pair.  'head' holds the characters that
    precede the first one pushing the running width past 'space_left'
    (it is empty when even the first character does not fit), and
    'tail' holds everything from that character on.  When the whole
    string fits, the pair is '(ucstr, '')'.
    """
    used = 0
    colwidth = encoding.ucolwidth
    for i, char in enumerate(ucstr):
        used += colwidth(char)
        if space_left < used:
            return (ucstr[:i], ucstr[i:])
    return ucstr, ''
1159 1166
1160 # overriding of base class 1167 # overriding of base class
1165 cut, res = self._cutdown(reversed_chunks[-1], space_left) 1172 cut, res = self._cutdown(reversed_chunks[-1], space_left)
1166 cur_line.append(cut) 1173 cur_line.append(cut)
1167 reversed_chunks[-1] = res 1174 reversed_chunks[-1] = res
1168 elif not cur_line: 1175 elif not cur_line:
1169 cur_line.append(reversed_chunks.pop()) 1176 cur_line.append(reversed_chunks.pop())
1177
1178 # this overriding code is imported from TextWrapper of python 2.6
1179 # to calculate columns of string by 'encoding.ucolwidth()'
def _wrap_chunks(self, chunks):
    """Wrap a list of text chunks into lines, measuring terminal columns.

    This mirrors TextWrapper._wrap_chunks() of Python 2.6, except that
    the width of every chunk is computed with 'encoding.ucolwidth()'
    instead of the built-in 'len()'.

    'chunks' is consumed: it is reversed in place and emptied while
    the lines are built.

    Returns the list of wrapped lines, each one prefixed with
    'self.initial_indent' (first line) or 'self.subsequent_indent'
    (all following lines).

    Raises ValueError when 'self.width' is not positive.
    """
    colwidth = encoding.ucolwidth

    lines = []
    if self.width <= 0:
        raise ValueError("invalid width %r (must be > 0)" % self.width)

    # Arrange in reverse order so items can be efficiently popped
    # from a stack of chunks.
    chunks.reverse()

    while chunks:

        # Start the list of chunks that will make up the current line.
        # cur_len is the total column width of the chunks in cur_line.
        cur_line = []
        cur_len = 0

        # Figure out which static string will prefix this line.
        if lines:
            indent = self.subsequent_indent
        else:
            indent = self.initial_indent

        # Maximum width for this line.
        # NOTE(review): the indent is still measured with len(), so it
        # is presumably expected to hold single-column characters only
        # -- confirm against callers.
        width = self.width - len(indent)

        # First chunk on line is whitespace -- drop it, unless this
        # is the very beginning of the text (ie. no lines started yet).
        if self.drop_whitespace and chunks[-1].strip() == '' and lines:
            del chunks[-1]

        while chunks:
            l = colwidth(chunks[-1])

            # Can at least squeeze this chunk onto the current line.
            if cur_len + l <= width:
                cur_line.append(chunks.pop())
                cur_len += l

            # Nope, this line is full.
            else:
                break

        # The current line is full, and the next chunk is too big to
        # fit on *any* line (not just this one).
        if chunks and colwidth(chunks[-1]) > width:
            self._handle_long_word(chunks, cur_line, cur_len, width)

            # A single character can be wider than the space available
            # on an empty line (e.g. a double-width character when only
            # one column is usable).  In that case nothing was cut from
            # the chunk, and the loop would never make progress: force
            # one character onto the line, even though it overflows.
            if chunks and not ''.join(cur_line):
                head = chunks[-1]
                cur_line.append(head[:1])
                if head[1:]:
                    chunks[-1] = head[1:]
                else:
                    chunks.pop()

        # If the last chunk on this line is all whitespace, drop it.
        if (self.drop_whitespace and
            cur_line and cur_line[-1].strip() == ''):
            del cur_line[-1]

        # Convert current line back to a string and store it in list
        # of all lines (return value).
        if cur_line:
            lines.append(indent + ''.join(cur_line))

    return lines
1170 1240
1171 global MBTextWrapper 1241 global MBTextWrapper
1172 MBTextWrapper = tw 1242 MBTextWrapper = tw
1173 return tw(**kwargs) 1243 return tw(**kwargs)
1174 1244