Mercurial > hg-stable
comparison mercurial/util.py @ 15066:24efa83d81cb stable
i18n: calculate terminal columns by width information of each character
neither the number of 'bytes' in any encoding nor the number of 'characters'
is appropriate for calculating the terminal columns of a specified string.
this patch modifies MBTextWrapper for:
- overriding '_wrap_chunks()' to make it use 'encoding.colwidth()'
instead of the built-in 'len()' for the columns of a string
- fixing '_cutdown()' to make it use 'encoding.colwidth()' instead
of its local, similar but incorrect implementation
this patch also modifies 'encoding.py':
- dividing 'colwidth()' into 2 pieces: one for calculating the columns of
a specified UNICODE string, and another for the rest of the original
function. the former is used by MBTextWrapper in 'util.py'.
- preventing 'colwidth()' from evaluating the HGENCODINGAMBIGUOUS
configuration on each invocation: the 'unicodedata.east_asian_width'
check is kept intact to reduce startup cost.
author | FUJIWARA Katsunori <foozy@lares.dti.ne.jp> |
---|---|
date | Sat, 27 Aug 2011 04:56:12 +0900 |
parents | 24a6c3f903bb |
children | cc16323e748d 64fbd0de9773 |
comparison
equal
deleted
inserted
replaced
15065:24a6c3f903bb | 15066:24efa83d81cb |
---|---|
14 """ | 14 """ |
15 | 15 |
16 from i18n import _ | 16 from i18n import _ |
17 import error, osutil, encoding | 17 import error, osutil, encoding |
18 import errno, re, shutil, sys, tempfile, traceback | 18 import errno, re, shutil, sys, tempfile, traceback |
19 import os, time, calendar, textwrap, unicodedata, signal | 19 import os, time, calendar, textwrap, signal |
20 import imp, socket, urllib | 20 import imp, socket, urllib |
21 | 21 |
22 # Python compatibility | 22 # Python compatibility |
23 | 23 |
24 def sha1(s): | 24 def sha1(s): |
1133 | 1133 |
1134 # delay import of textwrap | 1134 # delay import of textwrap |
1135 def MBTextWrapper(**kwargs): | 1135 def MBTextWrapper(**kwargs): |
1136 class tw(textwrap.TextWrapper): | 1136 class tw(textwrap.TextWrapper): |
1137 """ | 1137 """ |
1138 Extend TextWrapper for double-width characters. | 1138 Extend TextWrapper for width-awareness. |
1139 | 1139 |
1140 Some Asian characters use two terminal columns instead of one. | 1140 Neither number of 'bytes' in any encoding nor 'characters' is |
1141 A good example of this behavior can be seen with u'\u65e5\u672c', | 1141 appropriate to calculate terminal columns for specified string. |
1142 the two Japanese characters for "Japan": | 1142 |
1143 len() returns 2, but when printed to a terminal, they eat 4 columns. | 1143 Original TextWrapper implementation uses built-in 'len()' directly, |
1144 | 1144 so overriding is needed to use width information of each characters. |
1145 (Note that this has nothing to do whatsoever with unicode | 1145 |
1146 representation, or encoding of the underlying string) | 1146 In addition, characters classified into 'ambiguous' width are |
1147 treated as wide in east asian area, but as narrow in other. | |
1148 | |
1149 This requires a user decision to determine the width of such characters. | |
1147 """ | 1150 """ |
1148 def __init__(self, **kwargs): | 1151 def __init__(self, **kwargs): |
1149 textwrap.TextWrapper.__init__(self, **kwargs) | 1152 textwrap.TextWrapper.__init__(self, **kwargs) |
1150 | 1153 |
1154 # for compatibility between 2.4 and 2.6 | |
1155 if getattr(self, 'drop_whitespace', None) is None: | |
1156 self.drop_whitespace = kwargs.get('drop_whitespace', True) | |
1157 | |
1151 def _cutdown(self, ucstr, space_left): | 1158 def _cutdown(self, ucstr, space_left): |
1152 l = 0 | 1159 l = 0 |
1153 colwidth = unicodedata.east_asian_width | 1160 colwidth = encoding.ucolwidth |
1154 for i in xrange(len(ucstr)): | 1161 for i in xrange(len(ucstr)): |
1155 l += colwidth(ucstr[i]) in 'WFA' and 2 or 1 | 1162 l += colwidth(ucstr[i]) |
1156 if space_left < l: | 1163 if space_left < l: |
1157 return (ucstr[:i], ucstr[i:]) | 1164 return (ucstr[:i], ucstr[i:]) |
1158 return ucstr, '' | 1165 return ucstr, '' |
1159 | 1166 |
1160 # overriding of base class | 1167 # overriding of base class |
1165 cut, res = self._cutdown(reversed_chunks[-1], space_left) | 1172 cut, res = self._cutdown(reversed_chunks[-1], space_left) |
1166 cur_line.append(cut) | 1173 cur_line.append(cut) |
1167 reversed_chunks[-1] = res | 1174 reversed_chunks[-1] = res |
1168 elif not cur_line: | 1175 elif not cur_line: |
1169 cur_line.append(reversed_chunks.pop()) | 1176 cur_line.append(reversed_chunks.pop()) |
1177 | |
1178 # this overriding code is imported from TextWrapper of python 2.6 | |
1179 # to calculate columns of string by 'encoding.ucolwidth()' | |
1180 def _wrap_chunks(self, chunks): | |
1181 colwidth = encoding.ucolwidth | |
1182 | |
1183 lines = [] | |
1184 if self.width <= 0: | |
1185 raise ValueError("invalid width %r (must be > 0)" % self.width) | |
1186 | |
1187 # Arrange in reverse order so items can be efficiently popped | |
1188 # from a stack of chunks. | |
1189 chunks.reverse() | |
1190 | |
1191 while chunks: | |
1192 | |
1193 # Start the list of chunks that will make up the current line. | |
1194 # cur_len is just the length of all the chunks in cur_line. | |
1195 cur_line = [] | |
1196 cur_len = 0 | |
1197 | |
1198 # Figure out which static string will prefix this line. | |
1199 if lines: | |
1200 indent = self.subsequent_indent | |
1201 else: | |
1202 indent = self.initial_indent | |
1203 | |
1204 # Maximum width for this line. | |
1205 width = self.width - len(indent) | |
1206 | |
1207 # First chunk on line is whitespace -- drop it, unless this | |
1208 # is the very beginning of the text (ie. no lines started yet). | |
1209 if self.drop_whitespace and chunks[-1].strip() == '' and lines: | |
1210 del chunks[-1] | |
1211 | |
1212 while chunks: | |
1213 l = colwidth(chunks[-1]) | |
1214 | |
1215 # Can at least squeeze this chunk onto the current line. | |
1216 if cur_len + l <= width: | |
1217 cur_line.append(chunks.pop()) | |
1218 cur_len += l | |
1219 | |
1220 # Nope, this line is full. | |
1221 else: | |
1222 break | |
1223 | |
1224 # The current line is full, and the next chunk is too big to | |
1225 # fit on *any* line (not just this one). | |
1226 if chunks and colwidth(chunks[-1]) > width: | |
1227 self._handle_long_word(chunks, cur_line, cur_len, width) | |
1228 | |
1229 # If the last chunk on this line is all whitespace, drop it. | |
1230 if (self.drop_whitespace and | |
1231 cur_line and cur_line[-1].strip() == ''): | |
1232 del cur_line[-1] | |
1233 | |
1234 # Convert current line back to a string and store it in list | |
1235 # of all lines (return value). | |
1236 if cur_line: | |
1237 lines.append(indent + ''.join(cur_line)) | |
1238 | |
1239 return lines | |
1170 | 1240 |
1171 global MBTextWrapper | 1241 global MBTextWrapper |
1172 MBTextWrapper = tw | 1242 MBTextWrapper = tw |
1173 return tw(**kwargs) | 1243 return tw(**kwargs) |
1174 | 1244 |