comparison mercurial/encoding.py @ 21856:d24969ee272f

encoding: add 'trim' to trim multi-byte strings to at most the specified number of columns. The newly added 'trim' correctly trims strings containing multi-byte characters to at most the specified number of columns: direct slicing of a byte sequence should be replaced with 'encoding.trim', because the former may split the string in the middle of a multi-byte sequence. Slicing of the unicode sequence ('uslice') and concatenation with the ellipsis ('concat') are defined as functions, to make enhancement in a subsequent patch easier.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sun, 06 Jul 2014 02:56:41 +0900
parents 404feac78b8a
children b515c3a63e96
comparison
equal deleted inserted replaced
21854:ba3bc6474bbf 21856:d24969ee272f
163 for x in xrange(start + c, len(s)): 163 for x in xrange(start + c, len(s)):
164 t = s[start:x] 164 t = s[start:x]
165 if colwidth(t) == c: 165 if colwidth(t) == c:
166 return t 166 return t
167 167
def trim(s, width, ellipsis=''):
    """Trim string 's' to at most 'width' columns (including 'ellipsis').

    's' is decoded with the module-level 'encoding'. If decoding succeeds,
    whole characters are dropped from the end and the result is measured
    with 'ucolwidth', so a multi-byte sequence is never split. If decoding
    fails, 's' is trimmed per byte instead, with len() as its width.

    'ellipsis' is appended only when 's' is actually trimmed, and its own
    size is measured with len(). If 'width' leaves no room for anything
    but 'ellipsis', only the leading part of 'ellipsis' that fits is
    returned; for a zero or negative 'width' nothing of it is kept.

    >>> ellipsis = '+++'
    >>> from mercurial import encoding
    >>> encoding.encoding = 'utf-8'
    >>> t = '1234567890'
    >>> print trim(t, 12, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 10, ellipsis=ellipsis)
    1234567890
    >>> print trim(t, 8, ellipsis=ellipsis)
    12345+++
    >>> print trim(t, 8)
    12345678
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    >>> print trim(t, 0, ellipsis=ellipsis)
    <BLANKLINE>
    >>> print trim(t, -1, ellipsis=ellipsis)
    <BLANKLINE>
    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
    >>> t = u.encode(encoding.encoding)
    >>> print trim(t, 12, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 10, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
    >>> print trim(t, 8, ellipsis=ellipsis)
    \xe3\x81\x82\xe3\x81\x84+++
    >>> print trim(t, 5)
    \xe3\x81\x82\xe3\x81\x84
    >>> print trim(t, 4, ellipsis=ellipsis)
    +++
    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
    >>> print trim(t, 12, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 10, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
    >>> print trim(t, 8, ellipsis=ellipsis)
    \x11\x22\x33\x44\x55+++
    >>> print trim(t, 8)
    \x11\x22\x33\x44\x55\x66\x77\x88
    >>> print trim(t, 3, ellipsis=ellipsis)
    +++
    >>> print trim(t, 1, ellipsis=ellipsis)
    +
    """
    try:
        u = s.decode(encoding)
    except UnicodeDecodeError:
        # 's' is not valid in the local encoding: trim it per byte
        if len(s) <= width: # trimming is not needed
            return s
        width -= len(ellipsis)
        if width <= 0: # not enough room even for ellipsis
            # 'width + len(ellipsis)' is the width the caller asked for;
            # clamp it at 0 so that a negative value is not taken as a
            # from-the-end slice (e.g. '+++'[:-1] == '++')
            return ellipsis[:max(width + len(ellipsis), 0)]
        return s[:width] + ellipsis

    if ucolwidth(u) <= width: # trimming is not needed
        return s

    width -= len(ellipsis)
    if width <= 0: # not enough room even for ellipsis
        # same clamping as in the undecodable case above
        return ellipsis[:max(width + len(ellipsis), 0)]

    # kept as separate functions so that slicing and concatenation can be
    # varied independently of the loop below
    uslice = lambda i: u[:-i]
    concat = lambda s: s + ellipsis
    # drop one more trailing character at a time until the rest fits
    for i in xrange(1, len(u)):
        usub = uslice(i)
        if ucolwidth(usub) <= width:
            return concat(usub.encode(encoding))
    return ellipsis # not enough room for multi-column characters
237
168 def lower(s): 238 def lower(s):
169 "best-effort encoding-aware case-folding of local string s" 239 "best-effort encoding-aware case-folding of local string s"
170 try: 240 try:
171 s.decode('ascii') # throw exception for non-ASCII character 241 s.decode('ascii') # throw exception for non-ASCII character
172 return s.lower() 242 return s.lower()