encoding: add 'trim' to trim multi-byte characters at most specified columns
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
Sun, 06 Jul 2014 02:56:41 +0900
changeset 21856 d24969ee272f
parent 21854 ba3bc6474bbf
child 21857 86c2d792a4b7
encoding: add 'trim' to trim multi-byte characters at most specified columns The newly added 'trim' correctly trims a string containing multi-byte characters to at most the specified number of columns: direct slicing of a byte sequence should be replaced with 'encoding.trim', because slicing may split the string in the middle of a multi-byte sequence. Slicing of the unicode sequence ('uslice') and concatenation with the ellipsis ('concat') are defined as functions, to make enhancements in a subsequent patch easier.
mercurial/encoding.py
--- a/mercurial/encoding.py	Fri Jul 11 18:04:51 2014 -0500
+++ b/mercurial/encoding.py	Sun Jul 06 02:56:41 2014 +0900
@@ -165,6 +165,76 @@
         if colwidth(t) == c:
             return t
 
+def trim(s, width, ellipsis=''):
+    """Trim string 's' to at most 'width' columns (including 'ellipsis').
+
+    Return 's' unchanged if it fits. Otherwise cut its tail and append
+    'ellipsis', or return just a prefix of 'ellipsis' if 'width' is too small.
+
+    >>> ellipsis = '+++'
+    >>> from mercurial import encoding
+    >>> encoding.encoding = 'utf-8'
+    >>> t = '1234567890'
+    >>> print trim(t, 12, ellipsis=ellipsis)
+    1234567890
+    >>> print trim(t, 10, ellipsis=ellipsis)
+    1234567890
+    >>> print trim(t, 8, ellipsis=ellipsis)
+    12345+++
+    >>> print trim(t, 8)
+    12345678
+    >>> print trim(t, 3, ellipsis=ellipsis)
+    +++
+    >>> print trim(t, 1, ellipsis=ellipsis)
+    +
+    >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns
+    >>> t = u.encode(encoding.encoding)
+    >>> print trim(t, 12, ellipsis=ellipsis)
+    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
+    >>> print trim(t, 10, ellipsis=ellipsis)
+    \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a
+    >>> print trim(t, 8, ellipsis=ellipsis)
+    \xe3\x81\x82\xe3\x81\x84+++
+    >>> print trim(t, 5)
+    \xe3\x81\x82\xe3\x81\x84
+    >>> print trim(t, 4, ellipsis=ellipsis)
+    +++
+    >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence
+    >>> print trim(t, 12, ellipsis=ellipsis)
+    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
+    >>> print trim(t, 10, ellipsis=ellipsis)
+    \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa
+    >>> print trim(t, 8, ellipsis=ellipsis)
+    \x11\x22\x33\x44\x55+++
+    >>> print trim(t, 8)
+    \x11\x22\x33\x44\x55\x66\x77\x88
+    >>> print trim(t, 3, ellipsis=ellipsis)
+    +++
+    >>> print trim(t, 1, ellipsis=ellipsis)
+    +
+    """
+    try:
+        u = s.decode(encoding)
+    except UnicodeDecodeError: # undecodable: count and slice raw bytes
+        if len(s) <= width: # trimming is not needed
+            return s
+        width -= len(ellipsis)
+        if width <= 0: # not enough room even for ellipsis
+            return ellipsis[:max(width + len(ellipsis), 0)] # '' if 'width' < 0
+        return s[:width] + ellipsis
+    if ucolwidth(u) <= width: # trimming is not needed
+        return s
+    width -= len(ellipsis)
+    if width <= 0: # not enough room even for ellipsis
+        return ellipsis[:max(width + len(ellipsis), 0)] # '' if 'width' < 0
+    uslice = lambda i: u[:-i]
+    concat = lambda s: s + ellipsis
+    for i in xrange(1, len(u)): # drop one more trailing character each time
+        usub = uslice(i)
+        if ucolwidth(usub) <= width:
+            return concat(usub.encode(encoding))
+    return ellipsis # not enough room for multi-column characters
+
 def lower(s):
     "best-effort encoding-aware case-folding of local string s"
     try: