annotate mercurial/encoding.py @ 11297:d320e70442a5

replace Python standard textwrap by MBCS sensitive one for i18n text Mercurial has problem around text wrapping/filling in MBCS encoding environment, because standard 'textwrap' module of Python can not treat it correctly. It splits byte sequence for one character into two lines. According to unicode specification, "east asian width" classifies characters into: W(ide), N(arrow), F(ull-width), H(alf-width), A(mbiguous) W/N/F/H can be always recognized as 2/1/2/1 bytes in byte sequence, but 'A' can not. Size of 'A' depends on language in which it is used. Unicode specification says: If the context(= language) cannot be established reliably they should be treated as narrow characters by default but many of class 'A' characters are full-width, at least, in Japanese environment. So, this patch treats class 'A' characters as full-width always for safety wrapping. This patch focuses only on MBCS safe-ness, not on writing/printing rule strict wrapping for each languages MBCS sensitive textwrap class is originally implemented by ITO Nobuaki <daydream.trippers@gmail.com>.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Sun, 06 Jun 2010 17:20:10 +0900
parents 08a0f04b56bd
children 2be70ca17311
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
8226
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
1 # encoding.py - character transcoding support for Mercurial
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
2 #
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
4 #
8b2cd04a6e97 put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents: 8225
diff changeset
5 # This software may be used and distributed according to the terms of the
10263
25e572394f5c Update license to GPLv2+
Matt Mackall <mpm@selenic.com>
parents: 9574
diff changeset
6 # GNU General Public License version 2 or any later version.
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
7
8312
b87a50b7125c separate import lines from mercurial and general python modules
Simon Heimberg <simohe@besonet.ch>
parents: 8226
diff changeset
8 import error
b87a50b7125c separate import lines from mercurial and general python modules
Simon Heimberg <simohe@besonet.ch>
parents: 8226
diff changeset
9 import sys, unicodedata, locale, os
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
10
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
11 _encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
12
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
13 try:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
14 encoding = os.environ.get("HGENCODING")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
15 if sys.platform == 'darwin' and not encoding:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
16 # On darwin, getpreferredencoding ignores the locale environment and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
17 # always returns mac-roman. We override this if the environment is
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
18 # not C (has been customized by the user).
9574
9e9f63d5c456 encoding: fix issue with non-standard UTF-8 CTYPE on OS X
Dirkjan Ochtman <dirkjan@ochtman.nl>
parents: 8312
diff changeset
19 lc = locale.setlocale(locale.LC_CTYPE, '')
9e9f63d5c456 encoding: fix issue with non-standard UTF-8 CTYPE on OS X
Dirkjan Ochtman <dirkjan@ochtman.nl>
parents: 8312
diff changeset
20 if lc == 'UTF-8':
9e9f63d5c456 encoding: fix issue with non-standard UTF-8 CTYPE on OS X
Dirkjan Ochtman <dirkjan@ochtman.nl>
parents: 8312
diff changeset
21 locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
22 encoding = locale.getlocale()[1]
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
23 if not encoding:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
24 encoding = locale.getpreferredencoding() or 'ascii'
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
25 encoding = _encodingfixup.get(encoding, encoding)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
26 except locale.Error:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
27 encoding = 'ascii'
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
28 encodingmode = os.environ.get("HGENCODINGMODE", "strict")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
29 fallbackencoding = 'ISO-8859-1'
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
30
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
31 def tolocal(s):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
32 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
33 Convert a string from internal UTF-8 to local encoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
34
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
35 All internal strings should be UTF-8 but some repos before the
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
36 implementation of locale support may contain latin1 or possibly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
37 other character sets. We attempt to decode everything strictly
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
38 using UTF-8, then Latin-1, and failing that, we use UTF-8 and
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
39 replace unknown characters.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
40 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
41 for e in ('UTF-8', fallbackencoding):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
42 try:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
43 u = s.decode(e) # attempt strict decoding
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
44 return u.encode(encoding, "replace")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
45 except LookupError, k:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
46 raise error.Abort("%s, please check your locale settings" % k)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
47 except UnicodeDecodeError:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
48 pass
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
49 u = s.decode("utf-8", "replace") # last ditch
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
50 return u.encode(encoding, "replace")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
51
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
52 def fromlocal(s):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
53 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
54 Convert a string from the local character encoding to UTF-8
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
55
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
56 We attempt to decode strings using the encoding mode set by
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
57 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
58 characters will cause an error message. Other modes include
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
59 'replace', which replaces unknown characters with a special
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
60 Unicode character, and 'ignore', which drops the character.
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
61 """
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
62 try:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
63 return s.decode(encoding, encodingmode).encode("utf-8")
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
64 except UnicodeDecodeError, inst:
10282
08a0f04b56bd many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents: 10263
diff changeset
65 sub = s[max(0, inst.start - 10):inst.start + 10]
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
66 raise error.Abort("decoding near '%s': %s!" % (sub, inst))
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
67 except LookupError, k:
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
68 raise error.Abort("%s, please check your locale settings" % k)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
69
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
70 def colwidth(s):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
71 "Find the column width of a UTF-8 string for display"
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
72 d = s.decode(encoding, 'replace')
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
73 if hasattr(unicodedata, 'east_asian_width'):
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
74 w = unicodedata.east_asian_width
11297
d320e70442a5 replace Python standard textwrap by MBCS sensitive one for i18n text
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents: 10282
diff changeset
75 return sum([w(c) in 'WFA' and 2 or 1 for c in d])
7948
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
76 return len(d)
de377b1a9a84 move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff changeset
77