Mercurial > hg
annotate mercurial/encoding.py @ 24523:a4b81dbe73c1
dirstate._walkexplicit: don't bother normalizing '.'
The overwhelmingly common case is running commands like 'hg diff' with no
arguments. Therefore the only file that'll be listed is the root directory.
Normalizing that's just a waste of time.
This means that for a plain 'hg diff' we'll never need to construct the
foldmap, saving us a significant chunk of time.
On case-insensitive HFS+ on OS X, for a large repository with over 200,000
files, this brings down 'hg diff' from 2.97 seconds to 2.36.
author | Siddharth Agarwal <sid0@fb.com> |
---|---|
date | Sun, 29 Mar 2015 18:28:48 -0700 |
parents | 885bd7c5c7e3 |
children | ac08de78de7f |
rev | line source |
---|---|
8226
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
1 # encoding.py - character transcoding support for Mercurial |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
2 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
3 # Copyright 2005-2009 Matt Mackall <mpm@selenic.com> and others |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
4 # |
8b2cd04a6e97
put license and copyright info into comment blocks
Martin Geisler <mg@lazybytes.net>
parents:
8225
diff
changeset
|
5 # This software may be used and distributed according to the terms of the |
10263 | 6 # GNU General Public License version 2 or any later version. |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
7 |
22973
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
8 import error |
12062
c327bfa5e831
cleanup: remove unused imports
Brodie Rao <brodie@bitheap.org>
parents:
11892
diff
changeset
|
9 import unicodedata, locale, os |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
10 |
23596
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
11 # These unicode characters are ignored by HFS+ (Apple Technote 1150, |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
12 # "Unicode Subtleties"), so we need to ignore them in some places for |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
13 # sanity. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
14 _ignore = [unichr(int(x, 16)).encode("utf-8") for x in |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
15 "200c 200d 200e 200f 202a 202b 202c 202d 202e " |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
16 "206a 206b 206c 206d 206e 206f feff".split()] |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
17 # verify the next function will work |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
18 assert set([i[0] for i in _ignore]) == set(["\xe2", "\xef"]) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
19 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
20 def hfsignoreclean(s): |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
21 """Remove codepoints ignored by HFS+ from s. |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
22 |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
23 >>> hfsignoreclean(u'.h\u200cg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
24 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
25 >>> hfsignoreclean(u'.h\ufeffg'.encode('utf-8')) |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
26 '.hg' |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
27 """ |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
28 if "\xe2" in s or "\xef" in s: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
29 for c in _ignore: |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
30 s = s.replace(c, '') |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
31 return s |
885bd7c5c7e3
encoding: add hfsignoreclean to clean out HFS-ignored characters
Augie Fackler <raf@durin42.com>
parents:
22973
diff
changeset
|
32 |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
33 def _getpreferredencoding(): |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
34 ''' |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
35 On darwin, getpreferredencoding ignores the locale environment and |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
36 always returns mac-roman. http://bugs.python.org/issue6202 fixes this |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
37 for Python 2.7 and up. This is the same corrected code for earlier |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
38 Python versions. |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
39 |
12770
614f0d8724ab
check-code: find trailing whitespace
Martin Geisler <mg@lazybytes.net>
parents:
12062
diff
changeset
|
40 However, we can't use a version check for this method, as some distributions |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
41 patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
42 encoding, as it is unlikely that this encoding is the actually expected. |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
43 ''' |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
44 try: |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
45 locale.CODESET |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
46 except AttributeError: |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
47 # Fall back to parsing environment variables :-( |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
48 return locale.getdefaultlocale()[1] |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
49 |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
50 oldloc = locale.setlocale(locale.LC_CTYPE) |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
51 locale.setlocale(locale.LC_CTYPE, "") |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
52 result = locale.nl_langinfo(locale.CODESET) |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
53 locale.setlocale(locale.LC_CTYPE, oldloc) |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
54 |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
55 return result |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
56 |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
57 _encodingfixers = { |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
58 '646': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
59 'ANSI_X3.4-1968': lambda: 'ascii', |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
60 'mac-roman': _getpreferredencoding |
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
61 } |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
62 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
63 try: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
64 encoding = os.environ.get("HGENCODING") |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
65 if not encoding: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
66 encoding = locale.getpreferredencoding() or 'ascii' |
11892
2be70ca17311
encoding: improve handling of buggy getpreferredencoding() on Mac OS X
Dan Villiom Podlaski Christiansen <danchr@gmail.com>
parents:
11297
diff
changeset
|
67 encoding = _encodingfixers.get(encoding, lambda: encoding)() |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
68 except locale.Error: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
69 encoding = 'ascii' |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
70 encodingmode = os.environ.get("HGENCODINGMODE", "strict") |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
71 fallbackencoding = 'ISO-8859-1' |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
72 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
73 class localstr(str): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
74 '''This class allows strings that are unmodified to be |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
75 round-tripped to the local encoding and back''' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
76 def __new__(cls, u, l): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
77 s = str.__new__(cls, l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
78 s._utf8 = u |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
79 return s |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
80 def __hash__(self): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
81 return hash(self._utf8) # avoid collisions in local string space |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
82 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
83 def tolocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
84 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
85 Convert a string from internal UTF-8 to local encoding |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
86 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
87 All internal strings should be UTF-8 but some repos before the |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
88 implementation of locale support may contain latin1 or possibly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
89 other character sets. We attempt to decode everything strictly |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
90 using UTF-8, then Latin-1, and failing that, we use UTF-8 and |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
91 replace unknown characters. |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
92 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
93 The localstr class is used to cache the known UTF-8 encoding of |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
94 strings next to their local representation to allow lossless |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
95 round-trip conversion back to UTF-8. |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
96 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
97 >>> u = 'foo: \\xc3\\xa4' # utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
98 >>> l = tolocal(u) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
99 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
100 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
101 >>> fromlocal(l) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
102 'foo: \\xc3\\xa4' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
103 >>> u2 = 'foo: \\xc3\\xa1' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
104 >>> d = { l: 1, tolocal(u2): 2 } |
18378
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
105 >>> len(d) # no collision |
404feac78b8a
tests: stabilize doctest output
Mads Kiilerich <mads@kiilerich.com>
parents:
17424
diff
changeset
|
106 2 |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
107 >>> 'foo: ?' in d |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
108 False |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
109 >>> l1 = 'foo: \\xe4' # historical latin1 fallback |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
110 >>> l = tolocal(l1) |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
111 >>> l |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
112 'foo: ?' |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
113 >>> fromlocal(l) # magically in utf-8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
114 'foo: \\xc3\\xa4' |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
115 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
116 |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
117 try: |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
118 try: |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
119 # make sure string is actually stored in UTF-8 |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
120 u = s.decode('UTF-8') |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
121 if encoding == 'UTF-8': |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
122 # fast path |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
123 return s |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
124 r = u.encode(encoding, "replace") |
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
125 if u == r.decode(encoding): |
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
126 # r is a safe, non-lossy encoding of s |
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
127 return r |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
128 return localstr(s, r) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
129 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
130 # we should only get here if we're looking at an ancient changeset |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
131 try: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
132 u = s.decode(fallbackencoding) |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
133 r = u.encode(encoding, "replace") |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
134 if u == r.decode(encoding): |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
135 # r is a safe, non-lossy encoding of s |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
136 return r |
13940
b7b26e54e37a
encoding: avoid localstr when a string can be encoded losslessly (issue2763)
Matt Mackall <mpm@selenic.com>
parents:
13051
diff
changeset
|
137 return localstr(u.encode('UTF-8'), r) |
16274
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
138 except UnicodeDecodeError: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
139 u = s.decode("utf-8", "replace") # last ditch |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
140 return u.encode(encoding, "replace") # can't round-trip |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
141 except LookupError, k: |
5d75eb8568d1
encoding: tune fast-path of tolocal a bit
Matt Mackall <mpm@selenic.com>
parents:
16133
diff
changeset
|
142 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
143 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
144 def fromlocal(s): |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
145 """ |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
146 Convert a string from the local character encoding to UTF-8 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
147 |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
148 We attempt to decode strings using the encoding mode set by |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
149 HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
150 characters will cause an error message. Other modes include |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
151 'replace', which replaces unknown characters with a special |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
152 Unicode character, and 'ignore', which drops the character. |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
153 """ |
13046
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
154 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
155 # can we do a lossless round-trip? |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
156 if isinstance(s, localstr): |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
157 return s._utf8 |
7cc4263e07a9
encoding: add localstr class to track UTF-8 version of transcoded strings
Matt Mackall <mpm@selenic.com>
parents:
12866
diff
changeset
|
158 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
159 try: |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
160 return s.decode(encoding, encodingmode).encode("utf-8") |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
161 except UnicodeDecodeError, inst: |
10282
08a0f04b56bd
many, many trivial check-code fixups
Matt Mackall <mpm@selenic.com>
parents:
10263
diff
changeset
|
162 sub = s[max(0, inst.start - 10):inst.start + 10] |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
163 raise error.Abort("decoding near '%s': %s!" % (sub, inst)) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
164 except LookupError, k: |
15769
afdf4f5bac61
encoding: use hint markup for "please check your locale settings"
Mads Kiilerich <mads@kiilerich.com>
parents:
15672
diff
changeset
|
165 raise error.Abort(k, hint="please check your locale settings") |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
166 |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
167 # How to treat ambiguous-width characters. Set to 'wide' to treat as wide. |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
168 wide = (os.environ.get("HGENCODINGAMBIGUOUS", "narrow") == "wide" |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
169 and "WFA" or "WF") |
12866
eddc20306ab6
encoding: default ambiguous character to narrow
Matt Mackall <mpm@selenic.com>
parents:
12770
diff
changeset
|
170 |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
171 def colwidth(s): |
15142
176882876780
encoding: colwidth input is in the local encoding
Matt Mackall <mpm@selenic.com>
parents:
15066
diff
changeset
|
172 "Find the column width of a string for display in the local encoding" |
15066
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
173 return ucolwidth(s.decode(encoding, 'replace')) |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
174 |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
175 def ucolwidth(d): |
24efa83d81cb
i18n: calculate terminal columns by width information of each characters
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
14951
diff
changeset
|
176 "Find the column width of a Unicode string for display" |
14951
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
177 eaw = getattr(unicodedata, 'east_asian_width', None) |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
178 if eaw is not None: |
61807854004e
encoding: use getattr isntead of hasattr
Augie Fackler <durin42@gmail.com>
parents:
14069
diff
changeset
|
179 return sum([eaw(c) in wide and 2 or 1 for c in d]) |
7948
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
180 return len(d) |
de377b1a9a84
move encoding bits from util to encoding
Matt Mackall <mpm@selenic.com>
parents:
diff
changeset
|
181 |
15143
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
182 def getcols(s, start, c): |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
183 '''Use colwidth to find a c-column substring of s starting at byte |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
184 index start''' |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
185 for x in xrange(start + c, len(s)): |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
186 t = s[start:x] |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
187 if colwidth(t) == c: |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
188 return t |
16c129b0f465
encoding: add getcols to extract substrings based on column width
Matt Mackall <mpm@selenic.com>
parents:
15142
diff
changeset
|
189 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
190 def trim(s, width, ellipsis='', leftside=False): |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
191 """Trim string 's' to at most 'width' columns (including 'ellipsis'). |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
192 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
193 If 'leftside' is True, left side of string 's' is trimmed. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
194 'ellipsis' is always placed at trimmed side. |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
195 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
196 >>> ellipsis = '+++' |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
197 >>> from mercurial import encoding |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
198 >>> encoding.encoding = 'utf-8' |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
199 >>> t= '1234567890' |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
200 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
201 1234567890 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
202 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
203 1234567890 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
204 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
205 12345+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
206 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
207 +++67890 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
208 >>> print trim(t, 8) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
209 12345678 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
210 >>> print trim(t, 8, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
211 34567890 |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
212 >>> print trim(t, 3, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
213 +++ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
214 >>> print trim(t, 1, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
215 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
216 >>> u = u'\u3042\u3044\u3046\u3048\u304a' # 2 x 5 = 10 columns |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
217 >>> t = u.encode(encoding.encoding) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
218 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
219 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
220 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
221 \xe3\x81\x82\xe3\x81\x84\xe3\x81\x86\xe3\x81\x88\xe3\x81\x8a |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
222 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
223 \xe3\x81\x82\xe3\x81\x84+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
224 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
225 +++\xe3\x81\x88\xe3\x81\x8a |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
226 >>> print trim(t, 5) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
227 \xe3\x81\x82\xe3\x81\x84 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
228 >>> print trim(t, 5, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
229 \xe3\x81\x88\xe3\x81\x8a |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
230 >>> print trim(t, 4, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
231 +++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
232 >>> print trim(t, 4, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
233 +++ |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
234 >>> t = '\x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa' # invalid byte sequence |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
235 >>> print trim(t, 12, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
236 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
237 >>> print trim(t, 10, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
238 \x11\x22\x33\x44\x55\x66\x77\x88\x99\xaa |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
239 >>> print trim(t, 8, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
240 \x11\x22\x33\x44\x55+++ |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
241 >>> print trim(t, 8, ellipsis=ellipsis, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
242 +++\x66\x77\x88\x99\xaa |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
243 >>> print trim(t, 8) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
244 \x11\x22\x33\x44\x55\x66\x77\x88 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
245 >>> print trim(t, 8, leftside=True) |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
246 \x33\x44\x55\x66\x77\x88\x99\xaa |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
247 >>> print trim(t, 3, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
248 +++ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
249 >>> print trim(t, 1, ellipsis=ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
250 + |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
251 """ |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
252 try: |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
253 u = s.decode(encoding) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
254 except UnicodeDecodeError: |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
255 if len(s) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
256 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
257 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
258 if width <= 0: # no enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
259 return ellipsis[:width + len(ellipsis)] |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
260 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
261 return ellipsis + s[-width:] |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
262 return s[:width] + ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
263 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
264 if ucolwidth(u) <= width: # trimming is not needed |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
265 return s |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
266 |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
267 width -= len(ellipsis) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
268 if width <= 0: # no enough room even for ellipsis |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
269 return ellipsis[:width + len(ellipsis)] |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
270 |
21861
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
271 if leftside: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
272 uslice = lambda i: u[i:] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
273 concat = lambda s: ellipsis + s |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
274 else: |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
275 uslice = lambda i: u[:-i] |
b515c3a63e96
encoding: add 'leftside' argument into 'trim' to switch trimming side
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
21856
diff
changeset
|
276 concat = lambda s: s + ellipsis |
21856
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
277 for i in xrange(1, len(u)): |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
278 usub = uslice(i) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
279 if ucolwidth(usub) <= width: |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
280 return concat(usub.encode(encoding)) |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
281 return ellipsis # no enough room for multi-column characters |
d24969ee272f
encoding: add 'trim' to trim multi-byte characters at most specified columns
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
18378
diff
changeset
|
282 |
22973
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
283 def _asciilower(s): |
22778
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
284 '''convert a string to lowercase if ASCII |
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
285 |
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
286 Raises UnicodeDecodeError if non-ASCII characters are found.''' |
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
287 s.decode('ascii') |
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
288 return s.lower() |
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
289 |
22973
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
290 def asciilower(s): |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
291 # delay importing avoids cyclic dependency around "parsers" in |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
292 # pure Python build (util => i18n => encoding => parsers => util) |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
293 import parsers |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
294 impl = getattr(parsers, 'asciilower', _asciilower) |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
295 global asciilower |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
296 asciilower = impl |
bcff9ecdaae0
encoding: avoid cyclic dependency around "parsers" in pure Python build
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
22779
diff
changeset
|
297 return impl(s) |
22778
80f2b63dd83a
parsers: add a function to efficiently lowercase ASCII strings
Siddharth Agarwal <sid0@fb.com>
parents:
22426
diff
changeset
|
298 |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
299 def lower(s): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
300 "best-effort encoding-aware case-folding of local string s" |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
301 try: |
22779
d9585dda63c3
encoding.lower: use fast ASCII lower
Siddharth Agarwal <sid0@fb.com>
parents:
22778
diff
changeset
|
302 return asciilower(s) |
17235
3745ae495ce5
encoding: use s.decode to trigger UnicodeDecodeError
Martin Geisler <mg@aragost.com>
parents:
16493
diff
changeset
|
303 except UnicodeDecodeError: |
16387
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
304 pass |
c481761033bd
encoding: add fast-path for ASCII lowercase
Matt Mackall <mpm@selenic.com>
parents:
16274
diff
changeset
|
305 try: |
14069
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
306 if isinstance(s, localstr): |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
307 u = s._utf8.decode("utf-8") |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
308 else: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
309 u = s.decode(encoding, encodingmode) |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
310 |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
311 lu = u.lower() |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
312 if u == lu: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
313 return s # preserve localstring |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
314 return lu.encode(encoding) |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
315 except UnicodeError: |
e38846a79a23
encoding: add an encoding-aware lower function
Matt Mackall <mpm@selenic.com>
parents:
13940
diff
changeset
|
316 return s.lower() # we don't know how to fold this except in ASCII |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
317 except LookupError, k: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
318 raise error.Abort(k, hint="please check your locale settings") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
319 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
320 def upper(s): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
321 "best-effort encoding-aware case-folding of local string s" |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
322 try: |
17236
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
323 s.decode('ascii') # throw exception for non-ASCII character |
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
324 return s.upper() |
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
325 except UnicodeDecodeError: |
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
326 pass |
9fb8312dbdbd
encoding: add fast-path for ASCII uppercase.
Martin Geisler <mg@aragost.com>
parents:
17235
diff
changeset
|
327 try: |
15672
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
328 if isinstance(s, localstr): |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
329 u = s._utf8.decode("utf-8") |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
330 else: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
331 u = s.decode(encoding, encodingmode) |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
332 |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
333 uu = u.upper() |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
334 if u == uu: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
335 return s # preserve localstring |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
336 return uu.encode(encoding) |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
337 except UnicodeError: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
338 return s.upper() # we don't know how to fold this except in ASCII |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
339 except LookupError, k: |
2ebe3d0ce91d
i18n: use encoding.lower/upper for encoding aware case folding
FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
parents:
15143
diff
changeset
|
340 raise error.Abort(k, hint="please check your locale settings") |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
341 |
22426
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
342 _jsonmap = {} |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
343 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
344 def jsonescape(s): |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
345 '''returns a string suitable for JSON |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
346 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
347 JSON is problematic for us because it doesn't support non-Unicode |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
348 bytes. To deal with this, we take the following approach: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
349 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
350 - localstr objects are converted back to UTF-8 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
351 - valid UTF-8/ASCII strings are passed as-is |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
352 - other strings are converted to UTF-8b surrogate encoding |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
353 - apply JSON-specified string escaping |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
354 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
355 (escapes are doubled in these tests) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
356 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
357 >>> jsonescape('this is a test') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
358 'this is a test' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
359 >>> jsonescape('escape characters: \\0 \\x0b \\t \\n \\r \\" \\\\') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
360 'escape characters: \\\\u0000 \\\\u000b \\\\t \\\\n \\\\r \\\\" \\\\\\\\' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
361 >>> jsonescape('a weird byte: \\xdd') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
362 'a weird byte: \\xed\\xb3\\x9d' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
363 >>> jsonescape('utf-8: caf\\xc3\\xa9') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
364 'utf-8: caf\\xc3\\xa9' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
365 >>> jsonescape('') |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
366 '' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
367 ''' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
368 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
369 if not _jsonmap: |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
370 for x in xrange(32): |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
371 _jsonmap[chr(x)] = "\u%04x" %x |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
372 for x in xrange(32, 256): |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
373 c = chr(x) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
374 _jsonmap[c] = c |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
375 _jsonmap['\t'] = '\\t' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
376 _jsonmap['\n'] = '\\n' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
377 _jsonmap['\"'] = '\\"' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
378 _jsonmap['\\'] = '\\\\' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
379 _jsonmap['\b'] = '\\b' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
380 _jsonmap['\f'] = '\\f' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
381 _jsonmap['\r'] = '\\r' |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
382 |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
383 return ''.join(_jsonmap[c] for c in toutf8b(s)) |
f6b533e64ed6
encoding: add json escaping filter
Matt Mackall <mpm@selenic.com>
parents:
22425
diff
changeset
|
384 |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
385 def toutf8b(s): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
386 '''convert a local, possibly-binary string into UTF-8b |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
387 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
388 This is intended as a generic method to preserve data when working |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
389 with schemes like JSON and XML that have no provision for |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
390 arbitrary byte strings. As Mercurial often doesn't know |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
391 what encoding data is in, we use so-called UTF-8b. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
392 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
393 If a string is already valid UTF-8 (or ASCII), it passes unmodified. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
394 Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
395 uDC00-uDCFF. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
396 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
397 Principles of operation: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
398 |
17424
e7cfe3587ea4
fix trivial spelling errors
Mads Kiilerich <mads@kiilerich.com>
parents:
17236
diff
changeset
|
399 - ASCII and UTF-8 data successfully round-trips and is understood |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
400 by Unicode-oriented clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
401 - filenames and file contents in arbitrary other encodings can have |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
402 be round-tripped or recovered by clueful clients |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
403 - local strings that have a cached known UTF-8 encoding (aka |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
404 localstr) get sent as UTF-8 so Unicode-oriented clients get the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
405 Unicode data they want |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
406 - because we must preserve UTF-8 bytestring in places such as |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
407 filenames, metadata can't be roundtripped without help |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
408 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
409 (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
410 arbitrary bytes into an internal Unicode format that can be |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
411 re-encoded back into the original. Here we are exposing the |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
412 internal surrogate encoding as a UTF-8 string.) |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
413 ''' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
414 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
415 if isinstance(s, localstr): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
416 return s._utf8 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
417 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
418 try: |
22425
6fd944c204a9
encoding: handle empty string in toutf8
Matt Mackall <mpm@selenic.com>
parents:
21861
diff
changeset
|
419 s.decode('utf-8') |
6fd944c204a9
encoding: handle empty string in toutf8
Matt Mackall <mpm@selenic.com>
parents:
21861
diff
changeset
|
420 return s |
16133
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
421 except UnicodeDecodeError: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
422 # surrogate-encode any characters that don't round-trip |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
423 s2 = s.decode('utf-8', 'ignore').encode('utf-8') |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
424 r = "" |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
425 pos = 0 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
426 for c in s: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
427 if s2[pos:pos + 1] == c: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
428 r += c |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
429 pos += 1 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
430 else: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
431 r += unichr(0xdc00 + ord(c)).encode('utf-8') |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
432 return r |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
433 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
434 def fromutf8b(s): |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
435 '''Given a UTF-8b string, return a local, possibly-binary string. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
436 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
437 return the original binary string. This |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
438 is a round-trip process for strings like filenames, but metadata |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
439 that's was passed through tolocal will remain in UTF-8. |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
440 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
441 >>> m = "\\xc3\\xa9\\x99abcd" |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
442 >>> n = toutf8b(m) |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
443 >>> n |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
444 '\\xc3\\xa9\\xed\\xb2\\x99abcd' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
445 >>> fromutf8b(n) == m |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
446 True |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
447 ''' |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
448 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
449 # fast path - look for uDxxx prefixes in s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
450 if "\xed" not in s: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
451 return s |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
452 |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
453 u = s.decode("utf-8") |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
454 r = "" |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
455 for c in u: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
456 if ord(c) & 0xff00 == 0xdc00: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
457 r += chr(ord(c) & 0xff) |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
458 else: |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
459 r += c.encode("utf-8") |
84c58da3a1f8
encoding: introduce utf8-b helpers
Matt Mackall <mpm@selenic.com>
parents:
15769
diff
changeset
|
460 return r |