Mercurial > hg
changeset 3770:f96c158ea3a3
Add functions for transcoding and manipulating multibyte strings
author | Matt Mackall <mpm@selenic.com> |
---|---|
date | Sun, 03 Dec 2006 16:16:33 -0600 |
parents | 96095d9ff1f8 |
children | 29d91e57d055 |
files | mercurial/util.py |
diffstat | 1 files changed, 52 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/util.py Sun Dec 03 16:16:33 2006 -0600
+++ b/mercurial/util.py Sun Dec 03 16:16:33 2006 -0600
@@ -18,6 +18,80 @@
 demandload(globals(), "os threading time calendar ConfigParser locale")
 
 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding()
+_encodingmode = os.environ.get("HGENCODINGMODE", "strict")
+
+def tolocal(s):
+    """
+    Convert a string from internal UTF-8 to local encoding
+
+    All internal strings should be UTF-8 but some repos before the
+    implementation of locale support may contain latin1 or possibly
+    other character sets. We attempt to decode everything strictly
+    using UTF-8, then Latin-1, and failing that, we use UTF-8 and
+    replace unknown characters.
+
+    Characters that cannot be represented in the local encoding are
+    replaced rather than raising an error.
+    """
+    for e in "utf-8 latin1".split():
+        try:
+            u = s.decode(e) # attempt strict decoding
+            return u.encode(_encoding, "replace")
+        except UnicodeDecodeError:
+            pass
+    # NOTE: Latin-1 maps every byte value, so the loop above is expected
+    # to return before reaching this fallback; it is kept as a safeguard.
+    u = s.decode("utf-8", "replace") # last ditch
+    return u.encode(_encoding, "replace")
+
+def fromlocal(s):
+    """
+    Convert a string from the local character encoding to UTF-8
+
+    We attempt to decode strings using the encoding mode set by
+    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
+    characters will cause an error message. Other modes include
+    'replace', which replaces unknown characters with a special
+    Unicode character, and 'ignore', which drops the character.
+
+    Raises Abort, quoting the bytes around the failure, when
+    decoding fails.
+    """
+    try:
+        return s.decode(_encoding, _encodingmode).encode("utf-8")
+    except UnicodeDecodeError, inst:
+        # show up to 10 bytes of s on either side of the failure
+        sub = s[max(0, inst.start-10):inst.start+10]
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
+
+def locallen(s):
+    """Find the length in characters of a local string"""
+    # "replace" keeps undecodable input from raising UnicodeDecodeError
+    return len(s.decode(_encoding, "replace"))
+
+def localsub(s, a, b=None):
+    """
+    Slice a local string by character position rather than by byte
+
+    With one index, return the first a characters (s[:a]); with two,
+    return characters a up to but not including b (s[a:b]). The
+    string is decoded and re-encoded with the local encoding using
+    the HGENCODINGMODE error mode.
+
+    Raises Abort, quoting the bytes around the failure, when
+    decoding fails.
+    """
+    try:
+        u = s.decode(_encoding, _encodingmode)
+        if b is not None:
+            u = u[a:b]
+        else:
+            u = u[:a]
+        return u.encode(_encoding, _encodingmode)
+    except UnicodeDecodeError, inst:
+        # show up to 10 bytes of s on either side of the failure
+        sub = s[max(0, inst.start-10):inst.start+10]
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
 
 # used by parsedate
 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',