Add functions for transcoding and manipulating multibyte strings
author Matt Mackall <mpm@selenic.com>
Sun, 03 Dec 2006 16:16:33 -0600
changeset 3770 f96c158ea3a3
parent 3769 96095d9ff1f8
child 3771 29d91e57d055
Add functions for transcoding and manipulating multibyte strings
mercurial/util.py
--- a/mercurial/util.py	Sun Dec 03 16:16:33 2006 -0600
+++ b/mercurial/util.py	Sun Dec 03 16:16:33 2006 -0600
@@ -18,6 +18,58 @@
 demandload(globals(), "os threading time calendar ConfigParser locale")
 
 _encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding()
+_encodingmode = os.environ.get("HGENCODINGMODE", "strict") # codec error mode
+
+def tolocal(s):
+    """
+    Convert a string from internal UTF-8 to the local encoding
+
+    Internal strings should be UTF-8, but repos predating locale
+    support may hold latin1 or other character sets. We decode
+    strictly as UTF-8, then Latin-1, and as a last resort as UTF-8
+    with unknown bytes replaced. Characters the local encoding
+    cannot represent are replaced rather than raising an error.
+    """
+    for e in "utf-8 latin1".split():
+        try:
+            u = s.decode(e) # attempt strict decoding
+            return u.encode(_encoding, "replace")
+        except UnicodeDecodeError:
+            pass # fall through to the next candidate encoding
+    u = s.decode("utf-8", "replace") # last ditch
+    return u.encode(_encoding, "replace")
+
+def fromlocal(s):
+    """
+    Convert a string from the local character encoding to UTF-8
+
+    We attempt to decode strings using the encoding mode set by
+    HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown
+    characters will cause an error message. Other modes include
+    'replace', which replaces unknown characters with a special
+    Unicode character, and 'ignore', which drops the character.
+    """
+    try:
+        return s.decode(_encoding, _encodingmode).encode("utf-8")
+    except UnicodeDecodeError, inst:
+        sub = s[max(0, inst.start-10):inst.start+10] # bytes around the error
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
+
+def locallen(s):
+    """Count characters in a local string; undecodable bytes are replaced"""
+    return len(s.decode(_encoding, "replace"))
+
+def localsub(s, a, b=None):
+    """Slice local string s by characters: s[a:b], or s[:a] if b is None"""
+    try:
+        u = s.decode(_encoding, _encodingmode)
+        if b is None:
+            a, b = 0, a # a single bound means "the first a characters"
+        u = u[a:b]
+        return u.encode(_encoding, _encodingmode)
+    except UnicodeDecodeError, inst:
+        sub = s[max(0, inst.start-10):inst.start+10] # bytes around the error
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
 
 # used by parsedate
 defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',