--- a/mercurial/util.py Sun Dec 03 16:16:33 2006 -0600
+++ b/mercurial/util.py Sun Dec 03 16:16:33 2006 -0600
@@ -18,6 +18,58 @@
demandload(globals(), "os threading time calendar ConfigParser locale")
_encoding = os.environ.get("HGENCODING") or locale.getpreferredencoding()
+_encodingmode = os.environ.get("HGENCODINGMODE", "strict") # codec error mode
+
+def tolocal(s):
+    """
+    Re-encode an internal (nominally UTF-8) string for local display
+
+    Strict decoding is tried with UTF-8 first and Latin-1 second,
+    because repositories older than locale support may hold latin1
+    or other character sets. If neither succeeds, the string is
+    decoded as UTF-8 with undecodable bytes replaced. The result is
+    encoded to _encoding, replacing unencodable characters.
+    """
+    for codec in ("utf-8", "latin1"):
+        try:
+            # strict decode; the first codec that accepts s wins
+            return s.decode(codec).encode(_encoding, "replace")
+        except UnicodeDecodeError:
+            continue
+    # last ditch (latin1 should accept any bytes, so this is a safety net)
+    return s.decode("utf-8", "replace").encode(_encoding, "replace")
+
+def fromlocal(s):
+    """
+    Re-encode a string from the local character encoding as UTF-8
+
+    Decoding honours the error mode taken from the HGENCODINGMODE
+    environment variable (default 'strict'). In 'strict' mode an
+    undecodable byte raises Abort, quoting the text around it; the
+    'replace' and 'ignore' modes substitute or drop it instead.
+    """
+    try:
+        decoded = s.decode(_encoding, _encodingmode)
+        return decoded.encode("utf-8")
+    except UnicodeDecodeError, err:
+        context = s[max(0, err.start - 10):err.start + 10]
+        raise Abort("decoding near '%s': %s!\n" % (context, err))
+
+def locallen(s):
+    """Return the number of characters (not bytes) in a local string"""
+    chars = s.decode(_encoding, "replace")
+    return len(chars)
+
+def localsub(s, a, b=None):
+    """
+    Slice a local string by characters rather than bytes
+
+    The string is decoded from _encoding using _encodingmode, sliced,
+    and encoded back to _encoding. With both a and b the result is
+    characters [a:b]; with only a it is the first a characters [:a].
+
+    Raises Abort, quoting the text around the offending byte, if s
+    cannot be decoded.
+    """
+    try:
+        u = s.decode(_encoding, _encodingmode)
+        if b is not None:
+            u = u[a:b]
+        else:
+            u = u[:a]
+        return u.encode(_encoding, _encodingmode)
+    except UnicodeDecodeError, inst:
+        # slice (":"), not a tuple index (","): indexing a string with a
+        # tuple raises TypeError and would hide the Abort below
+        sub = s[max(0, inst.start-10):inst.start+10]
+        raise Abort("decoding near '%s': %s!\n" % (sub, inst))
# used by parsedate
defaultdateformats = ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M',