changeset 11892:2be70ca17311 stable

encoding: improve handling of buggy getpreferredencoding() on Mac OS X

Prior to version 2.7, calling locale.getpreferredencoding() would always return 'mac-roman' on Mac OS X. Previously, this was handled by a call to locale.setlocale(). Unfortunately, Python 2.6.5 and older have a bug where isspace() would incorrectly report True for 0x85 and 0xa0 after such a call.

In order to fix this, we replace the previous _encodingfixup mapping with an _encodingfixers mapping. Rather than mapping encodings to their replacement, it maps them to a function returning the replacement. This allows us to provide a simplified implementation of getpreferredencoding() which extracts the expected encoding and restores the locale.

This fix is based on a patch originally submitted by Martijn Pieters as well as feedback from Brodie Rao.
author Dan Villiom Podlaski Christiansen <danchr@gmail.com>
date Sat, 14 Aug 2010 01:30:54 +0200
parents 0bedf3a2062a
children aa50d07208d2 528ff7610cba 263768abd494
files mercurial/encoding.py
diffstat 1 files changed, 30 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/encoding.py	Sun Aug 15 17:50:52 2010 +0200
+++ b/mercurial/encoding.py	Sat Aug 14 01:30:54 2010 +0200
@@ -8,21 +8,41 @@
 import error
 import sys, unicodedata, locale, os
 
-_encodingfixup = {'646': 'ascii', 'ANSI_X3.4-1968': 'ascii'}
+def _getpreferredencoding():
+    '''
+    On darwin, getpreferredencoding ignores the locale environment and
+    always returns mac-roman. http://bugs.python.org/issue6202 fixes this
+    for Python 2.7 and up. This is the same corrected code for earlier
+    Python versions.
+
+    However, we can't use a version check for this method, as some distributions 
+    patch Python to fix this. Instead, we use it as a 'fixer' for the mac-roman
+    encoding, as it is unlikely that this encoding is the actually expected.
+    '''
+    try:
+        locale.CODESET
+    except AttributeError:
+        # Fall back to parsing environment variables :-(
+        return locale.getdefaultlocale()[1]
+
+    oldloc = locale.setlocale(locale.LC_CTYPE)
+    locale.setlocale(locale.LC_CTYPE, "")
+    result = locale.nl_langinfo(locale.CODESET)
+    locale.setlocale(locale.LC_CTYPE, oldloc)
+
+    return result
+
+_encodingfixers = {
+    '646': lambda: 'ascii',
+    'ANSI_X3.4-1968': lambda: 'ascii',
+    'mac-roman': _getpreferredencoding
+}
 
 try:
     encoding = os.environ.get("HGENCODING")
-    if sys.platform == 'darwin' and not encoding:
-        # On darwin, getpreferredencoding ignores the locale environment and
-        # always returns mac-roman. We override this if the environment is
-        # not C (has been customized by the user).
-        lc = locale.setlocale(locale.LC_CTYPE, '')
-        if lc == 'UTF-8':
-            locale.setlocale(locale.LC_CTYPE, 'en_US.UTF-8')
-        encoding = locale.getlocale()[1]
     if not encoding:
         encoding = locale.getpreferredencoding() or 'ascii'
-        encoding = _encodingfixup.get(encoding, encoding)
+        encoding = _encodingfixers.get(encoding, lambda: encoding)()
 except locale.Error:
     encoding = 'ascii'
 encodingmode = os.environ.get("HGENCODINGMODE", "strict")