diff hgext/convert/subversion.py @ 45022:e3b19004087a stable

convert: correctly convert paths to UTF-8 for Subversion The previous code using encoding.tolocal() only worked by chance in these situations: * The string is ASCII: The fast path was triggered and the string was returned unmodified. * The local encoding is UTF-8: The source and target encoding is the same. * The string is not valid UTF-8 and the native encoding is ISO-8859-1: If the string doesn’t decode using UTF-8, ISO-8859-1 is tried as a fallback. During `hg convert`, the local encoding is always UTF-8. The irony is that in this case, encoding.tolocal() behaves like what someone would expect the reverse function, encoding.fromlocal(), to do. When the locale encoding is ISO-8859-15, trying to convert a SVN repo `/tmp/a€` failed before like this: file:///tmp/a%C2%A4 does not look like a Subversion repository to libsvn version 1.14.0 The correct URL is `file:///tmp/a%E2%82%AC`. Unlike previously (with the ISO-8859-1 fallback), decoding the path using the locale encoding can fail. In this case, we have to bail out, as Subversion won’t be able to do anything useful with the path.
author Manuel Jacob <me@manueljacob.de>
date Mon, 29 Jun 2020 15:03:36 +0200
parents cb097496138a
children e54c3cafda15
line wrap: on
line diff
--- a/hgext/convert/subversion.py	Tue Jun 30 05:04:36 2020 +0200
+++ b/hgext/convert/subversion.py	Mon Jun 29 15:03:36 2020 +0200
@@ -3,6 +3,8 @@
 # Copyright(C) 2007 Daniel Holth et al
 from __future__ import absolute_import
 
+import codecs
+import locale
 import os
 import re
 import xml.dom.minidom
@@ -63,6 +65,38 @@
     svn = None
 
 
+# In Subversion, paths are Unicode (encoded as UTF-8), which Subversion
+# converts from / to native strings when interfacing with the OS. When passing
+# paths to Subversion, we have to recode them such that it roundstrips with
+# what Subversion is doing.
+
+fsencoding = None
+
+
+def init_fsencoding():
+    global fsencoding, fsencoding_is_utf8
+    if fsencoding is not None:
+        return
+    if pycompat.iswindows:
+        # On Windows, filenames are Unicode, but we store them using the MBCS
+        # encoding.
+        fsencoding = 'mbcs'
+    else:
+        # This is the encoding used to convert UTF-8 back to natively-encoded
+        # strings in Subversion 1.14.0 or earlier with APR 1.7.0 or earlier.
+        with util.with_lc_ctype():
+            fsencoding = locale.nl_langinfo(locale.CODESET) or 'ISO-8859-1'
+    fsencoding = codecs.lookup(fsencoding).name
+    fsencoding_is_utf8 = fsencoding == codecs.lookup('utf-8').name
+
+
+def fs2svn(s):
+    if fsencoding_is_utf8:
+        return s
+    else:
+        return s.decode(fsencoding).encode('utf-8')
+
+
 class SvnPathNotFound(Exception):
     pass
 
@@ -117,7 +151,7 @@
             path = b'/' + util.normpath(path)
         # Module URL is later compared with the repository URL returned
         # by svn API, which is UTF-8.
-        path = encoding.tolocal(path)
+        path = fs2svn(path)
         path = b'file://%s' % quote(path)
     return svn.core.svn_path_canonicalize(path)
 
@@ -347,6 +381,17 @@
     except ValueError:
         proto = b'file'
         path = os.path.abspath(url)
+        try:
+            path.decode(fsencoding)
+        except UnicodeDecodeError:
+            ui.warn(
+                _(
+                    b'Subversion requires that paths can be converted to '
+                    b'Unicode using the current locale encoding (%s)\n'
+                )
+                % pycompat.sysbytes(fsencoding)
+            )
+            return False
     if proto == b'file':
         path = util.pconvert(path)
     elif proto in (b'http', 'https'):
@@ -384,6 +429,7 @@
     def __init__(self, ui, repotype, url, revs=None):
         super(svn_source, self).__init__(ui, repotype, url, revs=revs)
 
+        init_fsencoding()
         if not (
             url.startswith(b'svn://')
             or url.startswith(b'svn+ssh://')