Mercurial > hg

--- a/hgext/convert/subversion.py	Tue Jun 30 16:39:45 2020 +0200
+++ b/hgext/convert/subversion.py	Tue Jun 30 07:23:29 2020 +0200
@@ -354,6 +354,32 @@
 }


+class NonUtf8PercentEncodedBytes(Exception):
+    pass
+
+
+# Subversion paths are Unicode. Since the percent-decoding is done on
+# UTF-8-encoded strings, percent-encoded bytes are interpreted as UTF-8.
+def url2pathname_like_subversion(unicodepath):
+    if pycompat.ispy3:
+        # On Python 3, urlreq.url2pathname() needs unicode and decodes
+        # percent-encoded bytes as UTF-8 with the 'replace' error handler, so
+        # U+FFFD in the result signals invalid UTF-8 (or a literal U+FFFD).
+        unicodepath = urlreq.url2pathname(unicodepath)
+        if u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            raise NonUtf8PercentEncodedBytes
+        else:
+            return unicodepath
+    else:
+        # If we passed unicode on Python 2, it would be converted using the
+        # latin-1 encoding. Therefore, we pass UTF-8-encoded bytes.
+        unicodepath = urlreq.url2pathname(unicodepath.encode('utf-8'))
+        try:
+            return unicodepath.decode('utf-8')
+        except UnicodeDecodeError:
+            raise NonUtf8PercentEncodedBytes
+
+
 def issvnurl(ui, url):
     try:
         proto, path = url.split(b'://', 1)
@@ -366,7 +392,7 @@
             ):
                 path = path[:2] + b':/' + path[6:]
             try:
-                path.decode(fsencoding)
+                unicodepath = path.decode(fsencoding)
             except UnicodeDecodeError:
                 ui.warn(
                     _(
@@ -376,28 +402,22 @@
                     % pycompat.sysbytes(fsencoding)
                 )
                 return False
-            # FIXME: The following reasoning and logic is wrong and will be
-            # fixed in a following changeset.
-            # pycompat.fsdecode() / pycompat.fsencode() are used so that bytes
-            # in the URL roundtrip correctly on Unix. urlreq.url2pathname() on
-            # py3 will decode percent-encoded bytes using the utf-8 encoding
-            # and the "replace" error handler. This means that it will not
-            # preserve non-UTF-8 bytes (https://bugs.python.org/issue40983).
-            # url.open() uses the reverse function (urlreq.pathname2url()) and
-            # has a similar problem
-            # (https://bz.mercurial-scm.org/show_bug.cgi?id=6357). It makes
-            # sense to solve both problems together and handle all file URLs
-            # consistently. For now, we warn.
-            unicodepath = urlreq.url2pathname(pycompat.fsdecode(path))
-            if pycompat.ispy3 and u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            try:
+                unicodepath = url2pathname_like_subversion(unicodepath)
+            except NonUtf8PercentEncodedBytes:
                 ui.warn(
                     _(
-                        b'on Python 3, we currently do not support non-UTF-8 '
-                        b'percent-encoded bytes in file URLs for Subversion '
-                        b'repositories\n'
+                        b'Subversion does not support non-UTF-8 '
+                        b'percent-encoded bytes in file URLs\n'
                     )
                 )
-            path = pycompat.fsencode(unicodepath)
+                return False
+            # Below, we approximate how Subversion checks the path. On Unix, we
+            # should therefore convert the path to bytes using `fsencoding`
+            # (like Subversion does). On Windows, the right thing would
+            # actually be to leave the path as unicode. For now, we restrict
+            # the path to MBCS.
+            path = unicodepath.encode(fsencoding)
     except ValueError:
         proto = b'file'
         path = os.path.abspath(url)
--- a/tests/test-convert-svn-encoding.t	Tue Jun 30 16:39:45 2020 +0200
+++ b/tests/test-convert-svn-encoding.t	Tue Jun 30 07:23:29 2020 +0200
@@ -197,13 +197,13 @@
   abort: file:/*/$TESTTMP/\xff: missing or unsupported repository (glob) (esc)
   [255]

-#if py3
-For now, on Python 3, we abort when encountering non-UTF-8 percent-encoded
-bytes in a filename.
+Subversion percent-decodes a URL only after converting it to UTF-8, so the
+decoded bytes are interpreted as UTF-8. If they aren't valid UTF-8, Subversion
+would choke on them when converting the path to the locale encoding.

   $ hg convert file://$TESTTMP/%FF test
   initializing destination test repository
-  on Python 3, we currently do not support non-UTF-8 percent-encoded bytes in file URLs for Subversion repositories
+  Subversion does not support non-UTF-8 percent-encoded bytes in file URLs
   file:/*/$TESTTMP/%FF does not look like a CVS checkout (glob)
   $TESTTMP/file:$TESTTMP/%FF does not look like a Git repository
   file:/*/$TESTTMP/%FF does not look like a Subversion repository (glob)
@@ -215,4 +215,3 @@
   file:/*/$TESTTMP/%FF does not look like a P4 repository (glob)
   abort: file:/*/$TESTTMP/%FF: missing or unsupported repository (glob)
   [255]
-#endif