convert: handle percent-encoded bytes in file URLs like Subversion stable 5.4.2
authorManuel Jacob <me@manueljacob.de>
Tue, 30 Jun 2020 07:23:29 +0200
branchstable
changeset 45027 0ea9c86fac89
parent 45026 ddf66c218104
child 45028 99f818777f45
convert: handle percent-encoded bytes in file URLs like Subversion 75b59d221aa3 added most of the code that gets removed by this patch. It helped making progress on Python 3, but the reasoning was wrong in many ways. I tried to retract it while it was queued, but it was too late. Back then, I was asssuming that what happened on Python 2 (preserving bytes) is correct and my Python 3 change is a hack. However it turned out that Subversion interprets percent-encoded bytes as UTF-8. Accepting the same format as Subversion is a good idea. Consistency with urlreq.pathname2url() (as described in the removed comment) doesn’t matter because that function is only used for passing paths to urllib. This is not a backwards-incompatible change because before 5c0d5b48e58c, non-ASCII filenames didn’t work at all on Python 2. When the locale encoding is ISO-8859-15, `svn` accepts `file:///tmp/a%E2%82%AC` for `/tmp/a€`. Before this patch, this was the case for this extension on Python 3, but not on Python 2. This patch makes it work like with `svn` on both Python 2 and Python 3.
hgext/convert/subversion.py
tests/test-convert-svn-encoding.t
--- a/hgext/convert/subversion.py	Tue Jun 30 16:39:45 2020 +0200
+++ b/hgext/convert/subversion.py	Tue Jun 30 07:23:29 2020 +0200
@@ -354,6 +354,32 @@
 }
 
 
+class NonUtf8PercentEncodedBytes(Exception):
+    pass
+
+
+# Subversion paths are Unicode. Since the percent-decoding is done on
+# UTF-8-encoded strings, percent-encoded bytes are interpreted as UTF-8.
+def url2pathname_like_subversion(unicodepath):
+    if pycompat.ispy3:
+        # On Python 3, we have to pass unicode to urlreq.url2pathname().
+        # Percent-decoded bytes get decoded using UTF-8 and the 'replace' error
+        # handler.
+        unicodepath = urlreq.url2pathname(unicodepath)
+        if u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            raise NonUtf8PercentEncodedBytes
+        else:
+            return unicodepath
+    else:
+        # If we passed unicode on Python 2, it would be converted using the
+        # latin-1 encoding. Therefore, we pass UTF-8-encoded bytes.
+        unicodepath = urlreq.url2pathname(unicodepath.encode('utf-8'))
+        try:
+            return unicodepath.decode('utf-8')
+        except UnicodeDecodeError:
+            raise NonUtf8PercentEncodedBytes
+
+
 def issvnurl(ui, url):
     try:
         proto, path = url.split(b'://', 1)
@@ -366,7 +392,7 @@
             ):
                 path = path[:2] + b':/' + path[6:]
             try:
-                path.decode(fsencoding)
+                unicodepath = path.decode(fsencoding)
             except UnicodeDecodeError:
                 ui.warn(
                     _(
@@ -376,28 +402,22 @@
                     % pycompat.sysbytes(fsencoding)
                 )
                 return False
-            # FIXME: The following reasoning and logic is wrong and will be
-            # fixed in a following changeset.
-            # pycompat.fsdecode() / pycompat.fsencode() are used so that bytes
-            # in the URL roundtrip correctly on Unix. urlreq.url2pathname() on
-            # py3 will decode percent-encoded bytes using the utf-8 encoding
-            # and the "replace" error handler. This means that it will not
-            # preserve non-UTF-8 bytes (https://bugs.python.org/issue40983).
-            # url.open() uses the reverse function (urlreq.pathname2url()) and
-            # has a similar problem
-            # (https://bz.mercurial-scm.org/show_bug.cgi?id=6357). It makes
-            # sense to solve both problems together and handle all file URLs
-            # consistently. For now, we warn.
-            unicodepath = urlreq.url2pathname(pycompat.fsdecode(path))
-            if pycompat.ispy3 and u'\N{REPLACEMENT CHARACTER}' in unicodepath:
+            try:
+                unicodepath = url2pathname_like_subversion(unicodepath)
+            except NonUtf8PercentEncodedBytes:
                 ui.warn(
                     _(
-                        b'on Python 3, we currently do not support non-UTF-8 '
-                        b'percent-encoded bytes in file URLs for Subversion '
-                        b'repositories\n'
+                        b'Subversion does not support non-UTF-8 '
+                        b'percent-encoded bytes in file URLs\n'
                     )
                 )
-            path = pycompat.fsencode(unicodepath)
+                return False
+            # Below, we approximate how Subversion checks the path. On Unix, we
+            # should therefore convert the path to bytes using `fsencoding`
+            # (like Subversion does). On Windows, the right thing would
+            # actually be to leave the path as unicode. For now, we restrict
+            # the path to MBCS.
+            path = unicodepath.encode(fsencoding)
     except ValueError:
         proto = b'file'
         path = os.path.abspath(url)
--- a/tests/test-convert-svn-encoding.t	Tue Jun 30 16:39:45 2020 +0200
+++ b/tests/test-convert-svn-encoding.t	Tue Jun 30 07:23:29 2020 +0200
@@ -197,13 +197,13 @@
   abort: file:/*/$TESTTMP/\xff: missing or unsupported repository (glob) (esc)
   [255]
 
-#if py3
-For now, on Python 3, we abort when encountering non-UTF-8 percent-encoded
-bytes in a filename.
+Subversion decodes percent-encoded bytes on the converted, UTF-8-encoded
+string. Therefore, if the percent-encoded bytes aren't valid UTF-8, Subversion
+would choke on them when converting them to the locale encoding.
 
   $ hg convert file://$TESTTMP/%FF test
   initializing destination test repository
-  on Python 3, we currently do not support non-UTF-8 percent-encoded bytes in file URLs for Subversion repositories
+  Subversion does not support non-UTF-8 percent-encoded bytes in file URLs
   file:/*/$TESTTMP/%FF does not look like a CVS checkout (glob)
   $TESTTMP/file:$TESTTMP/%FF does not look like a Git repository
   file:/*/$TESTTMP/%FF does not look like a Subversion repository (glob)
@@ -215,4 +215,3 @@
   file:/*/$TESTTMP/%FF does not look like a P4 repository (glob)
   abort: file:/*/$TESTTMP/%FF: missing or unsupported repository (glob)
   [255]
-#endif