# HG changeset patch # User Manuel Jacob # Date 1593435816 -7200 # Node ID e3b19004087a7d760e58507f91c5cb136fe0df40 # Parent cb097496138a009bc41edd0017ac74c19bea4006 convert: correctly convert paths to UTF-8 for Subversion The previous code using encoding.tolocal() only worked by chance in these situations: * The string is ASCII: The fast path was triggered and the string was returned unmodified. * The local encoding is UTF-8: The source and target encoding is the same. * The string is not valid UTF-8 and the native encoding is ISO-8859-1: If the string doesn’t decode using UTF-8, ISO-8859-1 is tried as a fallback. During `hg convert`, the local encoding is always UTF-8. The irony is that in this case, encoding.tolocal() behaves like what someone would expect the reverse function, encoding.fromlocal(), to do. When the locale encoding is ISO-8859-15, trying to convert a SVN repo `/tmp/a€` failed before like this: file:///tmp/a%C2%A4 does not look like a Subversion repository to libsvn version 1.14.0 The correct URL is `file:///tmp/a%E2%82%AC`. Unlike previously (with the ISO-8859-1 fallback), decoding the path using the locale encoding can fail. In this case, we have to bail out, as Subversion won’t be able to do anything useful with the path. diff -r cb097496138a -r e3b19004087a hgext/convert/subversion.py --- a/hgext/convert/subversion.py Tue Jun 30 05:04:36 2020 +0200 +++ b/hgext/convert/subversion.py Mon Jun 29 15:03:36 2020 +0200 @@ -3,6 +3,8 @@ # Copyright(C) 2007 Daniel Holth et al from __future__ import absolute_import +import codecs +import locale import os import re import xml.dom.minidom @@ -63,6 +65,38 @@ svn = None +# In Subversion, paths are Unicode (encoded as UTF-8), which Subversion +# converts from / to native strings when interfacing with the OS. When passing +# paths to Subversion, we have to recode them such that it roundstrips with +# what Subversion is doing. + +fsencoding = None + + +def init_fsencoding(): + global fsencoding, fsencoding_is_utf8 + if fsencoding is not None: + return + if pycompat.iswindows: + # On Windows, filenames are Unicode, but we store them using the MBCS + # encoding. + fsencoding = 'mbcs' + else: + # This is the encoding used to convert UTF-8 back to natively-encoded + # strings in Subversion 1.14.0 or earlier with APR 1.7.0 or earlier. + with util.with_lc_ctype(): + fsencoding = locale.nl_langinfo(locale.CODESET) or 'ISO-8859-1' + fsencoding = codecs.lookup(fsencoding).name + fsencoding_is_utf8 = fsencoding == codecs.lookup('utf-8').name + + +def fs2svn(s): + if fsencoding_is_utf8: + return s + else: + return s.decode(fsencoding).encode('utf-8') + + class SvnPathNotFound(Exception): pass @@ -117,7 +151,7 @@ path = b'/' + util.normpath(path) # Module URL is later compared with the repository URL returned # by svn API, which is UTF-8. - path = encoding.tolocal(path) + path = fs2svn(path) path = b'file://%s' % quote(path) return svn.core.svn_path_canonicalize(path) @@ -347,6 +381,17 @@ except ValueError: proto = b'file' path = os.path.abspath(url) + try: + path.decode(fsencoding) + except UnicodeDecodeError: + ui.warn( + _( + b'Subversion requires that paths can be converted to ' + b'Unicode using the current locale encoding (%s)\n' + ) + % pycompat.sysbytes(fsencoding) + ) + return False if proto == b'file': path = util.pconvert(path) elif proto in (b'http', 'https'): @@ -384,6 +429,7 @@ def __init__(self, ui, repotype, url, revs=None): super(svn_source, self).__init__(ui, repotype, url, revs=revs) + init_fsencoding() if not ( url.startswith(b'svn://') or url.startswith(b'svn+ssh://') diff -r cb097496138a -r e3b19004087a tests/test-convert-svn-encoding.t --- a/tests/test-convert-svn-encoding.t Tue Jun 30 05:04:36 2020 +0200 +++ b/tests/test-convert-svn-encoding.t Mon Jun 29 15:03:36 2020 +0200 @@ -163,6 +163,26 @@ abort: http://localhost:$HGPORT/\xff: missing or unsupported repository (esc) [255] +In Subversion, paths are Unicode (encoded as UTF-8). Therefore paths that can't +be converted between UTF-8 and the locale encoding (which is always ASCII in +tests) don't work. + + $ cp -R svn-repo $XFF + $ hg convert $XFF test + initializing destination test repository + Subversion requires that paths can be converted to Unicode using the current locale encoding (ascii) + \xff does not look like a CVS checkout (glob) (esc) + $TESTTMP/\xff does not look like a Git repository (esc) + \xff does not look like a Subversion repository (glob) (esc) + \xff is not a local Mercurial repository (glob) (esc) + \xff does not look like a darcs repository (glob) (esc) + \xff does not look like a monotone repository (glob) (esc) + \xff does not look like a GNU Arch repository (glob) (esc) + \xff does not look like a Bazaar repository (glob) (esc) + cannot find required "p4" tool + abort: \xff: missing or unsupported repository (glob) (esc) + [255] + #if py3 For now, on Python 3, we abort when encountering non-UTF-8 percent-encoded bytes in a filename.