comparison hgext/convert/subversion.py @ 45027:0ea9c86fac89 stable 5.4.2

convert: handle percent-encoded bytes in file URLs like Subversion. 75b59d221aa3 added most of the code that gets removed by this patch. It helped make progress on Python 3, but the reasoning was wrong in many ways. I tried to retract it while it was queued, but it was too late. Back then, I was assuming that what happened on Python 2 (preserving bytes) is correct and my Python 3 change is a hack. However, it turned out that Subversion interprets percent-encoded bytes as UTF-8. Accepting the same format as Subversion is a good idea. Consistency with urlreq.pathname2url() (as described in the removed comment) doesn’t matter because that function is only used for passing paths to urllib. This is not a backwards-incompatible change because before 5c0d5b48e58c, non-ASCII filenames didn’t work at all on Python 2. When the locale encoding is ISO-8859-15, `svn` accepts `file:///tmp/a%E2%82%AC` for `/tmp/a€`. Before this patch, this was the case for this extension on Python 3, but not on Python 2. This patch makes it work as it does with `svn` on both Python 2 and Python 3.
author Manuel Jacob <me@manueljacob.de>
date Tue, 30 Jun 2020 07:23:29 +0200
parents ddf66c218104
children df3660cc60f5
comparison
equal deleted inserted replaced
45026:ddf66c218104 45027:0ea9c86fac89
352 b'https': httpcheck, 352 b'https': httpcheck,
353 b'file': filecheck, 353 b'file': filecheck,
354 } 354 }
355 355
356 356
357 class NonUtf8PercentEncodedBytes(Exception):
358 pass
359
360
361 # Subversion paths are Unicode. Since the percent-decoding is done on
362 # UTF-8-encoded strings, percent-encoded bytes are interpreted as UTF-8.
363 def url2pathname_like_subversion(unicodepath):
364 if pycompat.ispy3:
365 # On Python 3, we have to pass unicode to urlreq.url2pathname().
366 # Percent-decoded bytes get decoded using UTF-8 and the 'replace' error
367 # handler.
368 unicodepath = urlreq.url2pathname(unicodepath)
369 if u'\N{REPLACEMENT CHARACTER}' in unicodepath:
370 raise NonUtf8PercentEncodedBytes
371 else:
372 return unicodepath
373 else:
374 # If we passed unicode on Python 2, it would be converted using the
375 # latin-1 encoding. Therefore, we pass UTF-8-encoded bytes.
376 unicodepath = urlreq.url2pathname(unicodepath.encode('utf-8'))
377 try:
378 return unicodepath.decode('utf-8')
379 except UnicodeDecodeError:
380 raise NonUtf8PercentEncodedBytes
381
382
357 def issvnurl(ui, url): 383 def issvnurl(ui, url):
358 try: 384 try:
359 proto, path = url.split(b'://', 1) 385 proto, path = url.split(b'://', 1)
360 if proto == b'file': 386 if proto == b'file':
361 if ( 387 if (
364 and path[1:2].isalpha() 390 and path[1:2].isalpha()
365 and path[2:6].lower() == b'%3a/' 391 and path[2:6].lower() == b'%3a/'
366 ): 392 ):
367 path = path[:2] + b':/' + path[6:] 393 path = path[:2] + b':/' + path[6:]
368 try: 394 try:
369 path.decode(fsencoding) 395 unicodepath = path.decode(fsencoding)
370 except UnicodeDecodeError: 396 except UnicodeDecodeError:
371 ui.warn( 397 ui.warn(
372 _( 398 _(
373 b'Subversion requires that file URLs can be converted ' 399 b'Subversion requires that file URLs can be converted '
374 b'to Unicode using the current locale encoding (%s)\n' 400 b'to Unicode using the current locale encoding (%s)\n'
375 ) 401 )
376 % pycompat.sysbytes(fsencoding) 402 % pycompat.sysbytes(fsencoding)
377 ) 403 )
378 return False 404 return False
379 # FIXME: The following reasoning and logic is wrong and will be 405 try:
380 # fixed in a following changeset. 406 unicodepath = url2pathname_like_subversion(unicodepath)
381 # pycompat.fsdecode() / pycompat.fsencode() are used so that bytes 407 except NonUtf8PercentEncodedBytes:
382 # in the URL roundtrip correctly on Unix. urlreq.url2pathname() on
383 # py3 will decode percent-encoded bytes using the utf-8 encoding
384 # and the "replace" error handler. This means that it will not
385 # preserve non-UTF-8 bytes (https://bugs.python.org/issue40983).
386 # url.open() uses the reverse function (urlreq.pathname2url()) and
387 # has a similar problem
388 # (https://bz.mercurial-scm.org/show_bug.cgi?id=6357). It makes
389 # sense to solve both problems together and handle all file URLs
390 # consistently. For now, we warn.
391 unicodepath = urlreq.url2pathname(pycompat.fsdecode(path))
392 if pycompat.ispy3 and u'\N{REPLACEMENT CHARACTER}' in unicodepath:
393 ui.warn( 408 ui.warn(
394 _( 409 _(
395 b'on Python 3, we currently do not support non-UTF-8 ' 410 b'Subversion does not support non-UTF-8 '
396 b'percent-encoded bytes in file URLs for Subversion ' 411 b'percent-encoded bytes in file URLs\n'
397 b'repositories\n'
398 ) 412 )
399 ) 413 )
400 path = pycompat.fsencode(unicodepath) 414 return False
415 # Below, we approximate how Subversion checks the path. On Unix, we
416 # should therefore convert the path to bytes using `fsencoding`
417 # (like Subversion does). On Windows, the right thing would
418 # actually be to leave the path as unicode. For now, we restrict
419 # the path to MBCS.
420 path = unicodepath.encode(fsencoding)
401 except ValueError: 421 except ValueError:
402 proto = b'file' 422 proto = b'file'
403 path = os.path.abspath(url) 423 path = os.path.abspath(url)
404 try: 424 try:
405 path.decode(fsencoding) 425 path.decode(fsencoding)