changeset 13848:b2798c1defff

url: be stricter about detecting schemes. While the URL parser is very forgiving about what characters are allowed in each component, it's useful to be strict about the scheme so we don't accidentally interpret local paths with colons as URLs. This restricts schemes to containing alphanumeric characters, dashes, pluses, and dots (as specified in RFC 2396).
author Brodie Rao <brodie@bitheap.org>
date Thu, 31 Mar 2011 17:37:33 -0700
parents ddcb57a2eaeb
children 9f97de157aad
files mercurial/url.py tests/test-url.py
diffstat 2 files changed, 9 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/url.py	Fri Apr 01 11:45:29 2011 -0500
+++ b/mercurial/url.py	Thu Mar 31 17:37:33 2011 -0700
@@ -7,7 +7,7 @@
 # This software may be used and distributed according to the terms of the
 # GNU General Public License version 2 or any later version.
 
-import urllib, urllib2, httplib, os, socket, cStringIO
+import urllib, urllib2, httplib, os, socket, cStringIO, re
 import __builtin__
 from i18n import _
 import keepalive, util
@@ -64,6 +64,7 @@
 
     _safechars = "!~*'()+"
     _safepchars = "/!~*'()+"
+    _matchscheme = re.compile(r'^[a-zA-Z0-9+.\-]+:').match
 
     def __init__(self, path, parsequery=True, parsefragment=True):
         # We slowly chomp away at path until we have only the path left
@@ -88,7 +89,7 @@
             self.path = path
             return
 
-        if not path.startswith('/') and ':' in path:
+        if self._matchscheme(path):
             parts = path.split(':', 1)
             if parts[0]:
                 self.scheme, path = parts
--- a/tests/test-url.py	Fri Apr 01 11:45:29 2011 -0500
+++ b/tests/test-url.py	Thu Mar 31 17:37:33 2011 -0700
@@ -157,6 +157,12 @@
     <url path: 'a/b/c/d.g.f'>
     >>> url('/x///z/y/')
     <url path: '/x///z/y/'>
+    >>> url('/foo:bar')
+    <url path: '/foo:bar'>
+    >>> url('\\\\foo:bar')
+    <url path: '\\\\foo:bar'>
+    >>> url('./foo:bar')
+    <url path: './foo:bar'>
 
     Non-localhost file URL: