changeset 13770:4e8f2310f310

url: provide url object. This adds a url object that re-implements urlsplit() and urlunsplit(). The implementation splits out usernames, passwords, and ports. The implementation is based on the behavior specified by RFC 2396 [1]. However, it is much more forgiving than the RFC's specification; it places no specific restrictions on what characters are allowed in each segment of the URL other than what is necessary to split the URL into its constituent parts. [1]: http://www.ietf.org/rfc/rfc2396.txt
author Brodie Rao <brodie@bitheap.org>
date Fri, 25 Mar 2011 22:58:56 -0700
parents 8796fb6af67e
children ce6227306c9a
files mercurial/url.py tests/test-url.py
diffstat 2 files changed, 328 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/mercurial/url.py	Wed Feb 23 23:30:48 2011 +0100
+++ b/mercurial/url.py	Fri Mar 25 22:58:56 2011 -0700
@@ -23,6 +23,198 @@
         result = scheme + '://' + result[len(scheme + ':'):]
     return result
 
+class url(object):
+    """Reliable URL parser.
+
+    This parses URLs and provides attributes for the following
+    components:
+
+    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
+
+    Missing components are set to None. The only exception is
+    fragment, which is set to '' if present but empty.
+
+    If parse_fragment is False, fragment is included in query. If
+    parse_query is False, query is included in path. If both are
+    False, both fragment and query are included in path.
+
+    See http://www.ietf.org/rfc/rfc2396.txt for more information.
+
+    Examples:
+
+    >>> url('http://www.ietf.org/rfc/rfc2396.txt')
+    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
+    >>> url('ssh://[::1]:2200//home/joe/repo')
+    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
+    >>> url('file:///home/joe/repo')
+    <url scheme: 'file', path: '/home/joe/repo'>
+    >>> url('bundle:foo')
+    <url scheme: 'bundle', path: 'foo'>
+
+    Authentication credentials:
+
+    >>> url('ssh://joe:xyz@x/repo')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
+    >>> url('ssh://joe@x/repo')
+    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
+
+    Query strings and fragments:
+
+    >>> url('http://host/a?b#c')
+    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://host/a?b#c', parse_query=False, parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: 'a?b#c'>
+    """
+
+    _safechars = "!~*'()+"
+    _safepchars = "/!~*'()+"
+
+    def __init__(self, path, parse_query=True, parse_fragment=True):
+        # We slowly chomp away at path until we have only the path left
+        self.scheme = self.user = self.passwd = self.host = None
+        self.port = self.path = self.query = self.fragment = None
+        self._localpath = True
+
+        if not path.startswith('/') and ':' in path:
+            parts = path.split(':', 1)
+            if parts[0]:
+                self.scheme, path = parts
+                self._localpath = False
+
+        if not path:
+            path = None
+            if self._localpath:
+                self.path = ''
+                return
+        else:
+            if parse_fragment and '#' in path:
+                path, self.fragment = path.split('#', 1)
+                if not path:
+                    path = None
+            if self._localpath:
+                self.path = path
+                return
+
+            if parse_query and '?' in path:
+                path, self.query = path.split('?', 1)
+                if not path:
+                    path = None
+                if not self.query:
+                    self.query = None
+
+            # // is required to specify a host/authority
+            if path and path.startswith('//'):
+                parts = path[2:].split('/', 1)
+                if len(parts) > 1:
+                    self.host, path = parts
+                    path = path
+                else:
+                    self.host = parts[0]
+                    path = None
+                if not self.host:
+                    self.host = None
+                    if path:
+                        path = '/' + path
+
+            if self.host and '@' in self.host:
+                self.user, self.host = self.host.rsplit('@', 1)
+                if ':' in self.user:
+                    self.user, self.passwd = self.user.split(':', 1)
+                if not self.host:
+                    self.host = None
+
+            # Don't split on colons in IPv6 addresses without ports
+            if (self.host and ':' in self.host and
+                not (self.host.startswith('[') and self.host.endswith(']'))):
+                self.host, self.port = self.host.rsplit(':', 1)
+                if not self.host:
+                    self.host = None
+        self.path = path
+
+        for a in ('user', 'passwd', 'host', 'port',
+                  'path', 'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                setattr(self, a, urllib.unquote(v))
+
+    def __repr__(self):
+        attrs = []
+        for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
+                  'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                attrs.append('%s: %r' % (a, v))
+        return '<url %s>' % ', '.join(attrs)
+
+    def __str__(self):
+        """Join the URL's components back into a URL string.
+
+        Examples:
+
+        >>> str(url('http://user:pw@host:80/?foo#bar'))
+        'http://user:pw@host:80/?foo#bar'
+        >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
+        'ssh://user:pw@[::1]:2200//home/joe#'
+        >>> str(url('http://localhost:80//'))
+        'http://localhost:80//'
+        >>> str(url('http://localhost:80/'))
+        'http://localhost:80/'
+        >>> str(url('http://localhost:80'))
+        'http://localhost:80'
+        >>> str(url('bundle:foo'))
+        'bundle:foo'
+        >>> str(url('path'))
+        'path'
+        """
+        if self._localpath:
+            s = self.path
+            if self.fragment:
+                s += '#' + self.fragment
+            return s
+
+        s = self.scheme + ':'
+        if (self.user or self.passwd or self.host or
+            self.scheme and not self.path):
+            s += '//'
+        if self.user:
+            s += urllib.quote(self.user, safe=self._safechars)
+        if self.passwd:
+            s += ':' + urllib.quote(self.passwd, safe=self._safechars)
+        if self.user or self.passwd:
+            s += '@'
+        if self.host:
+            if not (self.host.startswith('[') and self.host.endswith(']')):
+                s += urllib.quote(self.host)
+            else:
+                s += self.host
+        if self.port:
+            s += ':' + urllib.quote(self.port)
+        if ((self.host and self.path is not None) or
+            (self.host and self.query or self.fragment)):
+            s += '/'
+        if self.path:
+            s += urllib.quote(self.path, safe=self._safepchars)
+        if self.query:
+            s += '?' + urllib.quote(self.query, safe=self._safepchars)
+        if self.fragment is not None:
+            s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
+        return s
+
+    def authinfo(self):
+        user, passwd = self.user, self.passwd
+        try:
+            self.user, self.passwd = None, None
+            s = str(self)
+        finally:
+            self.user, self.passwd = user, passwd
+        if not self.user:
+            return (s, None)
+        return (s, (None, (str(self), self.host),
+                    self.user, self.passwd or ''))
+
+def has_scheme(path):
+    return bool(url(path).scheme)
+
 def hidepassword(url):
     '''hide user credential in a url string'''
     scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
--- a/tests/test-url.py	Wed Feb 23 23:30:48 2011 +0100
+++ b/tests/test-url.py	Fri Mar 25 22:58:56 2011 -0700
@@ -49,6 +49,142 @@
 check(_verifycert(None, 'example.com'),
       'no certificate received')
 
+import doctest
+
+def test_url():
+    """
+    >>> from mercurial.url import url
+
+    This tests for edge cases in url.url's parsing algorithm. Most of
+    these aren't useful for documentation purposes, so they aren't
+    part of the class's doc tests.
+
+    Query strings and fragments:
+
+    >>> url('http://host/a?b#c')
+    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://host/a?')
+    <url scheme: 'http', host: 'host', path: 'a'>
+    >>> url('http://host/a#b#c')
+    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b#c'>
+    >>> url('http://host/a#b?c')
+    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b?c'>
+    >>> url('http://host/?a#b')
+    <url scheme: 'http', host: 'host', path: '', query: 'a', fragment: 'b'>
+    >>> url('http://host/?a#b', parse_query=False)
+    <url scheme: 'http', host: 'host', path: '?a', fragment: 'b'>
+    >>> url('http://host/?a#b', parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: '', query: 'a#b'>
+    >>> url('http://host/?a#b', parse_query=False, parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: '?a#b'>
+
+    IPv6 addresses:
+
+    >>> url('ldap://[2001:db8::7]/c=GB?objectClass?one')
+    <url scheme: 'ldap', host: '[2001:db8::7]', path: 'c=GB',
+         query: 'objectClass?one'>
+    >>> url('ldap://joe:xxx@[2001:db8::7]:80/c=GB?objectClass?one')
+    <url scheme: 'ldap', user: 'joe', passwd: 'xxx', host: '[2001:db8::7]',
+         port: '80', path: 'c=GB', query: 'objectClass?one'>
+
+    Missing scheme, host, etc.:
+
+    >>> url('://192.0.2.16:80/')
+    <url path: '://192.0.2.16:80/'>
+    >>> url('http://mercurial.selenic.com')
+    <url scheme: 'http', host: 'mercurial.selenic.com'>
+    >>> url('/foo')
+    <url path: '/foo'>
+    >>> url('bundle:/foo')
+    <url scheme: 'bundle', path: '/foo'>
+    >>> url('a?b#c')
+    <url path: 'a?b', fragment: 'c'>
+    >>> url('http://x.com?arg=/foo')
+    <url scheme: 'http', host: 'x.com', query: 'arg=/foo'>
+    >>> url('http://joe:xxx@/foo')
+    <url scheme: 'http', user: 'joe', passwd: 'xxx', path: 'foo'>
+
+    Just a scheme and a path:
+
+    >>> url('mailto:John.Doe@example.com')
+    <url scheme: 'mailto', path: 'John.Doe@example.com'>
+    >>> url('a:b:c:d')
+    <url scheme: 'a', path: 'b:c:d'>
+
+    SSH examples:
+
+    >>> url('ssh://joe@host//home/joe')
+    <url scheme: 'ssh', user: 'joe', host: 'host', path: '/home/joe'>
+    >>> url('ssh://joe:xxx@host/src')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', path: 'src'>
+    >>> url('ssh://joe:xxx@host')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host'>
+    >>> url('ssh://joe@host')
+    <url scheme: 'ssh', user: 'joe', host: 'host'>
+    >>> url('ssh://host')
+    <url scheme: 'ssh', host: 'host'>
+    >>> url('ssh://')
+    <url scheme: 'ssh'>
+    >>> url('ssh:')
+    <url scheme: 'ssh'>
+
+    Non-numeric port:
+
+    >>> url('http://example.com:dd')
+    <url scheme: 'http', host: 'example.com', port: 'dd'>
+    >>> url('ssh://joe:xxx@host:ssh/foo')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', port: 'ssh',
+         path: 'foo'>
+
+    Bad authentication credentials:
+
+    >>> url('http://joe@joeville:123@4:@host/a?b#c')
+    <url scheme: 'http', user: 'joe@joeville', passwd: '123@4:',
+         host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://!*#?/@!*#?/:@host/a?b#c')
+    <url scheme: 'http', host: '!*', fragment: '?/@!*#?/:@host/a?b#c'>
+    >>> url('http://!*#?@!*#?:@host/a?b#c')
+    <url scheme: 'http', host: '!*', fragment: '?@!*#?:@host/a?b#c'>
+    >>> url('http://!*@:!*@@host/a?b#c')
+    <url scheme: 'http', user: '!*@', passwd: '!*@', host: 'host',
+         path: 'a', query: 'b', fragment: 'c'>
+
+    File paths:
+
+    >>> url('a/b/c/d.g.f')
+    <url path: 'a/b/c/d.g.f'>
+    >>> url('/x///z/y/')
+    <url path: '/x///z/y/'>
+
+    Empty URL:
+
+    >>> u = url('')
+    >>> u
+    <url path: ''>
+    >>> str(u)
+    ''
+
+    Empty path with query string:
+
+    >>> str(url('http://foo/?bar'))
+    'http://foo/?bar'
+
+    Invalid path:
+
+    >>> u = url('http://foo/bar')
+    >>> u.path = 'bar'
+    >>> str(u)
+    'http://foo/bar'
+
+    >>> u = url('file:///foo/bar/baz')
+    >>> u
+    <url scheme: 'file', path: '/foo/bar/baz'>
+    >>> str(u)
+    'file:/foo/bar/baz'
+    """
+
+doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
+
 # Unicode (IDN) certname isn't supported
 check(_verifycert(cert(u'\u4f8b.jp'), 'example.jp'),
       'IDN in certificate not supported')