Mercurial > hg
changeset 13770:4e8f2310f310
url: provide url object
This adds a url object that re-implements urlsplit() and
urlunsplit(). The implementation splits out usernames, passwords, and
ports.
The implementation is based on the behavior specified by RFC
2396[1]. However, it is much more forgiving than the RFC's
specification; it places no specific restrictions on what characters
are allowed in each segment of the URL other than what is necessary to
split the URL into its constituent parts.
[1]: http://www.ietf.org/rfc/rfc2396.txt
author | Brodie Rao <brodie@bitheap.org> |
---|---|
date | Fri, 25 Mar 2011 22:58:56 -0700 |
parents | 8796fb6af67e |
children | ce6227306c9a |
files | mercurial/url.py tests/test-url.py |
diffstat | 2 files changed, 328 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/mercurial/url.py Wed Feb 23 23:30:48 2011 +0100 +++ b/mercurial/url.py Fri Mar 25 22:58:56 2011 -0700 @@ -23,6 +23,198 @@ result = scheme + '://' + result[len(scheme + ':'):] return result +class url(object): + """Reliable URL parser. + + This parses URLs and provides attributes for the following + components: + + <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment> + + Missing components are set to None. The only exception is + fragment, which is set to '' if present but empty. + + If parse_fragment is False, fragment is included in query. If + parse_query is False, query is included in path. If both are + False, both fragment and query are included in path. + + See http://www.ietf.org/rfc/rfc2396.txt for more information. + + Examples: + + >>> url('http://www.ietf.org/rfc/rfc2396.txt') + <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'> + >>> url('ssh://[::1]:2200//home/joe/repo') + <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'> + >>> url('file:///home/joe/repo') + <url scheme: 'file', path: '/home/joe/repo'> + >>> url('bundle:foo') + <url scheme: 'bundle', path: 'foo'> + + Authentication credentials: + + >>> url('ssh://joe:xyz@x/repo') + <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'> + >>> url('ssh://joe@x/repo') + <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'> + + Query strings and fragments: + + >>> url('http://host/a?b#c') + <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'> + >>> url('http://host/a?b#c', parse_query=False, parse_fragment=False) + <url scheme: 'http', host: 'host', path: 'a?b#c'> + """ + + _safechars = "!~*'()+" + _safepchars = "/!~*'()+" + + def __init__(self, path, parse_query=True, parse_fragment=True): + # We slowly chomp away at path until we have only the path left + self.scheme = self.user = self.passwd = self.host = None + self.port = self.path = self.query = self.fragment = None + self._localpath = True + + if 
not path.startswith('/') and ':' in path: + parts = path.split(':', 1) + if parts[0]: + self.scheme, path = parts + self._localpath = False + + if not path: + path = None + if self._localpath: + self.path = '' + return + else: + if parse_fragment and '#' in path: + path, self.fragment = path.split('#', 1) + if not path: + path = None + if self._localpath: + self.path = path + return + + if parse_query and '?' in path: + path, self.query = path.split('?', 1) + if not path: + path = None + if not self.query: + self.query = None + + # // is required to specify a host/authority + if path and path.startswith('//'): + parts = path[2:].split('/', 1) + if len(parts) > 1: + self.host, path = parts + path = path + else: + self.host = parts[0] + path = None + if not self.host: + self.host = None + if path: + path = '/' + path + + if self.host and '@' in self.host: + self.user, self.host = self.host.rsplit('@', 1) + if ':' in self.user: + self.user, self.passwd = self.user.split(':', 1) + if not self.host: + self.host = None + + # Don't split on colons in IPv6 addresses without ports + if (self.host and ':' in self.host and + not (self.host.startswith('[') and self.host.endswith(']'))): + self.host, self.port = self.host.rsplit(':', 1) + if not self.host: + self.host = None + self.path = path + + for a in ('user', 'passwd', 'host', 'port', + 'path', 'query', 'fragment'): + v = getattr(self, a) + if v is not None: + setattr(self, a, urllib.unquote(v)) + + def __repr__(self): + attrs = [] + for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path', + 'query', 'fragment'): + v = getattr(self, a) + if v is not None: + attrs.append('%s: %r' % (a, v)) + return '<url %s>' % ', '.join(attrs) + + def __str__(self): + """Join the URL's components back into a URL string. 
+ + Examples: + + >>> str(url('http://user:pw@host:80/?foo#bar')) + 'http://user:pw@host:80/?foo#bar' + >>> str(url('ssh://user:pw@[::1]:2200//home/joe#')) + 'ssh://user:pw@[::1]:2200//home/joe#' + >>> str(url('http://localhost:80//')) + 'http://localhost:80//' + >>> str(url('http://localhost:80/')) + 'http://localhost:80/' + >>> str(url('http://localhost:80')) + 'http://localhost:80' + >>> str(url('bundle:foo')) + 'bundle:foo' + >>> str(url('path')) + 'path' + """ + if self._localpath: + s = self.path + if self.fragment: + s += '#' + self.fragment + return s + + s = self.scheme + ':' + if (self.user or self.passwd or self.host or + self.scheme and not self.path): + s += '//' + if self.user: + s += urllib.quote(self.user, safe=self._safechars) + if self.passwd: + s += ':' + urllib.quote(self.passwd, safe=self._safechars) + if self.user or self.passwd: + s += '@' + if self.host: + if not (self.host.startswith('[') and self.host.endswith(']')): + s += urllib.quote(self.host) + else: + s += self.host + if self.port: + s += ':' + urllib.quote(self.port) + if ((self.host and self.path is not None) or + (self.host and self.query or self.fragment)): + s += '/' + if self.path: + s += urllib.quote(self.path, safe=self._safepchars) + if self.query: + s += '?' + urllib.quote(self.query, safe=self._safepchars) + if self.fragment is not None: + s += '#' + urllib.quote(self.fragment, safe=self._safepchars) + return s + + def authinfo(self): + user, passwd = self.user, self.passwd + try: + self.user, self.passwd = None, None + s = str(self) + finally: + self.user, self.passwd = user, passwd + if not self.user: + return (s, None) + return (s, (None, (str(self), self.host), + self.user, self.passwd or '')) + +def has_scheme(path): + return bool(url(path).scheme) + def hidepassword(url): '''hide user credential in a url string''' scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
--- a/tests/test-url.py Wed Feb 23 23:30:48 2011 +0100 +++ b/tests/test-url.py Fri Mar 25 22:58:56 2011 -0700 @@ -49,6 +49,142 @@ check(_verifycert(None, 'example.com'), 'no certificate received') +import doctest + +def test_url(): + """ + >>> from mercurial.url import url + + This tests for edge cases in url.URL's parsing algorithm. Most of + these aren't useful for documentation purposes, so they aren't + part of the class's doc tests. + + Query strings and fragments: + + >>> url('http://host/a?b#c') + <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'> + >>> url('http://host/a?') + <url scheme: 'http', host: 'host', path: 'a'> + >>> url('http://host/a#b#c') + <url scheme: 'http', host: 'host', path: 'a', fragment: 'b#c'> + >>> url('http://host/a#b?c') + <url scheme: 'http', host: 'host', path: 'a', fragment: 'b?c'> + >>> url('http://host/?a#b') + <url scheme: 'http', host: 'host', path: '', query: 'a', fragment: 'b'> + >>> url('http://host/?a#b', parse_query=False) + <url scheme: 'http', host: 'host', path: '?a', fragment: 'b'> + >>> url('http://host/?a#b', parse_fragment=False) + <url scheme: 'http', host: 'host', path: '', query: 'a#b'> + >>> url('http://host/?a#b', parse_query=False, parse_fragment=False) + <url scheme: 'http', host: 'host', path: '?a#b'> + + IPv6 addresses: + + >>> url('ldap://[2001:db8::7]/c=GB?objectClass?one') + <url scheme: 'ldap', host: '[2001:db8::7]', path: 'c=GB', + query: 'objectClass?one'> + >>> url('ldap://joe:xxx@[2001:db8::7]:80/c=GB?objectClass?one') + <url scheme: 'ldap', user: 'joe', passwd: 'xxx', host: '[2001:db8::7]', + port: '80', path: 'c=GB', query: 'objectClass?one'> + + Missing scheme, host, etc.: + + >>> url('://192.0.2.16:80/') + <url path: '://192.0.2.16:80/'> + >>> url('http://mercurial.selenic.com') + <url scheme: 'http', host: 'mercurial.selenic.com'> + >>> url('/foo') + <url path: '/foo'> + >>> url('bundle:/foo') + <url scheme: 'bundle', path: '/foo'> + >>> url('a?b#c') + <url path: 'a?b', 
fragment: 'c'> + >>> url('http://x.com?arg=/foo') + <url scheme: 'http', host: 'x.com', query: 'arg=/foo'> + >>> url('http://joe:xxx@/foo') + <url scheme: 'http', user: 'joe', passwd: 'xxx', path: 'foo'> + + Just a scheme and a path: + + >>> url('mailto:John.Doe@example.com') + <url scheme: 'mailto', path: 'John.Doe@example.com'> + >>> url('a:b:c:d') + <url scheme: 'a', path: 'b:c:d'> + + SSH examples: + + >>> url('ssh://joe@host//home/joe') + <url scheme: 'ssh', user: 'joe', host: 'host', path: '/home/joe'> + >>> url('ssh://joe:xxx@host/src') + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', path: 'src'> + >>> url('ssh://joe:xxx@host') + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host'> + >>> url('ssh://joe@host') + <url scheme: 'ssh', user: 'joe', host: 'host'> + >>> url('ssh://host') + <url scheme: 'ssh', host: 'host'> + >>> url('ssh://') + <url scheme: 'ssh'> + >>> url('ssh:') + <url scheme: 'ssh'> + + Non-numeric port: + + >>> url('http://example.com:dd') + <url scheme: 'http', host: 'example.com', port: 'dd'> + >>> url('ssh://joe:xxx@host:ssh/foo') + <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', port: 'ssh', + path: 'foo'> + + Bad authentication credentials: + + >>> url('http://joe@joeville:123@4:@host/a?b#c') + <url scheme: 'http', user: 'joe@joeville', passwd: '123@4:', + host: 'host', path: 'a', query: 'b', fragment: 'c'> + >>> url('http://!*#?/@!*#?/:@host/a?b#c') + <url scheme: 'http', host: '!*', fragment: '?/@!*#?/:@host/a?b#c'> + >>> url('http://!*#?@!*#?:@host/a?b#c') + <url scheme: 'http', host: '!*', fragment: '?@!*#?:@host/a?b#c'> + >>> url('http://!*@:!*@@host/a?b#c') + <url scheme: 'http', user: '!*@', passwd: '!*@', host: 'host', + path: 'a', query: 'b', fragment: 'c'> + + File paths: + + >>> url('a/b/c/d.g.f') + <url path: 'a/b/c/d.g.f'> + >>> url('/x///z/y/') + <url path: '/x///z/y/'> + + Empty URL: + + >>> u = url('') + >>> u + <url path: ''> + >>> str(u) + '' + + Empty path with query string: + + 
>>> str(url('http://foo/?bar')) + 'http://foo/?bar' + + Invalid path: + + >>> u = url('http://foo/bar') + >>> u.path = 'bar' + >>> str(u) + 'http://foo/bar' + + >>> u = url('file:///foo/bar/baz') + >>> u + <url scheme: 'file', path: '/foo/bar/baz'> + >>> str(u) + 'file:/foo/bar/baz' + """ + +doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE) + # Unicode (IDN) certname isn't supported check(_verifycert(cert(u'\u4f8b.jp'), 'example.jp'), 'IDN in certificate not supported')