# HG changeset patch
# User Brodie Rao
# Date 1301119136 25200
# Node ID 4e8f2310f310663527e0e9a58e885668ba153379
# Parent  8796fb6af67e663e88e8a2102b266f44b5dce9ca
url: provide url object

This adds a url object that re-implements urlsplit() and unsplit(). The
implementation splits out usernames, passwords, and ports.

The implementation is based on the behavior specified by RFC
2396[1]. However, it is much more forgiving than the RFC's
specification; it places no specific restrictions on what characters
are allowed in each segment of the URL other than what is necessary to
split the URL into its constituent parts.

[1]: http://www.ietf.org/rfc/rfc2396.txt

diff -r 8796fb6af67e -r 4e8f2310f310 mercurial/url.py
--- a/mercurial/url.py	Wed Feb 23 23:30:48 2011 +0100
+++ b/mercurial/url.py	Fri Mar 25 22:58:56 2011 -0700
@@ -23,6 +23,198 @@
         result = scheme + '://' + result[len(scheme + ':'):]
     return result
 
+class url(object):
+    """Reliable URL parser.
+
+    This parses URLs and provides attributes for the following
+    components:
+
+    <scheme>://<user>:<passwd>@<host>:<port>/<path>?<query>#<fragment>
+
+    Missing components are set to None. The only exception is
+    fragment, which is set to '' if present but empty.
+
+    If parse_fragment is False, fragment is included in query. If
+    parse_query is False, query is included in path. If both are
+    False, both fragment and query are included in path.
+
+    See http://www.ietf.org/rfc/rfc2396.txt for more information.
+
+    Examples:
+
+    >>> url('http://www.ietf.org/rfc/rfc2396.txt')
+    <url scheme: 'http', host: 'www.ietf.org', path: 'rfc/rfc2396.txt'>
+    >>> url('ssh://[::1]:2200//home/joe/repo')
+    <url scheme: 'ssh', host: '[::1]', port: '2200', path: '/home/joe/repo'>
+    >>> url('file:///home/joe/repo')
+    <url scheme: 'file', path: '/home/joe/repo'>
+    >>> url('bundle:foo')
+    <url scheme: 'bundle', path: 'foo'>
+
+    Authentication credentials:
+
+    >>> url('ssh://joe:xyz@x/repo')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xyz', host: 'x', path: 'repo'>
+    >>> url('ssh://joe@x/repo')
+    <url scheme: 'ssh', user: 'joe', host: 'x', path: 'repo'>
+
+    Query strings and fragments:
+
+    >>> url('http://host/a?b#c')
+    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://host/a?b#c', parse_query=False, parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: 'a?b#c'>
+    """
+
+    _safechars = "!~*'()+"
+    _safepchars = "/!~*'()+"
+
+    def __init__(self, path, parse_query=True, parse_fragment=True):
+        # We slowly chomp away at path until we have only the path left
+        self.scheme = self.user = self.passwd = self.host = None
+        self.port = self.path = self.query = self.fragment = None
+        self._localpath = True
+
+        if not path.startswith('/') and ':' in path:
+            parts = path.split(':', 1)
+            if parts[0]:
+                self.scheme, path = parts
+                self._localpath = False
+
+        if not path:
+            path = None
+            if self._localpath:
+                self.path = ''
+                return
+        else:
+            if parse_fragment and '#' in path:
+                path, self.fragment = path.split('#', 1)
+                if not path:
+                    path = None
+            if self._localpath:
+                self.path = path
+                return
+
+            if parse_query and '?' in path:
+                path, self.query = path.split('?', 1)
+                if not path:
+                    path = None
+                if not self.query:
+                    self.query = None
+
+            # // is required to specify a host/authority
+            if path and path.startswith('//'):
+                parts = path[2:].split('/', 1)
+                if len(parts) > 1:
+                    self.host, path = parts
+                    path = path
+                else:
+                    self.host = parts[0]
+                    path = None
+                if not self.host:
+                    self.host = None
+                    if path:
+                        path = '/' + path
+
+            if self.host and '@' in self.host:
+                self.user, self.host = self.host.rsplit('@', 1)
+                if ':' in self.user:
+                    self.user, self.passwd = self.user.split(':', 1)
+                if not self.host:
+                    self.host = None
+
+            # Don't split on colons in IPv6 addresses without ports
+            if (self.host and ':' in self.host and
+                not (self.host.startswith('[') and self.host.endswith(']'))):
+                self.host, self.port = self.host.rsplit(':', 1)
+                if not self.host:
+                    self.host = None
+            self.path = path
+
+        for a in ('user', 'passwd', 'host', 'port',
+                  'path', 'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                setattr(self, a, urllib.unquote(v))
+
+    def __repr__(self):
+        attrs = []
+        for a in ('scheme', 'user', 'passwd', 'host', 'port', 'path',
+                  'query', 'fragment'):
+            v = getattr(self, a)
+            if v is not None:
+                attrs.append('%s: %r' % (a, v))
+        return '<url %s>' % ', '.join(attrs)
+
+    def __str__(self):
+        """Join the URL's components back into a URL string.
+
+        Examples:
+
+        >>> str(url('http://user:pw@host:80/?foo#bar'))
+        'http://user:pw@host:80/?foo#bar'
+        >>> str(url('ssh://user:pw@[::1]:2200//home/joe#'))
+        'ssh://user:pw@[::1]:2200//home/joe#'
+        >>> str(url('http://localhost:80//'))
+        'http://localhost:80//'
+        >>> str(url('http://localhost:80/'))
+        'http://localhost:80/'
+        >>> str(url('http://localhost:80'))
+        'http://localhost:80'
+        >>> str(url('bundle:foo'))
+        'bundle:foo'
+        >>> str(url('path'))
+        'path'
+        """
+        if self._localpath:
+            s = self.path
+            if self.fragment:
+                s += '#' + self.fragment
+            return s
+
+        s = self.scheme + ':'
+        if (self.user or self.passwd or self.host or
+            self.scheme and not self.path):
+            s += '//'
+        if self.user:
+            s += urllib.quote(self.user, safe=self._safechars)
+        if self.passwd:
+            s += ':' + urllib.quote(self.passwd, safe=self._safechars)
+        if self.user or self.passwd:
+            s += '@'
+        if self.host:
+            if not (self.host.startswith('[') and self.host.endswith(']')):
+                s += urllib.quote(self.host)
+            else:
+                s += self.host
+        if self.port:
+            s += ':' + urllib.quote(self.port)
+        if ((self.host and self.path is not None) or
+            (self.host and self.query or self.fragment)):
+            s += '/'
+        if self.path:
+            s += urllib.quote(self.path, safe=self._safepchars)
+        if self.query:
+            s += '?' + urllib.quote(self.query, safe=self._safepchars)
+        if self.fragment is not None:
+            s += '#' + urllib.quote(self.fragment, safe=self._safepchars)
+        return s
+
+    def authinfo(self):
+        user, passwd = self.user, self.passwd
+        try:
+            self.user, self.passwd = None, None
+            s = str(self)
+        finally:
+            self.user, self.passwd = user, passwd
+        if not self.user:
+            return (s, None)
+        return (s, (None, (str(self), self.host),
+                    self.user, self.passwd or ''))
+
+def has_scheme(path):
+    return bool(url(path).scheme)
+
 def hidepassword(url):
     '''hide user credential in a url string'''
     scheme, netloc, path, params, query, fragment = urlparse.urlparse(url)
diff -r 8796fb6af67e -r 4e8f2310f310 tests/test-url.py
--- a/tests/test-url.py	Wed Feb 23 23:30:48 2011 +0100
+++ b/tests/test-url.py	Fri Mar 25 22:58:56 2011 -0700
@@ -49,6 +49,142 @@
 check(_verifycert(None, 'example.com'),
       'no certificate received')
 
+import doctest
+
+def test_url():
+    """
+    >>> from mercurial.url import url
+
+    This tests for edge cases in url.URL's parsing algorithm. Most of
+    these aren't useful for documentation purposes, so they aren't
+    part of the class's doc tests.
+
+    Query strings and fragments:
+
+    >>> url('http://host/a?b#c')
+    <url scheme: 'http', host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://host/a?')
+    <url scheme: 'http', host: 'host', path: 'a'>
+    >>> url('http://host/a#b#c')
+    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b#c'>
+    >>> url('http://host/a#b?c')
+    <url scheme: 'http', host: 'host', path: 'a', fragment: 'b?c'>
+    >>> url('http://host/?a#b')
+    <url scheme: 'http', host: 'host', path: '', query: 'a', fragment: 'b'>
+    >>> url('http://host/?a#b', parse_query=False)
+    <url scheme: 'http', host: 'host', path: '?a', fragment: 'b'>
+    >>> url('http://host/?a#b', parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: '', query: 'a#b'>
+    >>> url('http://host/?a#b', parse_query=False, parse_fragment=False)
+    <url scheme: 'http', host: 'host', path: '?a#b'>
+
+    IPv6 addresses:
+
+    >>> url('ldap://[2001:db8::7]/c=GB?objectClass?one')
+    <url scheme: 'ldap', host: '[2001:db8::7]', path: 'c=GB',
+         query: 'objectClass?one'>
+    >>> url('ldap://joe:xxx@[2001:db8::7]:80/c=GB?objectClass?one')
+    <url scheme: 'ldap', user: 'joe', passwd: 'xxx', host: '[2001:db8::7]',
+         port: '80', path: 'c=GB', query: 'objectClass?one'>
+
+    Missing scheme, host, etc.:
+
+    >>> url('://192.0.2.16:80/')
+    <url path: '://192.0.2.16:80/'>
+    >>> url('http://mercurial.selenic.com')
+    <url scheme: 'http', host: 'mercurial.selenic.com'>
+    >>> url('/foo')
+    <url path: '/foo'>
+    >>> url('bundle:/foo')
+    <url scheme: 'bundle', path: '/foo'>
+    >>> url('a?b#c')
+    <url path: 'a?b', fragment: 'c'>
+    >>> url('http://x.com?arg=/foo')
+    <url scheme: 'http', host: 'x.com', query: 'arg=/foo'>
+    >>> url('http://joe:xxx@/foo')
+    <url scheme: 'http', user: 'joe', passwd: 'xxx', path: 'foo'>
+
+    Just a scheme and a path:
+
+    >>> url('mailto:John.Doe@example.com')
+    <url scheme: 'mailto', path: 'John.Doe@example.com'>
+    >>> url('a:b:c:d')
+    <url scheme: 'a', path: 'b:c:d'>
+
+    SSH examples:
+
+    >>> url('ssh://joe@host//home/joe')
+    <url scheme: 'ssh', user: 'joe', host: 'host', path: '/home/joe'>
+    >>> url('ssh://joe:xxx@host/src')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', path: 'src'>
+    >>> url('ssh://joe:xxx@host')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host'>
+    >>> url('ssh://joe@host')
+    <url scheme: 'ssh', user: 'joe', host: 'host'>
+    >>> url('ssh://host')
+    <url scheme: 'ssh', host: 'host'>
+    >>> url('ssh://')
+    <url scheme: 'ssh'>
+    >>> url('ssh:')
+    <url scheme: 'ssh'>
+
+    Non-numeric port:
+
+    >>> url('http://example.com:dd')
+    <url scheme: 'http', host: 'example.com', port: 'dd'>
+    >>> url('ssh://joe:xxx@host:ssh/foo')
+    <url scheme: 'ssh', user: 'joe', passwd: 'xxx', host: 'host', port: 'ssh',
+         path: 'foo'>
+
+    Bad authentication credentials:
+
+    >>> url('http://joe@joeville:123@4:@host/a?b#c')
+    <url scheme: 'http', user: 'joe@joeville', passwd: '123@4:',
+         host: 'host', path: 'a', query: 'b', fragment: 'c'>
+    >>> url('http://!*#?/@!*#?/:@host/a?b#c')
+    <url scheme: 'http', host: '!*', fragment: '?/@!*#?/:@host/a?b#c'>
+    >>> url('http://!*#?@!*#?:@host/a?b#c')
+    <url scheme: 'http', host: '!*', fragment: '?@!*#?:@host/a?b#c'>
+    >>> url('http://!*@:!*@@host/a?b#c')
+    <url scheme: 'http', user: '!*@', passwd: '!*@', host: 'host',
+         path: 'a', query: 'b', fragment: 'c'>
+
+    File paths:
+
+    >>> url('a/b/c/d.g.f')
+    <url path: 'a/b/c/d.g.f'>
+    >>> url('/x///z/y/')
+    <url path: '/x///z/y/'>
+
+    Empty URL:
+
+    >>> u = url('')
+    >>> u
+    <url path: ''>
+    >>> str(u)
+    ''
+
+    Empty path with query string:
+
+    >>> str(url('http://foo/?bar'))
+    'http://foo/?bar'
+
+    Invalid path:
+
+    >>> u = url('http://foo/bar')
+    >>> u.path = 'bar'
+    >>> str(u)
+    'http://foo/bar'
+
+    >>> u = url('file:///foo/bar/baz')
+    >>> u
+    <url scheme: 'file', path: '/foo/bar/baz'>
+    >>> str(u)
+    'file:/foo/bar/baz'
+    """
+
+doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
+
 # Unicode (IDN) certname isn't supported
 check(_verifycert(cert(u'\u4f8b.jp'), 'example.jp'),
       'IDN in certificate not supported')