view mercurial/hgweb/request.py @ 36814:f9078c6caeb6

hgweb: parse and store HTTP request headers WSGI transmits HTTP request headers as HTTP_* environment variables. We teach our parser about these and hook up a dict-like data structure that supports case insensitive header manipulation. Differential Revision: https://phab.mercurial-scm.org/D2742
author Gregory Szorc <gregory.szorc@gmail.com>
date Thu, 08 Mar 2018 16:22:25 -0800
parents cfb9ef24968c
children ed0456fde625
line wrap: on
line source

# hgweb/request.py - An http request from either CGI or the standalone server.
#
# Copyright 21 May 2005 - (c) 2005 Jake Edge <jake@edge2.net>
# Copyright 2005, 2006 Matt Mackall <mpm@selenic.com>
#
# This software may be used and distributed according to the terms of the
# GNU General Public License version 2 or any later version.

from __future__ import absolute_import

import cgi
import errno
import socket
import wsgiref.headers as wsgiheaders
#import wsgiref.validate

from .common import (
    ErrorResponse,
    HTTP_NOT_MODIFIED,
    statusmessage,
)

from ..thirdparty import (
    attr,
)
from .. import (
    pycompat,
    util,
)

shortcuts = {
    'cl': [('cmd', ['changelog']), ('rev', None)],
    'sl': [('cmd', ['shortlog']), ('rev', None)],
    'cs': [('cmd', ['changeset']), ('node', None)],
    'f': [('cmd', ['file']), ('filenode', None)],
    'fl': [('cmd', ['filelog']), ('filenode', None)],
    'fd': [('cmd', ['filediff']), ('node', None)],
    'fa': [('cmd', ['annotate']), ('filenode', None)],
    'mf': [('cmd', ['manifest']), ('manifest', None)],
    'ca': [('cmd', ['archive']), ('node', None)],
    'tags': [('cmd', ['tags'])],
    'tip': [('cmd', ['changeset']), ('node', ['tip'])],
    'static': [('cmd', ['static']), ('file', None)]
}

def normalize(form):
    # first expand the shortcuts
    for k in shortcuts:
        if k in form:
            for name, value in shortcuts[k]:
                if value is None:
                    value = form[k]
                form[name] = value
            del form[k]
    # And strip the values
    bytesform = {}
    for k, v in form.iteritems():
        bytesform[pycompat.bytesurl(k)] = [
            pycompat.bytesurl(i.strip()) for i in v]
    return bytesform

@attr.s(frozen=True)
class parsedrequest(object):
    """Represents a parsed WSGI request / static HTTP request parameters."""

    # Full URL for this request.
    url = attr.ib()
    # URL without any path components. Just <proto>://<host><port>.
    baseurl = attr.ib()
    # Advertised URL. Like ``url`` and ``baseurl`` but uses SERVER_NAME instead
    # of HTTP: Host header for hostname. This is likely what clients used.
    advertisedurl = attr.ib()
    advertisedbaseurl = attr.ib()
    # WSGI application path.
    apppath = attr.ib()
    # List of path parts to be used for dispatch.
    dispatchparts = attr.ib()
    # URL path component (no query string) used for dispatch.
    dispatchpath = attr.ib()
    # Whether there is a path component to this request. This can be true
    # when ``dispatchpath`` is empty due to REPO_NAME muckery.
    havepathinfo = attr.ib()
    # Raw query string (part after "?" in URL).
    querystring = attr.ib()
    # List of 2-tuples of query string arguments.
    querystringlist = attr.ib()
    # Dict of query string arguments. Values are lists with at least 1 item.
    querystringdict = attr.ib()
    # wsgiref.headers.Headers instance. Operates like a dict with case
    # insensitive keys.
    headers = attr.ib()

def parserequestfromenv(env):
    """Parse URL components from environment variables.

    WSGI defines request attributes via environment variables. This function
    parses the environment variables into a data structure.
    """
    # PEP-0333 defines the WSGI spec and is a useful reference for this code.

    # We first validate that the incoming object conforms with the WSGI spec.
    # We only want to be dealing with spec-conforming WSGI implementations.
    # TODO enable this once we fix internal violations.
    #wsgiref.validate.check_environ(env)

    # PEP-0333 states that environment keys and values are native strings
    # (bytes on Python 2 and str on Python 3). The code points for the Unicode
    # strings on Python 3 must be between \00000-\000FF. We deal with bytes
    # in Mercurial, so mass convert string keys and values to bytes.
    if pycompat.ispy3:
        env = {k.encode('latin-1'): v for k, v in env.iteritems()}
        env = {k: v.encode('latin-1') if isinstance(v, str) else v
               for k, v in env.iteritems()}

    # https://www.python.org/dev/peps/pep-0333/#environ-variables defines
    # the environment variables.
    # https://www.python.org/dev/peps/pep-0333/#url-reconstruction defines
    # how URLs are reconstructed.
    fullurl = env['wsgi.url_scheme'] + '://'
    advertisedfullurl = fullurl

    def addport(s):
        if env['wsgi.url_scheme'] == 'https':
            if env['SERVER_PORT'] != '443':
                s += ':' + env['SERVER_PORT']
        else:
            if env['SERVER_PORT'] != '80':
                s += ':' + env['SERVER_PORT']

        return s

    if env.get('HTTP_HOST'):
        fullurl += env['HTTP_HOST']
    else:
        fullurl += env['SERVER_NAME']
        fullurl = addport(fullurl)

    advertisedfullurl += env['SERVER_NAME']
    advertisedfullurl = addport(advertisedfullurl)

    baseurl = fullurl
    advertisedbaseurl = advertisedfullurl

    fullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    advertisedfullurl += util.urlreq.quote(env.get('SCRIPT_NAME', ''))
    fullurl += util.urlreq.quote(env.get('PATH_INFO', ''))
    advertisedfullurl += util.urlreq.quote(env.get('PATH_INFO', ''))

    if env.get('QUERY_STRING'):
        fullurl += '?' + env['QUERY_STRING']
        advertisedfullurl += '?' + env['QUERY_STRING']

    # When dispatching requests, we look at the URL components (PATH_INFO
    # and QUERY_STRING) after the application root (SCRIPT_NAME). But hgwebdir
    # has the concept of "virtual" repositories. This is defined via REPO_NAME.
    # If REPO_NAME is defined, we append it to SCRIPT_NAME to form a new app
    # root. We also exclude its path components from PATH_INFO when resolving
    # the dispatch path.

    apppath = env['SCRIPT_NAME']

    if env.get('REPO_NAME'):
        if not apppath.endswith('/'):
            apppath += '/'

        apppath += env.get('REPO_NAME')

    if 'PATH_INFO' in env:
        dispatchparts = env['PATH_INFO'].strip('/').split('/')

        # Strip out repo parts.
        repoparts = env.get('REPO_NAME', '').split('/')
        if dispatchparts[:len(repoparts)] == repoparts:
            dispatchparts = dispatchparts[len(repoparts):]
    else:
        dispatchparts = []

    dispatchpath = '/'.join(dispatchparts)

    querystring = env.get('QUERY_STRING', '')

    # We store as a list so we have ordering information. We also store as
    # a dict to facilitate fast lookup.
    querystringlist = util.urlreq.parseqsl(querystring, keep_blank_values=True)

    querystringdict = {}
    for k, v in querystringlist:
        if k in querystringdict:
            querystringdict[k].append(v)
        else:
            querystringdict[k] = [v]

    # HTTP_* keys contain HTTP request headers. The Headers structure should
    # perform case normalization for us. We just rewrite underscore to dash
    # so keys match what likely went over the wire.
    headers = []
    for k, v in env.iteritems():
        if k.startswith('HTTP_'):
            headers.append((k[len('HTTP_'):].replace('_', '-'), v))

    headers = wsgiheaders.Headers(headers)

    return parsedrequest(url=fullurl, baseurl=baseurl,
                         advertisedurl=advertisedfullurl,
                         advertisedbaseurl=advertisedbaseurl,
                         apppath=apppath,
                         dispatchparts=dispatchparts, dispatchpath=dispatchpath,
                         havepathinfo='PATH_INFO' in env,
                         querystring=querystring,
                         querystringlist=querystringlist,
                         querystringdict=querystringdict,
                         headers=headers)

class wsgirequest(object):
    """Higher-level API for a WSGI request.

    WSGI applications are invoked with 2 arguments. They are used to
    instantiate instances of this class, which provides higher-level APIs
    for obtaining request parameters, writing HTTP output, etc.
    """
    def __init__(self, wsgienv, start_response):
        version = wsgienv[r'wsgi.version']
        if (version < (1, 0)) or (version >= (2, 0)):
            raise RuntimeError("Unknown and unsupported WSGI version %d.%d"
                               % version)
        self.inp = wsgienv[r'wsgi.input']
        self.err = wsgienv[r'wsgi.errors']
        self.threaded = wsgienv[r'wsgi.multithread']
        self.multiprocess = wsgienv[r'wsgi.multiprocess']
        self.run_once = wsgienv[r'wsgi.run_once']
        self.env = wsgienv
        self.form = normalize(cgi.parse(self.inp,
                                        self.env,
                                        keep_blank_values=1))
        self._start_response = start_response
        self.server_write = None
        self.headers = []

    def __iter__(self):
        return iter([])

    def read(self, count=-1):
        return self.inp.read(count)

    def drain(self):
        '''need to read all data from request, httplib is half-duplex'''
        length = int(self.env.get('CONTENT_LENGTH') or 0)
        for s in util.filechunkiter(self.inp, limit=length):
            pass

    def respond(self, status, type, filename=None, body=None):
        if not isinstance(type, str):
            type = pycompat.sysstr(type)
        if self._start_response is not None:
            self.headers.append((r'Content-Type', type))
            if filename:
                filename = (filename.rpartition('/')[-1]
                            .replace('\\', '\\\\').replace('"', '\\"'))
                self.headers.append(('Content-Disposition',
                                     'inline; filename="%s"' % filename))
            if body is not None:
                self.headers.append((r'Content-Length', str(len(body))))

            for k, v in self.headers:
                if not isinstance(v, str):
                    raise TypeError('header value must be string: %r' % (v,))

            if isinstance(status, ErrorResponse):
                self.headers.extend(status.headers)
                if status.code == HTTP_NOT_MODIFIED:
                    # RFC 2616 Section 10.3.5: 304 Not Modified has cases where
                    # it MUST NOT include any headers other than these and no
                    # body
                    self.headers = [(k, v) for (k, v) in self.headers if
                                    k in ('Date', 'ETag', 'Expires',
                                          'Cache-Control', 'Vary')]
                status = statusmessage(status.code, pycompat.bytestr(status))
            elif status == 200:
                status = '200 Script output follows'
            elif isinstance(status, int):
                status = statusmessage(status)

            self.server_write = self._start_response(
                pycompat.sysstr(status), self.headers)
            self._start_response = None
            self.headers = []
        if body is not None:
            self.write(body)
            self.server_write = None

    def write(self, thing):
        if thing:
            try:
                self.server_write(thing)
            except socket.error as inst:
                if inst[0] != errno.ECONNRESET:
                    raise

    def writelines(self, lines):
        for line in lines:
            self.write(line)

    def flush(self):
        return None

    def close(self):
        return None

def wsgiapplication(app_maker):
    '''For compatibility with old CGI scripts. A plain hgweb() or hgwebdir()
    can and should now be used as a WSGI application.'''
    application = app_maker()
    def run_wsgi(env, respond):
        return application(env, respond)
    return run_wsgi