bazarr/libs/waitress/parser.py

314 lines
11 KiB
Python

##############################################################################
#
# Copyright (c) 2001, 2002 Zope Foundation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""HTTP Request Parser
This server uses asyncore to accept connections and do initial
processing but threads to do work.
"""
import re
from io import BytesIO
from waitress.compat import (
tostr,
urlparse,
unquote_bytes_to_wsgi,
)
from waitress.buffers import OverflowableBuffer
from waitress.receiver import (
FixedStreamReceiver,
ChunkedReceiver,
)
from waitress.utilities import (
find_double_newline,
RequestEntityTooLarge,
RequestHeaderFieldsTooLarge,
BadRequest,
)
class ParsingError(Exception):
    """Raised when a request line, header line, or URI cannot be parsed."""
class HTTPRequestParser(object):
    """Incrementally collect one HTTP request from a stream of bytes.

    Bytes are fed in through ``received``.  Once ``completed`` is true the
    instance holds the parsed request (or an ``error``) and is passed to a
    server task constructor.
    """

    completed = False  # Becomes True once the request is complete.
    empty = False  # Becomes True if the header block held only whitespace.
    expect_continue = False  # Client sent an "Expect: 100-continue" header.
    headers_finished = False  # Becomes True once the header block is read.
    header_plus = b''  # Header bytes buffered while the block is incomplete.
    chunked = False  # Becomes True for a chunked HTTP/1.1 request.
    content_length = 0
    header_bytes_received = 0
    body_bytes_received = 0
    body_rcv = None  # Receiver object for the request body, if any.
    version = '1.0'
    error = None
    connection_close = False

    # Attributes set by parse_header: first_line, command, version,
    # proxy_scheme, proxy_netloc, path, query, fragment, url_scheme.

    def __init__(self, adj):
        """Create a parser configured by ``adj``, an Adjustments object."""
        # Header names are stored uppercased, with dashes replaced by
        # underscores.
        self.headers = {}
        self.adj = adj

    def received(self, data):
        """Feed ``data`` (bytes of one request) into the parser.

        Returns the number of bytes consumed.  ``completed`` is set once
        the header and, if there is one, the body have been received, or
        as soon as an error has been recorded in ``error``.
        """
        if self.completed:
            # Nothing more can be consumed for this request.
            return 0

        size = len(data)
        receiver = self.body_rcv

        if receiver is not None:
            # Body phase: hand the bytes to the body receiver.
            used = receiver.received(data)
            self.body_bytes_received += used
            body_limit = self.adj.max_request_body_size
            if self.body_bytes_received >= body_limit:
                # this will only be raised during t-e: chunked requests
                self.error = RequestEntityTooLarge(
                    'exceeds max_body of %s' % body_limit)
                self.completed = True
            elif receiver.error:
                # Probably garbage in the chunked encoding input.
                self.error = receiver.error
                self.completed = True
            elif receiver.completed:
                # The request, body included, is ready to use.
                self.completed = True
                if self.chunked:
                    # The chunked body has been converted into a normal
                    # body, so its length is known now.  parse_header has
                    # already popped TRANSFER_ENCODING, so the request
                    # looks like a non-chunked one with a valid
                    # content-length.
                    self.headers['CONTENT_LENGTH'] = str(receiver.__len__())
            return used

        # Header phase: look for the end of the header block in everything
        # buffered so far plus the new data.
        buffered = self.header_plus + data
        boundary = find_double_newline(buffered)

        if boundary < 0:
            # Header not finished yet.
            self.header_bytes_received += size
            header_limit = self.adj.max_request_header_size
            if self.header_bytes_received >= header_limit:
                # Malformed header: someone just sent garbage over the
                # wire.  Construct a request of our own, disregarding the
                # incoming HTTP version and simply using 1.0.
                self.parse_header(b'GET / HTTP/1.0\n')
                self.error = RequestHeaderFieldsTooLarge(
                    'exceeds max_header of %s' % header_limit)
                self.completed = True
            self.header_plus = buffered
            return size

        # Header finished.  Bytes of ``data`` past the boundary are not
        # consumed by this call.
        used = size - (len(buffered) - boundary)
        # Remove preceding blank lines.
        block = buffered[:boundary].lstrip()
        if not block:
            self.empty = True
            self.completed = True
        else:
            try:
                self.parse_header(block)
            except ParsingError as exc:
                self.error = BadRequest(exc.args[0])
                self.completed = True
            else:
                if self.body_rcv is None:
                    # No content-length header and not a t-e: chunked
                    # request, so there is no body to wait for.
                    self.completed = True
                if self.content_length > 0:
                    body_limit = self.adj.max_request_body_size
                    # The request is refused if the declared
                    # content-length is too large.
                    if self.content_length >= body_limit:
                        self.error = RequestEntityTooLarge(
                            'exceeds max_body of %s' % body_limit)
                        self.completed = True
        self.headers_finished = True
        return used

    def parse_header(self, header_plus):
        """Parse ``header_plus``: the request line followed by the headers.

        Fills ``self.headers`` and sets the request attributes (command,
        version, URL parts, connection/transfer flags).  When a body is
        expected, ``self.body_rcv`` is set to a matching receiver.
        """
        request_line, _sep, header = header_plus.partition(b'\n')
        request_line = request_line.rstrip()
        self.first_line = request_line  # for testing

        headers = self.headers
        for line in get_header_lines(header):
            colon = line.find(b':')
            if colon <= 0:
                # No field name in front of a colon; possibly garbage in
                # the headers, so the line is ignored.
                continue
            name = line[:colon]
            if b'_' in name:
                # Skipped; presumably because the dash-to-underscore
                # translation below would make it collide with the dashed
                # spelling of the same name.
                continue
            value = line[colon + 1:].strip()
            wsgi_name = tostr(name.upper().replace(b'-', b'_'))
            # A repeated header has its values joined with a comma.
            # Applications already need to handle comma separated values,
            # as HTTP front ends might do the concatenation for you
            # (behavior specified in RFC2616).
            try:
                headers[wsgi_name] += tostr(b', ' + value)
            except KeyError:
                headers[wsgi_name] = tostr(value)

        # command, uri and version are expected to be bytes here.
        command, uri, version = crack_first_line(request_line)
        version = tostr(version)
        command = tostr(command)
        self.command = command
        self.version = version

        scheme, netloc, path, query, fragment = split_uri(uri)
        self.proxy_scheme = scheme
        self.proxy_netloc = netloc
        self.path = path
        self.query = query
        self.fragment = fragment
        self.url_scheme = self.adj.url_scheme

        connection = headers.get('CONNECTION', '').lower()

        if version == '1.0':
            if connection != 'keep-alive':
                self.connection_close = True
        elif version == '1.1':
            # The server buffers data from chunked transfers, so downstream
            # clients never need to deal with chunked requests and should
            # not see the HTTP_TRANSFER_ENCODING header; it is popped here.
            transfer_encoding = headers.pop('TRANSFER_ENCODING', '')
            if transfer_encoding.lower() == 'chunked':
                self.chunked = True
                overflow_buf = OverflowableBuffer(self.adj.inbuf_overflow)
                self.body_rcv = ChunkedReceiver(overflow_buf)
            self.expect_continue = (
                headers.get('EXPECT', '').lower() == '100-continue')
            if connection == 'close':
                self.connection_close = True

        if self.chunked:
            return

        # A content-length that is not an integer is treated as 0.
        try:
            length = int(headers.get('CONTENT_LENGTH', 0))
        except ValueError:
            length = 0
        self.content_length = length
        if length > 0:
            overflow_buf = OverflowableBuffer(self.adj.inbuf_overflow)
            self.body_rcv = FixedStreamReceiver(length, overflow_buf)

    def get_body_stream(self):
        """Return a file-like object for the body (empty if no body)."""
        receiver = self.body_rcv
        if receiver is None:
            return BytesIO()
        return receiver.getfile()

    def close(self):
        """Close the body receiver's buffer, if a receiver exists."""
        receiver = self.body_rcv
        if receiver is not None:
            receiver.getbuf().close()
def split_uri(uri):
    """Split a request-target into its URL components.

    ``uri`` is the raw request-target as bytes.  Returns the 5-tuple
    ``(scheme, netloc, path, query, fragment)``.  Every element is passed
    through ``tostr`` except ``path``, which is passed through
    ``unquote_bytes_to_wsgi``.

    Raises ``ParsingError`` when the URI cannot be split.
    """
    scheme = netloc = path = query = fragment = b''

    if uri[:2] == b'//':
        # urlsplit would treat a target such as "//foo/bar" as a
        # scheme-less netloc ("foo") followed by the path "/bar", losing
        # the first path segment.  Peel off the fragment and the query by
        # hand and leave the path exactly as it was sent.
        path = uri
        if b'#' in path:
            path, fragment = path.split(b'#', 1)
        if b'?' in path:
            path, query = path.split(b'?', 1)
    else:
        # urlsplit handles byte input by returning bytes on py3, so
        # scheme, netloc, path, query, and fragment are bytes
        try:
            scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
        except ValueError:
            # Covers UnicodeError (a ValueError subclass) as well as the
            # plain ValueError urlsplit raises for a malformed netloc,
            # e.g. unmatched square brackets.
            raise ParsingError('Bad URI')

    return (
        tostr(scheme),
        tostr(netloc),
        unquote_bytes_to_wsgi(path),
        tostr(query),
        tostr(fragment),
    )
def get_header_lines(header):
    """Split ``header`` (bytes) on newlines, joining continuation lines.

    A line that starts with a space or a tab is appended, unchanged, to
    the line before it.  Returns the list of resulting lines.

    Raises ``ParsingError`` when the very first line is a continuation
    line, since there is nothing to attach it to.
    """
    merged = []
    for raw in header.split(b'\n'):
        if not raw.startswith((b' ', b'\t')):
            merged.append(raw)
            continue
        if not merged:
            # http://corte.si/posts/code/pathod/pythonservers/index.html
            raise ParsingError('Malformed header line "%s"' % tostr(raw))
        merged[-1] = merged[-1] + raw
    return merged
# Request line: method, request-target (optionally an absolute URI with a
# scheme and authority), and an optional " HTTP/<version>" suffix.
first_line_re = re.compile(
    b'([^ ]+) '
    b'((?:[^ :?#]+://[^ ?#/]*(?:[0-9]{1,5})?)?[^ ]+)'
    b'(( HTTP/([0-9.]+))$|$)'
)


def crack_first_line(line):
    """Split a request line into ``(method, uri, version)``, all bytes.

    ``version`` is ``b''`` when the line carries no ``HTTP/x.y`` suffix.
    When the line does not look like a request line at all, the result is
    ``(b'', b'', b'')``.

    Raises ``ParsingError`` when the method is not entirely uppercase.
    """
    m = first_line_re.match(line)
    # "$" also matches just before a trailing newline, so additionally
    # insist that the match covers the whole line.
    if m is None or m.end() != len(line):
        return b'', b'', b''

    if m.group(3):
        version = m.group(5)
    else:
        # Keep the result bytes, like the other two elements and like the
        # no-match result above, rather than None.
        version = b''

    method = m.group(1)
    # the request methods that are currently defined are all uppercase:
    # https://www.iana.org/assignments/http-methods/http-methods.xhtml and
    # the request method is case sensitive according to
    # https://tools.ietf.org/html/rfc7231#section-4.1
    # By disallowing anything but uppercase methods we save poor
    # unsuspecting souls from sending lowercase HTTP methods to waitress
    # and having the request complete, while servers like nginx drop the
    # request onto the floor.
    if method != method.upper():
        raise ParsingError('Malformed HTTP method "%s"' % tostr(method))

    uri = m.group(2)
    return method, uri, version