2018-09-17 00:27:00 +00:00
|
|
|
from __future__ import absolute_import
|
2019-09-18 15:30:46 +00:00
|
|
|
import re
|
2018-09-17 00:27:00 +00:00
|
|
|
from collections import namedtuple
|
|
|
|
|
|
|
|
from ..exceptions import LocationParseError
|
2019-09-18 15:30:46 +00:00
|
|
|
from ..packages import six, rfc3986
|
|
|
|
from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
|
|
|
|
from ..packages.rfc3986.validators import Validator
|
|
|
|
from ..packages.rfc3986 import abnf_regexp, normalizers, compat, misc
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
|
|
|
|
|
|
|
|
# We only want to normalize urls with an HTTP(S) scheme.
|
|
|
|
# urllib3 infers URLs without a scheme (None) to be http.
|
|
|
|
NORMALIZABLE_SCHEMES = ('http', 'https', None)
|
|
|
|
|
2019-09-18 15:30:46 +00:00
|
|
|
# Regex for detecting URLs with schemes. RFC 3986 Section 3.1
|
|
|
|
SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)")
|
|
|
|
|
|
|
|
PATH_CHARS = abnf_regexp.UNRESERVED_CHARS_SET | abnf_regexp.SUB_DELIMITERS_SET | {':', '@', '/'}
|
|
|
|
QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {'?'}
|
|
|
|
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
class Url(namedtuple('Url', url_attrs)):
|
|
|
|
"""
|
2019-09-18 15:30:46 +00:00
|
|
|
Data structure for representing an HTTP URL. Used as a return value for
|
2018-09-17 00:27:00 +00:00
|
|
|
:func:`parse_url`. Both the scheme and host are normalized as they are
|
|
|
|
both case-insensitive according to RFC 3986.
|
|
|
|
"""
|
|
|
|
__slots__ = ()
|
|
|
|
|
|
|
|
def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
|
|
|
|
query=None, fragment=None):
|
|
|
|
if path and not path.startswith('/'):
|
|
|
|
path = '/' + path
|
2019-09-18 15:30:46 +00:00
|
|
|
if scheme is not None:
|
2018-09-17 00:27:00 +00:00
|
|
|
scheme = scheme.lower()
|
|
|
|
return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
|
|
|
|
query, fragment)
|
|
|
|
|
|
|
|
@property
|
|
|
|
def hostname(self):
|
|
|
|
"""For backwards-compatibility with urlparse. We're nice like that."""
|
|
|
|
return self.host
|
|
|
|
|
|
|
|
@property
|
|
|
|
def request_uri(self):
|
|
|
|
"""Absolute path including the query string."""
|
|
|
|
uri = self.path or '/'
|
|
|
|
|
|
|
|
if self.query is not None:
|
|
|
|
uri += '?' + self.query
|
|
|
|
|
|
|
|
return uri
|
|
|
|
|
|
|
|
@property
|
|
|
|
def netloc(self):
|
|
|
|
"""Network location including host and port"""
|
|
|
|
if self.port:
|
|
|
|
return '%s:%d' % (self.host, self.port)
|
|
|
|
return self.host
|
|
|
|
|
|
|
|
@property
|
|
|
|
def url(self):
|
|
|
|
"""
|
|
|
|
Convert self into a url
|
|
|
|
|
|
|
|
This function should more or less round-trip with :func:`.parse_url`. The
|
|
|
|
returned url may not be exactly the same as the url inputted to
|
|
|
|
:func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
|
|
|
|
with a blank port will have : removed).
|
|
|
|
|
|
|
|
Example: ::
|
|
|
|
|
|
|
|
>>> U = parse_url('http://google.com/mail/')
|
|
|
|
>>> U.url
|
|
|
|
'http://google.com/mail/'
|
|
|
|
>>> Url('http', 'username:password', 'host.com', 80,
|
|
|
|
... '/path', 'query', 'fragment').url
|
|
|
|
'http://username:password@host.com:80/path?query#fragment'
|
|
|
|
"""
|
|
|
|
scheme, auth, host, port, path, query, fragment = self
|
2019-09-18 15:30:46 +00:00
|
|
|
url = u''
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
# We use "is not None" we want things to happen with empty strings (or 0 port)
|
|
|
|
if scheme is not None:
|
2019-09-18 15:30:46 +00:00
|
|
|
url += scheme + u'://'
|
2018-09-17 00:27:00 +00:00
|
|
|
if auth is not None:
|
2019-09-18 15:30:46 +00:00
|
|
|
url += auth + u'@'
|
2018-09-17 00:27:00 +00:00
|
|
|
if host is not None:
|
|
|
|
url += host
|
|
|
|
if port is not None:
|
2019-09-18 15:30:46 +00:00
|
|
|
url += u':' + str(port)
|
2018-09-17 00:27:00 +00:00
|
|
|
if path is not None:
|
|
|
|
url += path
|
|
|
|
if query is not None:
|
2019-09-18 15:30:46 +00:00
|
|
|
url += u'?' + query
|
2018-09-17 00:27:00 +00:00
|
|
|
if fragment is not None:
|
2019-09-18 15:30:46 +00:00
|
|
|
url += u'#' + fragment
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
return url
|
|
|
|
|
|
|
|
def __str__(self):
|
|
|
|
return self.url
|
|
|
|
|
|
|
|
|
|
|
|
def split_first(s, delims):
|
|
|
|
"""
|
2019-09-18 15:30:46 +00:00
|
|
|
.. deprecated:: 1.25
|
|
|
|
|
2018-09-17 00:27:00 +00:00
|
|
|
Given a string and an iterable of delimiters, split on the first found
|
|
|
|
delimiter. Return two split parts and the matched delimiter.
|
|
|
|
|
|
|
|
If not found, then the first part is the full input string.
|
|
|
|
|
|
|
|
Example::
|
|
|
|
|
|
|
|
>>> split_first('foo/bar?baz', '?/=')
|
|
|
|
('foo', 'bar?baz', '/')
|
|
|
|
>>> split_first('foo/bar?baz', '123')
|
|
|
|
('foo/bar?baz', '', None)
|
|
|
|
|
|
|
|
Scales linearly with number of delims. Not ideal for large number of delims.
|
|
|
|
"""
|
|
|
|
min_idx = None
|
|
|
|
min_delim = None
|
|
|
|
for d in delims:
|
|
|
|
idx = s.find(d)
|
|
|
|
if idx < 0:
|
|
|
|
continue
|
|
|
|
|
|
|
|
if min_idx is None or idx < min_idx:
|
|
|
|
min_idx = idx
|
|
|
|
min_delim = d
|
|
|
|
|
|
|
|
if min_idx is None or min_idx < 0:
|
|
|
|
return s, '', None
|
|
|
|
|
|
|
|
return s[:min_idx], s[min_idx + 1:], min_delim
|
|
|
|
|
|
|
|
|
2019-09-18 15:30:46 +00:00
|
|
|
def _encode_invalid_chars(component, allowed_chars, encoding='utf-8'):
|
|
|
|
"""Percent-encodes a URI component without reapplying
|
|
|
|
onto an already percent-encoded component. Based on
|
|
|
|
rfc3986.normalizers.encode_component()
|
|
|
|
"""
|
|
|
|
if component is None:
|
|
|
|
return component
|
|
|
|
|
|
|
|
# Try to see if the component we're encoding is already percent-encoded
|
|
|
|
# so we can skip all '%' characters but still encode all others.
|
|
|
|
percent_encodings = len(normalizers.PERCENT_MATCHER.findall(
|
|
|
|
compat.to_str(component, encoding)))
|
|
|
|
|
|
|
|
uri_bytes = component.encode('utf-8', 'surrogatepass')
|
|
|
|
is_percent_encoded = percent_encodings == uri_bytes.count(b'%')
|
|
|
|
|
|
|
|
encoded_component = bytearray()
|
|
|
|
|
|
|
|
for i in range(0, len(uri_bytes)):
|
|
|
|
# Will return a single character bytestring on both Python 2 & 3
|
|
|
|
byte = uri_bytes[i:i+1]
|
|
|
|
byte_ord = ord(byte)
|
|
|
|
if ((is_percent_encoded and byte == b'%')
|
|
|
|
or (byte_ord < 128 and byte.decode() in allowed_chars)):
|
|
|
|
encoded_component.extend(byte)
|
|
|
|
continue
|
|
|
|
encoded_component.extend('%{0:02x}'.format(byte_ord).encode().upper())
|
|
|
|
|
|
|
|
return encoded_component.decode(encoding)
|
|
|
|
|
|
|
|
|
2018-09-17 00:27:00 +00:00
|
|
|
def parse_url(url):
|
|
|
|
"""
|
|
|
|
Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
|
|
|
|
performed to parse incomplete urls. Fields not provided will be None.
|
2019-09-18 15:30:46 +00:00
|
|
|
This parser is RFC 3986 compliant.
|
|
|
|
|
|
|
|
:param str url: URL to parse into a :class:`.Url` namedtuple.
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
Partly backwards-compatible with :mod:`urlparse`.
|
|
|
|
|
|
|
|
Example::
|
|
|
|
|
|
|
|
>>> parse_url('http://google.com/mail/')
|
|
|
|
Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
|
|
|
|
>>> parse_url('google.com:80')
|
|
|
|
Url(scheme=None, host='google.com', port=80, path=None, ...)
|
|
|
|
>>> parse_url('/foo?bar')
|
|
|
|
Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
|
|
|
|
"""
|
|
|
|
if not url:
|
|
|
|
# Empty
|
|
|
|
return Url()
|
|
|
|
|
2019-09-18 15:30:46 +00:00
|
|
|
is_string = not isinstance(url, six.binary_type)
|
2018-09-17 00:27:00 +00:00
|
|
|
|
2019-09-18 15:30:46 +00:00
|
|
|
# RFC 3986 doesn't like URLs that have a host but don't start
|
|
|
|
# with a scheme and we support URLs like that so we need to
|
|
|
|
# detect that problem and add an empty scheme indication.
|
|
|
|
# We don't get hurt on path-only URLs here as it's stripped
|
|
|
|
# off and given an empty scheme anyways.
|
|
|
|
if not SCHEME_REGEX.search(url):
|
|
|
|
url = "//" + url
|
2018-09-17 00:27:00 +00:00
|
|
|
|
2019-09-18 15:30:46 +00:00
|
|
|
def idna_encode(name):
|
|
|
|
if name and any([ord(x) > 128 for x in name]):
|
|
|
|
try:
|
|
|
|
import idna
|
|
|
|
except ImportError:
|
|
|
|
raise LocationParseError("Unable to parse URL without the 'idna' module")
|
|
|
|
try:
|
|
|
|
return idna.encode(name.lower(), strict=True, std3_rules=True)
|
|
|
|
except idna.IDNAError:
|
|
|
|
raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name)
|
|
|
|
return name
|
|
|
|
|
|
|
|
try:
|
|
|
|
split_iri = misc.IRI_MATCHER.match(compat.to_str(url)).groupdict()
|
|
|
|
iri_ref = rfc3986.IRIReference(
|
|
|
|
split_iri['scheme'], split_iri['authority'],
|
|
|
|
_encode_invalid_chars(split_iri['path'], PATH_CHARS),
|
|
|
|
_encode_invalid_chars(split_iri['query'], QUERY_CHARS),
|
|
|
|
_encode_invalid_chars(split_iri['fragment'], FRAGMENT_CHARS)
|
|
|
|
)
|
|
|
|
has_authority = iri_ref.authority is not None
|
|
|
|
uri_ref = iri_ref.encode(idna_encoder=idna_encode)
|
|
|
|
except (ValueError, RFC3986Exception):
|
|
|
|
return six.raise_from(LocationParseError(url), None)
|
|
|
|
|
|
|
|
# rfc3986 strips the authority if it's invalid
|
|
|
|
if has_authority and uri_ref.authority is None:
|
|
|
|
raise LocationParseError(url)
|
|
|
|
|
|
|
|
# Only normalize schemes we understand to not break http+unix
|
|
|
|
# or other schemes that don't follow RFC 3986.
|
|
|
|
if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
|
|
|
|
uri_ref = uri_ref.normalize()
|
|
|
|
|
|
|
|
# Validate all URIReference components and ensure that all
|
|
|
|
# components that were set before are still set after
|
|
|
|
# normalization has completed.
|
|
|
|
validator = Validator()
|
|
|
|
try:
|
|
|
|
validator.check_validity_of(
|
|
|
|
*validator.COMPONENT_NAMES
|
|
|
|
).validate(uri_ref)
|
|
|
|
except ValidationError:
|
|
|
|
return six.raise_from(LocationParseError(url), None)
|
|
|
|
|
|
|
|
# For the sake of backwards compatibility we put empty
|
|
|
|
# string values for path if there are any defined values
|
|
|
|
# beyond the path in the URL.
|
|
|
|
# TODO: Remove this when we break backwards compatibility.
|
|
|
|
path = uri_ref.path
|
2018-09-17 00:27:00 +00:00
|
|
|
if not path:
|
2019-09-18 15:30:46 +00:00
|
|
|
if (uri_ref.query is not None
|
|
|
|
or uri_ref.fragment is not None):
|
|
|
|
path = ""
|
|
|
|
else:
|
|
|
|
path = None
|
|
|
|
|
|
|
|
# Ensure that each part of the URL is a `str` for
|
|
|
|
# backwards compatibility.
|
|
|
|
def to_input_type(x):
|
|
|
|
if x is None:
|
|
|
|
return None
|
|
|
|
elif not is_string and not isinstance(x, six.binary_type):
|
|
|
|
return x.encode('utf-8')
|
|
|
|
return x
|
|
|
|
|
|
|
|
return Url(
|
|
|
|
scheme=to_input_type(uri_ref.scheme),
|
|
|
|
auth=to_input_type(uri_ref.userinfo),
|
|
|
|
host=to_input_type(uri_ref.host),
|
|
|
|
port=int(uri_ref.port) if uri_ref.port is not None else None,
|
|
|
|
path=to_input_type(path),
|
|
|
|
query=to_input_type(uri_ref.query),
|
|
|
|
fragment=to_input_type(uri_ref.fragment)
|
|
|
|
)
|
2018-09-17 00:27:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_host(url):
|
|
|
|
"""
|
|
|
|
Deprecated. Use :func:`parse_url` instead.
|
|
|
|
"""
|
|
|
|
p = parse_url(url)
|
|
|
|
return p.scheme or 'http', p.hostname, p.port
|