2022-01-20 18:27:56 +00:00
|
|
|
// This file Copyright © 2021-2022 Mnemosyne LLC.
|
2022-02-07 16:25:02 +00:00
|
|
|
// It may be used under GPLv2 (SPDX: GPL-2.0-only), GPLv3 (SPDX: GPL-3.0-only),
|
2022-01-20 18:27:56 +00:00
|
|
|
// or any future license endorsed by Mnemosyne LLC.
|
|
|
|
// License text can be found in the licenses/ folder.
|
2021-11-09 03:30:03 +00:00
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <array>
|
|
|
|
#include <cctype>
|
2022-08-17 16:08:36 +00:00
|
|
|
#include <cstdlib> // for strtoul()
|
2021-11-09 03:30:03 +00:00
|
|
|
#include <cstddef>
|
2021-12-29 08:28:12 +00:00
|
|
|
#include <limits>
|
2021-11-09 03:30:03 +00:00
|
|
|
#include <optional>
|
2022-01-13 02:13:58 +00:00
|
|
|
#include <string>
|
2021-11-09 03:30:03 +00:00
|
|
|
#include <string_view>
|
2022-08-17 16:08:36 +00:00
|
|
|
#include <utility>
|
2021-11-09 03:30:03 +00:00
|
|
|
|
2022-04-04 18:36:48 +00:00
|
|
|
#include <fmt/format.h>
|
|
|
|
|
2022-02-12 17:30:27 +00:00
|
|
|
#define PSL_STATIC
|
|
|
|
#include <libpsl.h>
|
|
|
|
|
2021-11-09 03:30:03 +00:00
|
|
|
#include "transmission.h"
|
|
|
|
|
|
|
|
#include "net.h"
|
2022-05-24 04:05:16 +00:00
|
|
|
#include "tr-strbuf.h"
|
2021-11-09 03:30:03 +00:00
|
|
|
#include "utils.h"
|
2022-05-24 04:05:16 +00:00
|
|
|
#include "web-utils.h"
|
2021-11-09 03:30:03 +00:00
|
|
|
|
|
|
|
using namespace std::literals;
|
|
|
|
|
|
|
|
/***
|
|
|
|
****
|
|
|
|
***/
|
|
|
|
|
2022-08-03 06:15:37 +00:00
|
|
|
bool tr_addressIsIP(char const* address)
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
2022-12-09 02:27:52 +00:00
|
|
|
return address != nullptr && tr_address::from_string(address).has_value();
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Maps an HTTP status code to its human-readable reason phrase.
// Code 0 means that no response was received at all. Any code that is
// not listed in the table yields "Unknown Error".
char const* tr_webGetResponseStr(long code)
{
    struct Reason
    {
        long status;
        char const* text;
    };

    static constexpr Reason Reasons[] = {
        { 0, "No Response" },
        { 101, "Switching Protocols" },
        { 200, "OK" },
        { 201, "Created" },
        { 202, "Accepted" },
        { 203, "Non-Authoritative Information" },
        { 204, "No Content" },
        { 205, "Reset Content" },
        { 206, "Partial Content" },
        { 300, "Multiple Choices" },
        { 301, "Moved Permanently" },
        { 302, "Found" },
        { 303, "See Other" },
        { 304, "Not Modified" },
        { 305, "Use Proxy" },
        { 306, " (Unused)" },
        { 307, "Temporary Redirect" },
        { 400, "Bad Request" },
        { 401, "Unauthorized" },
        { 402, "Payment Required" },
        { 403, "Forbidden" },
        { 404, "Not Found" },
        { 405, "Method Not Allowed" },
        { 406, "Not Acceptable" },
        { 407, "Proxy Authentication Required" },
        { 408, "Request Timeout" },
        { 409, "Conflict" },
        { 410, "Gone" },
        { 411, "Length Required" },
        { 412, "Precondition Failed" },
        { 413, "Request Entity Too Large" },
        { 414, "Request-URI Too Long" },
        { 415, "Unsupported Media Type" },
        { 416, "Requested Range Not Satisfiable" },
        { 417, "Expectation Failed" },
        { 421, "Misdirected Request" },
        { 500, "Internal Server Error" },
        { 501, "Not Implemented" },
        { 502, "Bad Gateway" },
        { 503, "Service Unavailable" },
        { 504, "Gateway Timeout" },
        { 505, "HTTP Version Not Supported" },
    };

    for (auto const& reason : Reasons)
    {
        if (reason.status == code)
        {
            return reason.text;
        }
    }

    return "Unknown Error";
}
|
|
|
|
|
|
|
|
//// URLs
|
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
2021-12-29 08:28:12 +00:00
|
|
|
auto parsePort(std::string_view port_sv)
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
2021-12-29 08:28:12 +00:00
|
|
|
auto const port = tr_parseNum<int>(port_sv);
|
2021-11-09 03:30:03 +00:00
|
|
|
|
2022-04-21 15:58:13 +00:00
|
|
|
using PortLimits = std::numeric_limits<uint16_t>;
|
|
|
|
return port && PortLimits::min() <= *port && *port <= PortLimits::max() ? *port : -1;
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the default port, as text, for a known URL scheme.
// Unknown schemes yield "-1", which parsePort() will not accept as a
// valid port.
constexpr std::string_view getPortForScheme(std::string_view scheme)
{
    if (scheme == "ftp"sv)
    {
        return "21"sv;
    }

    if (scheme == "http"sv)
    {
        return "80"sv;
    }

    if (scheme == "https"sv)
    {
        return "443"sv;
    }

    if (scheme == "sftp"sv)
    {
        return "22"sv;
    }

    if (scheme == "udp"sv)
    {
        return "80"sv;
    }

    return "-1"sv;
}
|
|
|
|
|
|
|
|
// Returns true when `url` is non-empty and consists solely of characters
// that RFC 2396 names (unreserved, reserved, delims, or unwise).
// Whitespace, control characters, and any non-ASCII byte make it invalid.
bool urlCharsAreValid(std::string_view url)
{
    // rfc2396
    auto constexpr ValidChars = std::string_view{
        "abcdefghijklmnopqrstuvwxyz" // lowalpha
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // upalpha
        "0123456789" // digit
        "-_.!~*'()" // mark
        ";/?:@&=+$," // reserved
        "<>#%\"" // delims
        "{}|\\^[]`" // unwise
    };

    // A single standard-library scan: valid iff no character falls
    // outside the allowed set.
    return !std::empty(url) && url.find_first_not_of(ValidChars) == std::string_view::npos;
}
|
|
|
|
|
|
|
|
// Returns true when `scheme` is exactly "http", "https", or "udp".
// The comparison is case-sensitive.
bool tr_isValidTrackerScheme(std::string_view scheme)
{
    return scheme == "http"sv || scheme == "https"sv || scheme == "udp"sv;
}
|
|
|
|
|
2022-05-31 02:55:05 +00:00
|
|
|
// Returns true when every byte of `host` is 7-bit ASCII and none of them
// is an uppercase letter. An empty string satisfies this trivially.
bool isAsciiNonUpperCase(std::string_view host)
{
    for (unsigned char const ch : host)
    {
        bool const is_ascii = ch < 128;

        if (!is_ascii || std::isupper(ch) != 0)
        {
            return false;
        }
    }

    return true;
}
|
|
|
|
|
2022-02-12 17:30:27 +00:00
|
|
|
// www.example.com -> example
|
|
|
|
// www.example.co.uk -> example
|
|
|
|
// 127.0.0.1 -> 127.0.0.1
|
|
|
|
std::string_view getSiteName(std::string_view host)
|
|
|
|
{
|
|
|
|
// is it empty?
|
|
|
|
if (std::empty(host))
|
|
|
|
{
|
|
|
|
return host;
|
|
|
|
}
|
|
|
|
|
|
|
|
// is it an IP?
|
2022-12-09 02:27:52 +00:00
|
|
|
if (auto const addr = tr_address::from_string(host); addr)
|
2022-02-12 17:30:27 +00:00
|
|
|
{
|
|
|
|
return host;
|
|
|
|
}
|
|
|
|
|
2022-07-25 22:25:55 +00:00
|
|
|
// psl needs a zero-terminated hostname
|
|
|
|
auto const szhost = tr_urlbuf{ host };
|
|
|
|
|
2022-02-12 17:30:27 +00:00
|
|
|
// is it a registered name?
|
2022-05-31 02:55:05 +00:00
|
|
|
if (isAsciiNonUpperCase(host))
|
2022-05-24 04:05:16 +00:00
|
|
|
{
|
|
|
|
if (char const* const top = psl_registrable_domain(psl_builtin(), std::data(szhost)); top != nullptr)
|
|
|
|
{
|
|
|
|
host.remove_prefix(top - std::data(szhost));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (char* lower = nullptr; psl_str_to_utf8lower(std::data(szhost), nullptr, nullptr, &lower) == PSL_SUCCESS)
|
2022-02-12 17:30:27 +00:00
|
|
|
{
|
|
|
|
// www.example.com -> example.com
|
2022-02-14 05:44:38 +00:00
|
|
|
if (char const* const top = psl_registrable_domain(psl_builtin(), lower); top != nullptr)
|
2022-02-12 17:30:27 +00:00
|
|
|
{
|
|
|
|
host.remove_prefix(top - lower);
|
|
|
|
}
|
2022-02-14 05:44:38 +00:00
|
|
|
|
2022-02-12 17:30:27 +00:00
|
|
|
psl_free_string(lower);
|
|
|
|
}
|
|
|
|
|
|
|
|
// example.com -> example
|
2022-02-14 05:44:38 +00:00
|
|
|
if (auto const dot_pos = host.find('.'); dot_pos != std::string_view::npos)
|
2022-02-12 17:30:27 +00:00
|
|
|
{
|
|
|
|
host = host.substr(0, dot_pos);
|
|
|
|
}
|
|
|
|
|
|
|
|
return host;
|
|
|
|
}
|
|
|
|
|
2021-11-09 03:30:03 +00:00
|
|
|
} // namespace
|
|
|
|
|
|
|
|
std::optional<tr_url_parsed_t> tr_urlParse(std::string_view url)
|
|
|
|
{
|
2021-11-10 00:13:47 +00:00
|
|
|
url = tr_strvStrip(url);
|
2021-11-09 03:30:03 +00:00
|
|
|
|
2022-01-16 15:28:18 +00:00
|
|
|
auto parsed = tr_url_parsed_t{};
|
|
|
|
parsed.full = url;
|
|
|
|
|
|
|
|
// So many magnet links are malformed, e.g. not escaping text
|
|
|
|
// in the display name, that we're better off handling magnets
|
|
|
|
// as a special case before even scanning for invalid chars.
|
2022-02-08 05:44:31 +00:00
|
|
|
if (auto constexpr MagnetStart = "magnet:?"sv; tr_strvStartsWith(url, MagnetStart))
|
2022-01-16 15:28:18 +00:00
|
|
|
{
|
|
|
|
parsed.scheme = "magnet"sv;
|
|
|
|
parsed.query = url.substr(std::size(MagnetStart));
|
|
|
|
return parsed;
|
|
|
|
}
|
|
|
|
|
2021-11-09 03:30:03 +00:00
|
|
|
if (!urlCharsAreValid(url))
|
|
|
|
{
|
2021-12-21 22:14:15 +00:00
|
|
|
return std::nullopt;
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// scheme
|
2021-11-10 00:13:47 +00:00
|
|
|
parsed.scheme = tr_strvSep(&url, ':');
|
|
|
|
if (std::empty(parsed.scheme))
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
2021-12-21 22:14:15 +00:00
|
|
|
return std::nullopt;
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// authority
|
|
|
|
// The authority component is preceded by a double slash ("//") and is
|
|
|
|
// terminated by the next slash ("/"), question mark ("?"), or number
|
|
|
|
// sign ("#") character, or by the end of the URI.
|
2021-12-17 05:47:51 +00:00
|
|
|
if (auto key = "//"sv; tr_strvStartsWith(url, key))
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
2021-11-10 00:13:47 +00:00
|
|
|
url.remove_prefix(std::size(key));
|
|
|
|
auto pos = url.find_first_of("/?#");
|
2021-11-09 03:30:03 +00:00
|
|
|
parsed.authority = url.substr(0, pos);
|
2021-12-17 05:47:51 +00:00
|
|
|
url = pos == std::string_view::npos ? ""sv : url.substr(pos);
|
2021-11-09 03:30:03 +00:00
|
|
|
|
2021-11-10 00:13:47 +00:00
|
|
|
auto remain = parsed.authority;
|
|
|
|
parsed.host = tr_strvSep(&remain, ':');
|
2022-02-12 17:30:27 +00:00
|
|
|
parsed.sitename = getSiteName(parsed.host);
|
2022-04-16 18:13:42 +00:00
|
|
|
parsed.port = parsePort(!std::empty(remain) ? remain : getPortForScheme(parsed.scheme));
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// The path is terminated by the first question mark ("?") or
|
|
|
|
// number sign ("#") character, or by the end of the URI.
|
2021-11-10 00:13:47 +00:00
|
|
|
auto pos = url.find_first_of("?#");
|
2021-11-09 03:30:03 +00:00
|
|
|
parsed.path = url.substr(0, pos);
|
2021-12-17 05:47:51 +00:00
|
|
|
url = pos == std::string_view::npos ? ""sv : url.substr(pos);
|
2021-11-09 03:30:03 +00:00
|
|
|
|
|
|
|
// query
|
2021-11-10 02:42:18 +00:00
|
|
|
if (tr_strvStartsWith(url, '?'))
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
|
|
|
url.remove_prefix(1);
|
|
|
|
pos = url.find('#');
|
|
|
|
parsed.query = url.substr(0, pos);
|
2021-12-17 05:47:51 +00:00
|
|
|
url = pos == std::string_view::npos ? ""sv : url.substr(pos);
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// fragment
|
2021-11-10 02:42:18 +00:00
|
|
|
if (tr_strvStartsWith(url, '#'))
|
2021-11-09 03:30:03 +00:00
|
|
|
{
|
|
|
|
parsed.fragment = url.substr(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
return parsed;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::optional<tr_url_parsed_t> tr_urlParseTracker(std::string_view url)
|
|
|
|
{
|
|
|
|
auto const parsed = tr_urlParse(url);
|
2021-12-21 22:14:15 +00:00
|
|
|
return parsed && tr_isValidTrackerScheme(parsed->scheme) ? std::make_optional(*parsed) : std::nullopt;
|
2021-11-09 03:30:03 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
bool tr_urlIsValidTracker(std::string_view url)
|
|
|
|
{
|
|
|
|
return !!tr_urlParseTracker(url);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool tr_urlIsValid(std::string_view url)
|
|
|
|
{
|
|
|
|
auto constexpr Schemes = std::array<std::string_view, 5>{ "http"sv, "https"sv, "ftp"sv, "sftp"sv, "udp"sv };
|
|
|
|
auto const parsed = tr_urlParse(url);
|
|
|
|
return parsed && std::find(std::begin(Schemes), std::end(Schemes), parsed->scheme) != std::end(Schemes);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Advances to the next "key=value" entry of the query string.
// The next '&'-separated token is taken off `remain`, then split at its
// first '=' into `keyval.first` and `keyval.second`.
tr_url_query_view::iterator& tr_url_query_view::iterator::operator++()
{
    auto token = tr_strvSep(&remain, '&');
    auto const key = tr_strvSep(&token, '=');

    keyval.first = key;
    keyval.second = token; // what is left of the token after the key

    return *this;
}
|
|
|
|
|
|
|
|
// Returns an iterator positioned on the first key/value pair of `query`.
// A fresh iterator holds no pair yet, so it is advanced once to load one.
tr_url_query_view::iterator tr_url_query_view::begin() const
{
    auto first = iterator{};
    first.remain = query;
    return ++first;
}
|
|
|
|
|
|
|
|
// Decodes "%XX" escape sequences in `in`, where XX is two hex digits,
// and returns the decoded bytes. A '%' that is not followed by two hex
// digits is copied through unchanged.
std::string tr_urlPercentDecode(std::string_view in)
{
    // Value of a hex digit, or -1 if `ch` is not one. Done by hand so that
    // bytes >= 0x80 are never handed to a <cctype> function as a negative
    // char, which would be undefined behavior.
    auto const hex_value = [](char ch) -> int
    {
        if (ch >= '0' && ch <= '9')
        {
            return ch - '0';
        }

        if (ch >= 'a' && ch <= 'f')
        {
            return ch - 'a' + 10;
        }

        if (ch >= 'A' && ch <= 'F')
        {
            return ch - 'A' + 10;
        }

        return -1;
    };

    auto out = std::string{};
    out.reserve(std::size(in));

    for (;;)
    {
        // copy everything up to the next '%' verbatim
        auto const pos = in.find('%');
        out += in.substr(0, pos);
        if (pos == std::string_view::npos)
        {
            break;
        }

        in.remove_prefix(pos); // now in[0] == '%'

        if (std::size(in) >= 3)
        {
            auto const hi = hex_value(in[1]);
            auto const lo = hex_value(in[2]);

            if (hi >= 0 && lo >= 0)
            {
                out += static_cast<char>((hi << 4) | lo);
                in.remove_prefix(3);
                continue;
            }
        }

        // not a valid escape: keep the '%' itself and move on
        out += '%';
        in.remove_prefix(1);
    }

    return out;
}
|