bazarr/libs/tld/tests/test_core.py

# -*- coding: utf-8 -*-
import copy
import logging
import unittest
from os.path import abspath, join
from tempfile import gettempdir
from typing import Optional, Type
from urllib.parse import SplitResult, urlsplit
from faker import Faker  # type: ignore
from .. import defaults
from ..base import BaseTLDSourceParser, Registry
from ..conf import get_setting, reset_settings, set_setting
from ..exceptions import (
TldBadUrl,
TldDomainNotFound,
TldImproperlyConfigured,
TldIOError,
)
from ..helpers import project_dir
from ..utils import (
BaseMozillaTLDSourceParser,
MozillaTLDSourceParser,
get_fld,
get_tld,
get_tld_names,
get_tld_names_container,
is_tld,
parse_tld,
reset_tld_names,
update_tld_names,
update_tld_names_cli,
)
from .base import internet_available_only, log_info
__author__ = "Artur Barseghyan"
__copyright__ = "2013-2023 Artur Barseghyan"
__license__ = "MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later"
__all__ = ("TestCore",)
LOGGER = logging.getLogger(__name__)
class TestCore(unittest.TestCase):
"""Core tld functionality tests."""
@classmethod
def setUpClass(cls):
cls.faker = Faker()
cls.temp_dir = gettempdir()
def setUp(self):
"""Set up."""
self.good_patterns = [
{
"url": "http://www.google.co.uk",
"fld": "google.co.uk",
"subdomain": "www",
"domain": "google",
"suffix": "co.uk",
"tld": "co.uk",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.v2.google.co.uk",
"fld": "google.co.uk",
"subdomain": "www.v2",
"domain": "google",
"suffix": "co.uk",
"tld": "co.uk",
"kwargs": {"fail_silently": True},
},
# No longer valid
# {
# 'url': 'http://www.me.congresodelalengua3.ar',
# 'tld': 'me.congresodelalengua3.ar',
# 'subdomain': 'www',
# 'domain': 'me',
# 'suffix': 'congresodelalengua3.ar',
# },
{
"url": "http://хром.гугл.рф",
"fld": "гугл.рф",
"subdomain": "хром",
"domain": "гугл",
"suffix": "рф",
"tld": "рф",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.google.co.uk:8001/lorem-ipsum/",
"fld": "google.co.uk",
"subdomain": "www",
"domain": "google",
"suffix": "co.uk",
"tld": "co.uk",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.me.cloudfront.net",
"fld": "me.cloudfront.net",
"subdomain": "www",
"domain": "me",
"suffix": "cloudfront.net",
"tld": "cloudfront.net",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.v2.forum.tech.google.co.uk:8001/"
"lorem-ipsum/",
"fld": "google.co.uk",
"subdomain": "www.v2.forum.tech",
"domain": "google",
"suffix": "co.uk",
"tld": "co.uk",
"kwargs": {"fail_silently": True},
},
{
"url": "https://pantheon.io/",
"fld": "pantheon.io",
"subdomain": "",
"domain": "pantheon",
"suffix": "io",
"tld": "io",
"kwargs": {"fail_silently": True},
},
{
"url": "v2.www.google.com",
"fld": "google.com",
"subdomain": "v2.www",
"domain": "google",
"suffix": "com",
"tld": "com",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "//v2.www.google.com",
"fld": "google.com",
"subdomain": "v2.www",
"domain": "google",
"suffix": "com",
"tld": "com",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "http://foo@bar.com",
"fld": "bar.com",
"subdomain": "",
"domain": "bar",
"suffix": "com",
"tld": "com",
"kwargs": {"fail_silently": True},
},
{
"url": "http://user:foo@bar.com",
"fld": "bar.com",
"subdomain": "",
"domain": "bar",
"suffix": "com",
"tld": "com",
"kwargs": {"fail_silently": True},
},
{
"url": "https://faguoren.xn--fiqs8s",
"fld": "faguoren.xn--fiqs8s",
"subdomain": "",
"domain": "faguoren",
"suffix": "xn--fiqs8s",
"tld": "xn--fiqs8s",
"kwargs": {"fail_silently": True},
},
{
"url": "blogs.lemonde.paris",
"fld": "lemonde.paris",
"subdomain": "blogs",
"domain": "lemonde",
"suffix": "paris",
"tld": "paris",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "axel.brighton.ac.uk",
"fld": "brighton.ac.uk",
"subdomain": "axel",
"domain": "brighton",
"suffix": "ac.uk",
"tld": "ac.uk",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "m.fr.blogspot.com.au",
"fld": "fr.blogspot.com.au",
"subdomain": "m",
"domain": "fr",
"suffix": "blogspot.com.au",
"tld": "blogspot.com.au",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "help.www.福岡.jp",
"fld": "www.福岡.jp",
"subdomain": "help",
"domain": "www",
"suffix": "福岡.jp",
"tld": "福岡.jp",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "syria.arabic.variant.سوريا",
"fld": "variant.سوريا",
"subdomain": "syria.arabic",
"domain": "variant",
"suffix": "سوريا",
"tld": "سوريا",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": "http://www.help.kawasaki.jp",
"fld": "www.help.kawasaki.jp",
"subdomain": "",
"domain": "www",
"suffix": "help.kawasaki.jp",
"tld": "help.kawasaki.jp",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.city.kawasaki.jp",
"fld": "city.kawasaki.jp",
"subdomain": "www",
"domain": "city",
"suffix": "kawasaki.jp",
"tld": "kawasaki.jp",
"kwargs": {"fail_silently": True},
},
{
"url": "http://fedoraproject.org",
"fld": "fedoraproject.org",
"subdomain": "",
"domain": "fedoraproject",
"suffix": "org",
"tld": "org",
"kwargs": {"fail_silently": True},
},
{
"url": "http://www.cloud.fedoraproject.org",
"fld": "www.cloud.fedoraproject.org",
"subdomain": "",
"domain": "www",
"suffix": "cloud.fedoraproject.org",
"tld": "cloud.fedoraproject.org",
"kwargs": {"fail_silently": True},
},
{
"url": "https://www.john.app.os.fedoraproject.org",
"fld": "john.app.os.fedoraproject.org",
"subdomain": "www",
"domain": "john",
"suffix": "app.os.fedoraproject.org",
"tld": "app.os.fedoraproject.org",
"kwargs": {"fail_silently": True},
},
{
"url": "ftp://www.xn--mxail5aa.xn--11b4c3d",
"fld": "xn--mxail5aa.xn--11b4c3d",
"subdomain": "www",
"domain": "xn--mxail5aa",
"suffix": "xn--11b4c3d",
"tld": "xn--11b4c3d",
"kwargs": {"fail_silently": True},
},
{
"url": "http://cloud.fedoraproject.org",
"fld": "cloud.fedoraproject.org",
"subdomain": "",
"domain": "cloud.fedoraproject.org",
"suffix": "cloud.fedoraproject.org",
"tld": "cloud.fedoraproject.org",
"kwargs": {"fail_silently": True},
},
{
"url": "github.io",
"fld": "github.io",
"subdomain": "",
"domain": "github.io",
"suffix": "github.io",
"tld": "github.io",
"kwargs": {"fail_silently": True, "fix_protocol": True},
},
{
"url": urlsplit("http://lemonde.fr/article.html"),
"fld": "lemonde.fr",
"subdomain": "",
"domain": "lemonde",
"suffix": "fr",
"tld": "fr",
"kwargs": {"fail_silently": True},
},
{
"url": "https://github.com....../barseghyanartur/tld/",
"fld": "github.com",
"subdomain": "",
"domain": "github",
"suffix": "com",
"tld": "com",
"kwargs": {"fail_silently": True},
},
]
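        # URLs that cannot be resolved to a TLD, mapped to the exception
        # expected when `fail_silently` is False.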
self.bad_patterns = {
"v2.www.google.com": {
"exception": TldBadUrl,
},
"/index.php?a=1&b=2": {
"exception": TldBadUrl,
},
"http://www.tld.doesnotexist": {
"exception": TldDomainNotFound,
},
"https://2001:0db8:0000:85a3:0000:0000:ac1f:8001": {
"exception": TldDomainNotFound,
},
"http://192.169.1.1": {
"exception": TldDomainNotFound,
},
"http://localhost:8080": {
"exception": TldDomainNotFound,
},
"https://localhost": {
"exception": TldDomainNotFound,
},
"https://localhost2": {
"exception": TldImproperlyConfigured,
"kwargs": {"search_public": False, "search_private": False},
},
}
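        # Values that `is_tld` must reject: IP addresses, bare hostnames
        # and registrable domains are not valid TLDs.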
self.invalid_tlds = {
"v2.www.google.com",
"tld.doesnotexist",
"2001:0db8:0000:85a3:0000:0000:ac1f",
"192.169.1.1",
"localhost",
"google.com",
}
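        # Fixture suffix list used by the custom-parser tests below (it
        # defines the made-up `foreverchild` suffix).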
self.tld_names_local_path_custom = project_dir(
join("tests", "res", "effective_tld_names_custom.dat.txt")
)
self.good_patterns_custom_parser = [
{
"url": "http://www.foreverchild",
"fld": "www.foreverchild",
"subdomain": "",
"domain": "www",
"suffix": "foreverchild",
"tld": "foreverchild",
"kwargs": {
"fail_silently": True,
# 'parser_class': self.get_custom_parser_class(),
},
},
{
"url": "http://www.v2.foreverchild",
"fld": "v2.foreverchild",
"subdomain": "www",
"domain": "v2",
"suffix": "foreverchild",
"tld": "foreverchild",
"kwargs": {
"fail_silently": True,
# 'parser_class': self.get_custom_parser_class(),
},
},
]
reset_settings()
def tearDown(self):
"""Tear down."""
reset_settings()
Registry.reset()
@property
def good_url(self):
return self.good_patterns[0]["url"]
@property
def bad_url(self):
return list(self.bad_patterns.keys())[0]
def get_custom_parser_class(
self,
uid: str = "custom_mozilla",
        source_url: Optional[str] = None,
local_path: str = "tests/res/effective_tld_names_custom.dat.txt",
) -> Type[BaseTLDSourceParser]:
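        """Return a throw-away subclass of `BaseMozillaTLDSourceParser`
        bound to the given `uid`, `source_url` and `local_path`.
        """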
# Define a custom TLD source parser class
parser_class = type(
"CustomMozillaTLDSourceParser",
(BaseMozillaTLDSourceParser,),
{
"uid": uid,
"source_url": source_url,
"local_path": local_path,
},
)
return parser_class
@log_info
def test_0_tld_names_loaded(self):
"""Test if tld names are loaded."""
get_fld("http://www.google.co.uk")
from ..utils import tld_names
res = len(tld_names) > 0
self.assertTrue(res)
return res
@internet_available_only
@log_info
def test_1_update_tld_names(self):
"""Test updating the tld names (re-fetch mozilla source)."""
res = update_tld_names(fail_silently=False)
self.assertTrue(res)
return res
@log_info
def test_2_fld_good_patterns_pass(self):
"""Test good URL patterns."""
res = []
for data in self.good_patterns:
_res = get_fld(data["url"], **data["kwargs"])
self.assertEqual(_res, data["fld"])
res.append(_res)
return res
@log_info
def test_3_fld_bad_patterns_pass(self):
"""Test bad URL patterns."""
res = []
for url, params in self.bad_patterns.items():
_res = get_fld(url, fail_silently=True)
self.assertEqual(_res, None)
res.append(_res)
return res
@log_info
def test_4_override_settings(self):
"""Testing settings override."""
def override_settings():
"""Override settings."""
return get_setting("DEBUG")
self.assertEqual(defaults.DEBUG, override_settings())
set_setting("DEBUG", True)
self.assertEqual(True, override_settings())
return override_settings()
@log_info
def test_5_tld_good_patterns_pass_parsed_object(self):
"""Test good URL patterns."""
res = []
for data in self.good_patterns:
kwargs = copy.copy(data["kwargs"])
kwargs["as_object"] = True
_res = get_tld(data["url"], **kwargs)
self.assertEqual(_res.tld, data["tld"])
self.assertEqual(_res.subdomain, data["subdomain"])
self.assertEqual(_res.domain, data["domain"])
self.assertEqual(_res.suffix, data["suffix"])
self.assertEqual(_res.fld, data["fld"])
self.assertEqual(
str(_res).encode("utf8"), data["tld"].encode("utf8")
)
self.assertEqual(
_res.__dict__,
{
"tld": _res.tld,
"domain": _res.domain,
"subdomain": _res.subdomain,
"fld": _res.fld,
"parsed_url": _res.parsed_url,
},
)
res.append(_res)
return res
@log_info
def test_6_override_full_names_path(self):
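        """Test overriding `NAMES_LOCAL_PATH_PARENT` for `project_dir`."""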
default = project_dir("dummy.txt")
override_base = "/tmp/test"
set_setting("NAMES_LOCAL_PATH_PARENT", override_base)
modified = project_dir("dummy.txt")
self.assertNotEqual(default, modified)
self.assertEqual(modified, abspath("/tmp/test/dummy.txt"))
@log_info
def test_7_public_private(self):
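        """Test the `search_public` and `search_private` switches."""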
res = get_fld(
"http://silly.cc.ua",
fail_silently=True,
search_private=False,
parser_class=MozillaTLDSourceParser,
)
self.assertEqual(res, None)
res = get_fld(
"http://silly.cc.ua", fail_silently=True, search_private=False
)
self.assertEqual(res, "cc.ua")
res = get_fld(
"http://silly.cc.ua", fail_silently=True, search_private=True
)
self.assertEqual(res, "silly.cc.ua")
res = get_fld(
"mercy.compute.amazonaws.com",
fail_silently=True,
search_private=False,
fix_protocol=True,
parser_class=MozillaTLDSourceParser,
)
self.assertEqual(res, None)
res = get_fld(
"mercy.compute.amazonaws.com",
fail_silently=True,
search_private=False,
fix_protocol=True,
)
self.assertEqual(res, "amazonaws.com")
res = get_fld(
"http://whatever.com", fail_silently=True, search_public=False
)
self.assertEqual(res, None)
@log_info
def test_8_fld_bad_patterns_exceptions(self):
"""Test exceptions."""
res = []
for url, params in self.bad_patterns.items():
kwargs = params["kwargs"] if "kwargs" in params else {}
kwargs["fail_silently"] = False
with self.assertRaises(params["exception"]):
_res = get_fld(url, **kwargs)
res.append(_res)
return res
@log_info
def test_9_tld_good_patterns_pass(self):
"""Test `get_tld` good URL patterns."""
res = []
for data in self.good_patterns:
_res = get_tld(data["url"], **data["kwargs"])
self.assertEqual(_res, data["tld"])
res.append(_res)
return res
@log_info
def test_10_tld_bad_patterns_pass(self):
"""Test `get_tld` bad URL patterns."""
res = []
for url, params in self.bad_patterns.items():
_res = get_tld(url, fail_silently=True)
self.assertEqual(_res, None)
res.append(_res)
return res
@log_info
def test_11_parse_tld_good_patterns(self):
"""Test `parse_tld` good URL patterns."""
res = []
for data in self.good_patterns:
_res = parse_tld(data["url"], **data["kwargs"])
self.assertEqual(
_res, (data["tld"], data["domain"], data["subdomain"])
)
res.append(_res)
return res
@log_info
def test_12_is_tld_good_patterns(self):
"""Test `is_tld` good URL patterns."""
for data in self.good_patterns:
self.assertTrue(is_tld(data["tld"]))
@log_info
def test_13_is_tld_bad_patterns(self):
"""Test `is_tld` bad URL patterns."""
for _tld in self.invalid_tlds:
self.assertFalse(is_tld(_tld))
@log_info
def test_14_fail_update_tld_names(self):
"""Test fail `update_tld_names`."""
parser_class = self.get_custom_parser_class(
uid="custom_mozilla_2", source_url="i-do-not-exist"
)
# Assert raise TldIOError on wrong NAMES_SOURCE_URL
with self.assertRaises(TldIOError):
update_tld_names(fail_silently=False, parser_uid=parser_class.uid)
# Assert return False on wrong NAMES_SOURCE_URL
self.assertFalse(
update_tld_names(fail_silently=True, parser_uid=parser_class.uid)
)
@log_info
def test_15_fail_get_tld_names(self):
"""Test fail `update_tld_names`."""
parser_class = self.get_custom_parser_class(
uid="custom_mozilla_3",
source_url="i-do-not-exist",
local_path="/srv/tests/res/effective_tld_names_custom_3.dat.txt",
)
reset_tld_names()
# Assert raise TldIOError on wrong NAMES_SOURCE_URL
for params in self.good_patterns:
kwargs = {"url": params["url"]}
kwargs.update(params["kwargs"])
kwargs["fail_silently"] = False
kwargs["parser_class"] = parser_class
with self.assertRaises(TldIOError):
get_tld(**kwargs)
@log_info
def test_16_fail_get_fld_wrong_kwargs(self):
"""Test fail `get_fld` with wrong kwargs."""
with self.assertRaises(TldImproperlyConfigured):
get_fld(self.good_url, as_object=True)
@log_info
def test_17_fail_parse_tld(self):
"""Test fail `parse_tld`.
Assert raise TldIOError on wrong `NAMES_SOURCE_URL` for `parse_tld`.
"""
parser_class = self.get_custom_parser_class(source_url="i-do-not-exist")
parsed_tld = parse_tld(
self.bad_url, fail_silently=False, parser_class=parser_class
)
self.assertEqual(parsed_tld, (None, None, None))
@log_info
def test_18_get_tld_names_and_reset_tld_names(self):
"""Test fail `get_tld_names` and repair using `reset_tld_names`."""
tmp_filename = join(gettempdir(), f"{self.faker.uuid4()}.dat.txt")
parser_class = self.get_custom_parser_class(
source_url="i-do-not-exist", local_path=tmp_filename
)
reset_tld_names()
with self.subTest("Assert raise TldIOError"):
# Assert raise TldIOError on wrong NAMES_SOURCE_URL for
# `get_tld_names`
with self.assertRaises(TldIOError):
get_tld_names(fail_silently=False, parser_class=parser_class)
tmp_filename = join(gettempdir(), f"{self.faker.uuid4()}.dat.txt")
parser_class_2 = self.get_custom_parser_class(
source_url="i-do-not-exist-2", local_path=tmp_filename
)
reset_tld_names()
with self.subTest("Assert get None"):
# Assert get None on wrong `NAMES_SOURCE_URL` for `get_tld_names`
self.assertIsNone(
get_tld_names(fail_silently=True, parser_class=parser_class_2)
)
@internet_available_only
@log_info
def test_19_update_tld_names_cli(self):
"""Test the return code of the CLI version of `update_tld_names`."""
reset_tld_names()
res = update_tld_names_cli()
self.assertEqual(res, 0)
@log_info
def test_20_parse_tld_custom_tld_names_good_patterns(self):
"""Test `parse_tld` good URL patterns for custom tld names."""
res = []
for data in self.good_patterns_custom_parser:
kwargs = copy.copy(data["kwargs"])
kwargs["parser_class"] = self.get_custom_parser_class()
_res = parse_tld(data["url"], **kwargs)
self.assertEqual(
_res, (data["tld"], data["domain"], data["subdomain"])
)
res.append(_res)
return res
@log_info
def test_21_tld_custom_tld_names_good_patterns_pass_parsed_object(self):
"""Test `get_tld` good URL patterns for custom tld names."""
res = []
for data in self.good_patterns_custom_parser:
kwargs = copy.copy(data["kwargs"])
kwargs.update(
{
"as_object": True,
"parser_class": self.get_custom_parser_class(),
}
)
_res = get_tld(data["url"], **kwargs)
self.assertEqual(_res.tld, data["tld"])
self.assertEqual(_res.subdomain, data["subdomain"])
self.assertEqual(_res.domain, data["domain"])
self.assertEqual(_res.suffix, data["suffix"])
self.assertEqual(_res.fld, data["fld"])
self.assertEqual(
str(_res).encode("utf8"), data["tld"].encode("utf8")
)
self.assertEqual(
_res.__dict__,
{
"tld": _res.tld,
"domain": _res.domain,
"subdomain": _res.subdomain,
"fld": _res.fld,
"parsed_url": _res.parsed_url,
},
)
res.append(_res)
return res
@log_info
def test_22_reset_tld_names_for_custom_parser(self):
"""Test `reset_tld_names` for `tld_names_local_path`."""
res = []
parser_class = self.get_custom_parser_class()
for data in self.good_patterns_custom_parser:
kwargs = copy.copy(data["kwargs"])
kwargs.update(
{
"as_object": True,
"parser_class": self.get_custom_parser_class(),
}
)
_res = get_tld(data["url"], **kwargs)
self.assertEqual(_res.tld, data["tld"])
self.assertEqual(_res.subdomain, data["subdomain"])
self.assertEqual(_res.domain, data["domain"])
self.assertEqual(_res.suffix, data["suffix"])
self.assertEqual(_res.fld, data["fld"])
self.assertEqual(
str(_res).encode("utf8"), data["tld"].encode("utf8")
)
self.assertEqual(
_res.__dict__,
{
"tld": _res.tld,
"domain": _res.domain,
"subdomain": _res.subdomain,
"fld": _res.fld,
"parsed_url": _res.parsed_url,
},
)
res.append(_res)
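        # Names fetched through the custom parser are registered under its
        # `local_path`; resetting by that key must remove only that entry.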
tld_names = get_tld_names_container()
self.assertIn(parser_class.local_path, tld_names)
reset_tld_names(parser_class.local_path)
self.assertNotIn(parser_class.local_path, tld_names)
return res
@log_info
def test_23_fail_define_custom_parser_class_without_uid(self):
"""Test fail define custom parser class without `uid`."""
class CustomParser(BaseTLDSourceParser):
pass
class AnotherCustomParser(BaseTLDSourceParser):
uid = "another-custom-parser"
# Assert raise TldImproperlyConfigured
with self.assertRaises(TldImproperlyConfigured):
CustomParser.get_tld_names()
# Assert raise NotImplementedError
with self.assertRaises(NotImplementedError):
AnotherCustomParser.get_tld_names()
@log_info
def test_24_len_trie_nodes(self):
"""Test len of the trie nodes."""
get_tld("http://delusionalinsanity.com")
tld_names = get_tld_names_container()
self.assertGreater(len(tld_names[MozillaTLDSourceParser.local_path]), 0)
@log_info
def test_25_get_tld_names_no_arguments(self):
"""Test len of the trie nodes."""
tld_names = get_tld_names()
self.assertGreater(len(tld_names), 0)
@log_info
def test_26_case(self):
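        """Test case-insensitivity: domain parts are lower-cased, while
        `parsed_url` keeps the original casing.
        """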
res = get_tld(
"https://MyDomain.com/AsDrFt?QUeRY=12aA",
fail_silently=True,
search_private=False,
as_object=True,
)
self.assertEqual(res.tld, "com")
self.assertEqual(res.domain, "mydomain")
self.assertEqual(res.subdomain, "")
self.assertEqual(res.fld, "mydomain.com")
self.assertEqual(
res.parsed_url,
SplitResult(
scheme="https",
netloc="MyDomain.com",
path="/AsDrFt",
query="QUeRY=12aA",
fragment="",
),
)
@log_info
def test_27_tld_fail_silently_pass(self):
"""Test `get_tld` bad URL patterns that would raise exception
if `fail_silently` isn't `True`.
"""
res = []
bad_url = ["https://user:password[@host.com", "https://user[@host.com"]
for url in bad_url:
_res = get_tld(url, fail_silently=True)
self.assertEqual(_res, None)
res.append(_res)
return res