mirror of https://github.com/morpheus65535/bazarr
302 lines
10 KiB
Python
302 lines
10 KiB
Python
"""pycountry"""
|
|
|
|
import os.path
|
|
import unicodedata
|
|
from importlib import metadata as importlib_metadata
|
|
from typing import Dict, List, Optional, Type
|
|
|
|
import pycountry.db
|
|
|
|
# We prioritise importing the backported `importlib_resources`
|
|
# because the function we use (`importlib.resources.files`) is only
|
|
# available from Python 3.9, but the module itself exists since 3.7.
|
|
# We install `importlib_resources` on Python < 3.9.
|
|
# TODO: Remove usage of importlib_resources once support for Python 3.8 is dropped
|
|
try:
|
|
import importlib_resources # type: ignore
|
|
except ModuleNotFoundError:
|
|
from importlib import resources as importlib_resources
|
|
|
|
|
|
def resource_filename(package_or_requirement: str, resource_name: str) -> str:
    """Return the path of a resource that lives inside a package.

    Args:
        package_or_requirement: Importable package name that is handed to
            ``importlib_resources.files``.
        resource_name: Name of the resource, joined onto the package root.

    Returns:
        The joined location converted to ``str``.
    """
    package_root = importlib_resources.files(package_or_requirement)
    resource_path = package_root / resource_name
    return str(resource_path)
|
|
|
|
|
|
def get_version(distribution_name: str) -> Optional[str]:
    """Look up the installed version of a distribution.

    Args:
        distribution_name: Name of the distribution to query through
            ``importlib.metadata``.

    Returns:
        The version string, or the literal ``"n/a"`` when no metadata for
        the distribution can be found.
    """
    try:
        installed_version = importlib_metadata.version(distribution_name)
    except importlib_metadata.PackageNotFoundError:
        # No metadata available for this distribution name.
        return "n/a"
    return installed_version
|
|
|
|
|
|
# Locations of the bundled package data and the installed package version,
# all resolved once when the module is imported.
LOCALES_DIR: str = resource_filename("pycountry", "locales")
DATABASE_DIR: str = resource_filename("pycountry", "databases")
__version__: Optional[str] = get_version("pycountry")
|
|
|
|
|
|
def remove_accents(input_str: str) -> str:
    """Return ``input_str`` with combining marks (accents) removed.

    The text is decomposed with Unicode NFKD normalisation and every
    combining character is dropped. Pure-ASCII input is returned unchanged
    without being normalised.

    Args:
        input_str: Text to strip of accents.

    Returns:
        The accent-free text.
    """
    # ASCII text cannot carry combining marks, so skip the normalisation.
    if input_str.isascii():
        return input_str

    # Borrowed from https://stackoverflow.com/a/517974/1509718
    decomposed = unicodedata.normalize("NFKD", input_str)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
|
|
|
|
|
|
class ExistingCountries(pycountry.db.Database):
    """Provides access to an ISO 3166 database (Countries)."""

    data_class = pycountry.db.Country
    root_key = "3166-1"

    def search_fuzzy(self, query: str) -> List[pycountry.db.Country]:
        """Find countries matching ``query``, best match first.

        The query is stripped, lower-cased and freed of accents, then scored
        against four sources; the points of all sources are summed per
        country (keyed by its alpha-2 code):

        1. exact lookup on this database (50 points),
        2. exact matches on subdivision names (49 points each),
        3. partial matches on a country's ``name``, ``official_name`` or
           ``comment`` (5-30 points, higher for a match early in the text),
        4. partial matches on subdivision names (1-5 points each).

        Args:
            query: Free-text search term.

        Returns:
            The matching country records, sorted by descending score and
            then by ascending alpha-2 code.

        Raises:
            LookupError: If nothing matched the query.
        """
        query = remove_accents(query.strip().lower())

        # A country-code to points mapping for later sorting countries
        # based on the query's matching incidence.
        results: Dict[str, int] = {}

        def add_result(country: "pycountry.db.Country", points: int) -> None:
            results[country.alpha_2] = results.get(country.alpha_2, 0) + points

        # Prio 1: exact matches on country names
        try:
            add_result(self.lookup(query), 50)
        except LookupError:
            # No exact match; the remaining strategies may still score.
            pass

        # Prio 2: exact matches on subdivision names
        for candidate in subdivisions.match(query):
            add_result(candidate.country, 49)

        # Prio 3: partial matches on country names
        for candidate in self:
            # Higher priority for a match on the common name
            for v in (
                candidate._fields.get("name"),
                candidate._fields.get("official_name"),
                candidate._fields.get("comment"),
            ):
                if v is None:
                    continue
                v = remove_accents(v.lower())
                if query in v:
                    # This prefers countries with a match early in their name
                    # and also balances against countries with a number of
                    # partial matches and their name containing 'new' in the
                    # middle
                    add_result(candidate, max(5, 30 - (2 * v.find(query))))
                    # Only the first matching field of a country scores.
                    break

        # Prio 4: partial matches on subdivision names
        for candidate in subdivisions.partial_match(query):
            v = candidate._fields.get("name")
            if v is None:
                continue
            v = remove_accents(v.lower())
            if query in v:
                add_result(candidate.country, max(1, 5 - v.find(query)))

        if not results:
            raise LookupError(query)

        # Sort by points first, by alpha2 code second. Negating the points
        # sorts them in descending order while the country code stays
        # ascending, which keeps the result order stable.
        ranked = sorted(results.items(), key=lambda item: (-item[1], item[0]))
        return [self.get(alpha_2=alpha_2) for alpha_2, _points in ranked]
|
|
|
|
|
|
class HistoricCountries(ExistingCountries):
    """Provides access to an ISO 3166-3 database.

    ISO 3166-3 covers countries that have been removed from the standard.
    """

    root_key = "3166-3"
    data_class = pycountry.db.Country
|
|
|
|
|
|
class Scripts(pycountry.db.Database):
    """Provides access to an ISO 15924 database (Scripts)."""

    root_key = "15924"
    data_class = "Script"
|
|
|
|
|
|
class Currencies(pycountry.db.Database):
    """Provides access to an ISO 4217 database (Currencies)."""

    root_key = "4217"
    data_class = "Currency"
|
|
|
|
|
|
class Languages(pycountry.db.Database):
    """Provides access to an ISO 639-1/2T/3 database (Languages)."""

    root_key = "639-3"
    data_class = "Language"

    # Field names listed for the base class as not to be indexed.
    no_index = ["status", "scope", "type", "inverted_name", "common_name"]
|
|
|
|
|
|
class LanguageFamilies(pycountry.db.Database):
    """Provides access to an ISO 639-5 database.

    ISO 639-5 covers language families and groups.
    """

    root_key = "639-5"
    data_class = "LanguageFamily"
|
|
|
|
|
|
class SubdivisionHierarchy(pycountry.db.Data):
    """Subdivision record that knows its country and its parent subdivision.

    On construction the raw ``parent`` value is copied to ``parent_code``,
    ``country_code`` is derived from the part of ``code`` before the first
    ``"-"``, and a non-``None`` ``parent_code`` is prefixed with that
    country code.
    """

    def __init__(self, **kw):
        # Always provide "parent_code"; it is None when no parent was given.
        kw["parent_code"] = kw.get("parent")
        super().__init__(**kw)
        self.country_code = self.code.split("-", 1)[0]
        if self.parent_code is None:
            return
        # Qualify the parent's code with the country code.
        self.parent_code = f"{self.country_code}-{self.parent_code}"

    @property
    def country(self):
        """The entry of ``countries`` whose alpha-2 code is ``country_code``."""
        return countries.get(alpha_2=self.country_code)

    @property
    def parent(self):
        """The parent entry from ``subdivisions``, or ``None`` without one."""
        if self.parent_code:
            return subdivisions.get(code=self.parent_code)
        return None
|
|
|
|
|
|
class Subdivisions(pycountry.db.Database):
    """Provides access to an ISO 3166-2 database (Subdivisions)."""

    # Note: subdivisions can be hierarchical to other subdivisions. The
    # parent_code attribute is related to other subdivisions, *not*
    # the country!

    data_class = SubdivisionHierarchy
    no_index = ["name", "parent_code", "parent", "type"]
    root_key = "3166-2"

    def _load(self, *args, **kw):
        """Load the database, then add an index keyed by country code."""
        super()._load(*args, **kw)

        # Add index for the country code.
        self.indices["country_code"] = {}
        for subdivision in self:
            divs = self.indices["country_code"].setdefault(
                subdivision.country_code.lower(), set()
            )
            divs.add(subdivision)

    def get(self, **kw):
        """Look up subdivisions, returning ``[]`` for a known country without any.

        Behaves like the base class ``get`` (with ``default`` falling back
        to ``None``), except for one case: if the lookup yields the default,
        a ``country_code`` keyword was given and ``countries`` knows that
        alpha-2 code, an empty list is returned.
        """
        default = kw.setdefault("default", None)
        found = super().get(**kw)
        if found is default and "country_code" in kw:
            # This handles the case where we know about a country but there
            # are no subdivisions: we return an empty list in this case
            # (sticking to the expected type here) instead of None.
            if countries.get(alpha_2=kw["country_code"]) is not None:
                return []
        return found

    def match(self, query):
        """Return the subdivisions of this database that match ``query`` exactly.

        The query and every non-``None`` field value are lower-cased and
        freed of accents. A field matches when any of its ``";"``-separated
        alternatives equals the query. Each subdivision is reported at most
        once, even if several of its fields match.

        Args:
            query: Free-text search term.

        Returns:
            A list of matching subdivision records (possibly empty).
        """
        query = remove_accents(query.strip().lower())
        matching_candidates = []
        for candidate in self:
            for v in candidate._fields.values():
                if v is None:
                    continue
                # Some names include alternative versions which we want to
                # match exactly.
                if query in remove_accents(v.lower()).split(";"):
                    matching_candidates.append(candidate)
                    # One hit is enough; move on to the next subdivision.
                    break

        return matching_candidates

    def partial_match(self, query):
        """Return the subdivisions of this database whose name contains ``query``.

        Both the query and the ``name`` field are lower-cased and freed of
        accents before the substring test. Records without a ``name`` are
        skipped.

        Args:
            query: Free-text search term.

        Returns:
            A list of matching subdivision records (possibly empty).
        """
        query = remove_accents(query.strip().lower())
        matching_candidates = []
        for candidate in self:
            name = candidate._fields.get("name")
            if name is None:
                continue
            if query in remove_accents(name.lower()):
                matching_candidates.append(candidate)

        return matching_candidates

    def search_fuzzy(self, query: str) -> List[SubdivisionHierarchy]:
        """Find subdivisions matching ``query``, best match first.

        Exact matches (see ``match``) score 50 points, partial name matches
        (see ``partial_match``) score 1-5 points, higher for a match early
        in the name. Points are summed per subdivision code.

        Args:
            query: Free-text search term.

        Returns:
            The matching subdivision records, sorted by descending score and
            then by ascending subdivision code.

        Raises:
            LookupError: If nothing matched the query.
        """
        query = remove_accents(query.strip().lower())

        # A Subdivision's code to points mapping for later sorting subdivisions
        # based on the query's matching incidence.
        results: Dict[str, int] = {}

        def add_result(subdivision: SubdivisionHierarchy, points: int) -> None:
            results[subdivision.code] = results.get(subdivision.code, 0) + points

        # Prio 1: exact matches on subdivision names
        for candidate in self.match(query):
            add_result(candidate, 50)

        # Prio 2: partial matches on subdivision names
        for candidate in self.partial_match(query):
            v = candidate._fields.get("name")
            if v is None:
                continue
            v = remove_accents(v.lower())
            if query in v:
                add_result(candidate, max(1, 5 - v.find(query)))

        if not results:
            raise LookupError(query)

        # Sort by points first, by subdivision code second. Negating the
        # points sorts them in descending order while the code stays
        # ascending, which keeps the result order stable.
        ranked = sorted(results.items(), key=lambda item: (-item[1], item[0]))
        return [self.get(code=code) for code, _points in ranked]
|
|
|
|
|
|
# Module-level database instances, one per JSON file in DATABASE_DIR.

# ISO 3166: countries, their subdivisions, and removed countries.
countries: ExistingCountries = ExistingCountries(
    os.path.join(DATABASE_DIR, "iso3166-1.json")
)
subdivisions: Subdivisions = Subdivisions(
    os.path.join(DATABASE_DIR, "iso3166-2.json")
)
historic_countries: HistoricCountries = HistoricCountries(
    os.path.join(DATABASE_DIR, "iso3166-3.json")
)

# ISO 4217: currencies.
currencies: Currencies = Currencies(
    os.path.join(DATABASE_DIR, "iso4217.json")
)

# ISO 639: languages and language families.
languages: Languages = Languages(
    os.path.join(DATABASE_DIR, "iso639-3.json")
)
language_families: LanguageFamilies = LanguageFamilies(
    os.path.join(DATABASE_DIR, "iso639-5.json")
)

# ISO 15924: scripts.
scripts: Scripts = Scripts(
    os.path.join(DATABASE_DIR, "iso15924.json")
)
|