bazarr/libs/trakit/language.py

170 lines
6.5 KiB
Python
Raw Normal View History

import typing
from babelfish import (
COUNTRIES,
Country,
CountryReverseError,
LANGUAGE_MATRIX,
Language,
LanguageReverseError,
SCRIPTS,
Script,
country_converters,
language_converters
)
from babelfish.converters import CaseInsensitiveDict
from rebulk import Rebulk
from rebulk.match import Match
from trakit.config import Config
from trakit.context import Context
from trakit.converters.country import GuessCountryConverter
from trakit.converters.language import GuessLanguageConverter
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
class LanguageFinder:
    """Locate a language mention in free text.

    A language guessed from the text may be refined with a country, a region
    or a script found elsewhere in the same text. Names listed in the
    configuration's implicit languages map directly to a fully qualified
    language (looked up by sentence, region code or country alpha2 code).
    """

    def __init__(self, config: "Config") -> None:
        """Build the lookup tables from *config*.

        Side effects: adds code ``419`` to babelfish's ``SCRIPTS`` table and
        registers the ``guess`` country and language converters in babelfish's
        module-level converter registries.
        """
        # Upper bounds passed to ``to_combinations`` when grouping words.
        # NOTE(review): these are counts of spaces (never below 1), not word
        # counts -- confirm this is what ``to_combinations`` expects.
        self.country_max_words = self._max_spaces(COUNTRIES.values())
        self.language_max_words = self._max_spaces(entry.name for entry in LANGUAGE_MATRIX)
        self.script_max_words = self._max_spaces(config.scripts)
        self.region_max_words = self._max_spaces(config.regions)

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish support UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)

        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        # Only membership is tested on this table; the values are placeholders.
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    @staticmethod
    def _max_spaces(names: "typing.Iterable[str]") -> int:
        """Return the largest number of spaces in any of *names*, at least 1."""
        return max(1, max((name.count(' ') for name in names), default=1))

    def _find_country(self, value: str) -> "typing.Optional[Match]":
        """Return a match for the first word combination guessed as a country.

        Returns ``None`` when no combination of words in *value* resolves.
        """
        for words in to_combinations(to_words(value), self.country_max_words):
            code = to_sentence(words)
            try:
                return to_match(words, Country.fromguess(code))
            except CountryReverseError:
                continue
        return None

    def _find_script_like(self, value: str, max_words: int,
                          aliases: "CaseInsensitiveDict") -> "typing.Optional[Match]":
        """Return a match for the first word combination that is a valid ``Script``.

        Each candidate sentence is translated through *aliases* (falling back
        to the sentence itself) before being handed to ``Script``; candidates
        raising ``ValueError`` are skipped. Returns ``None`` when none resolves.
        """
        for words in to_combinations(to_words(value), max_words):
            code = to_sentence(words)
            try:
                return to_match(words, Script(aliases.get(code, code)))
            except ValueError:
                continue
        return None

    def _find_script(self, value: str) -> "typing.Optional[Match]":
        """Return a match for the first script found in *value*, or ``None``."""
        return self._find_script_like(value, self.script_max_words, self.scripts)

    def _find_region(self, value: str) -> "typing.Optional[Match]":
        """Return a match for the first region found in *value*, or ``None``.

        Regions are represented as ``Script`` values (see the ``419`` entry
        added to ``SCRIPTS`` in ``__init__``).
        """
        return self._find_script_like(value, self.region_max_words, self.regions)

    def _find_implicit_language(
            self, combinations: "typing.List[typing.List[Match]]") -> "typing.Optional[Match]":
        """Return a match for the first combination implying a language.

        For each combination, in order, the sentence itself, then the code of
        a region found in it, then the alpha2 code of the country guessed from
        it are looked up in the implicit-languages table. Returns ``None``
        when nothing matches.
        """
        for words in combinations:
            sentence = to_sentence(words)
            if sentence in self.implicit:
                return to_match(words, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                lang = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=lang, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    lang = Language.fromietf(self.implicit[country.alpha2])
                    # When the sentence is the language's own name, keep the
                    # plain language instead of the country-qualified one.
                    if lang.name.lower() == sentence.lower():
                        lang = Language.fromname(sentence)

                    return to_match(words, lang)
            except CountryReverseError:
                pass
        return None

    def accept_word(self, string: str) -> bool:
        """Return whether *string* is neither an ignored word nor numeric."""
        return string.lower() not in self.common_words and not string.isnumeric()

    def find_language(self, value: str, context: "Context") -> "typing.Optional[Match]":
        """Return a match holding the ``Language`` found in *value*, if any.

        An implicit language is looked up first. When the context accepts it
        and its script code is numeric, it is returned immediately. When the
        context rejects it, its span is blanked and the text is re-tokenised.
        Each word combination is then guessed as a language and refined with
        the first country, region or script found in the remaining text. The
        first language accepted by the context wins; an accepted implicit
        language is the fallback.
        """
        value = blank_release_names(value)
        all_words = to_words(value, predicate=self.accept_word)
        combinations = to_combinations(all_words, self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted and implicit_lang.value.script and implicit_lang.value.script.code.isnumeric():
            return implicit_lang
        elif implicit_lang and not implicit_accepted:
            # Rejected by the context: hide its text so it cannot be picked
            # up again by the explicit search below.
            value = blank_match(implicit_lang)
            all_words = to_words(value, predicate=self.accept_word)
            combinations = to_combinations(all_words, self.language_max_words)

        for words in combinations:
            language_sentence = to_sentence(words)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            match_lang = to_match(words, lang)
            # Search for modifiers only outside the span of the language itself.
            remaining_sentence = blank_match(match_lang)
            for combination in to_combinations(to_words(remaining_sentence), self.country_max_words):
                sentence = to_sentence(combination)
                country = self._find_country(sentence)
                if country:
                    try:
                        # discard country if value is actually the language name
                        Language.fromguess(country.raw)
                    except LanguageReverseError:
                        lang = Language(lang.alpha3, country=country.value, script=lang.script)
                    # NOTE(review): the search stops at the first country-like
                    # token even when it was discarded above -- confirm intended.
                    break

                region = self._find_region(sentence)
                if region:
                    lang = Language(lang.alpha3, country=lang.country, script=region.value)
                    break

                script = self._find_script(sentence)
                if script:
                    lang = Language(lang.alpha3, country=lang.country, script=script.value)
                    break

            # Same base language with no country or script of its own: return
            # the accepted implicit match instead of the bare explicit one.
            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(words, lang)

        if implicit_accepted:
            return implicit_lang
        return None

    def find(self, value: str, context: "Context") \
            -> "typing.Optional[typing.Tuple[int, int, typing.Dict[str, Language]]]":
        """Return ``(start, end, {'value': language})`` for *value*, or ``None``.

        Thin adapter around ``find_language`` that unpacks the resulting match
        into a tuple.
        """
        match = self.find_language(value, context)
        if match:
            return match.start, match.end, {'value': match.value}
        return None
def language(config: Config):
    """Create a ``Rebulk`` instance with one functional pattern named 'language'.

    The pattern is backed by the ``find`` method of a ``LanguageFinder``
    built from *config*.
    """
    finder = LanguageFinder(config)
    builder = Rebulk()
    builder.functional(finder.find, name='language')
    return builder