From 7455496c4c42518df5f20646d50a93ca66c1a912 Mon Sep 17 00:00:00 2001 From: morpheus65535 Date: Tue, 21 Mar 2023 23:15:01 -0400 Subject: [PATCH] Trying to fix Segmentation fault caused by mediainfo in docker container. #2098 --- libs/knowit/__init__.py | 5 +- libs/knowit/__main__.py | 2 +- libs/knowit/api.py | 2 +- libs/knowit/core.py | 16 +- libs/knowit/defaults.yml | 28 +- libs/knowit/properties/video.py | 9 +- libs/knowit/provider.py | 10 +- libs/knowit/providers/enzyme.py | 17 +- libs/knowit/providers/ffmpeg.py | 24 +- libs/knowit/providers/mediainfo.py | 45 +- libs/knowit/providers/mkvmerge.py | 10 +- libs/knowit/rules/general.py | 37 +- libs/knowit/rules/subtitle.py | 17 +- libs/knowit/units.py | 20 +- libs/pymediainfo/__init__.py | 2 +- libs/trakit/__init__.py | 8 + libs/trakit/__main__.py | 108 ++++ libs/trakit/api.py | 24 + libs/trakit/config.py | 19 + libs/trakit/context.py | 22 + libs/trakit/converters/__init__.py | 0 libs/trakit/converters/country.py | 32 ++ libs/trakit/converters/language.py | 30 + libs/trakit/data/config.json | 860 +++++++++++++++++++++++++++++ libs/trakit/language.py | 169 ++++++ libs/trakit/patterns.py | 32 ++ libs/trakit/words.py | 99 ++++ libs/version.txt | 5 +- 28 files changed, 1555 insertions(+), 97 deletions(-) mode change 100644 => 100755 libs/knowit/provider.py create mode 100644 libs/trakit/__init__.py create mode 100644 libs/trakit/__main__.py create mode 100644 libs/trakit/api.py create mode 100644 libs/trakit/config.py create mode 100644 libs/trakit/context.py create mode 100644 libs/trakit/converters/__init__.py create mode 100644 libs/trakit/converters/country.py create mode 100644 libs/trakit/converters/language.py create mode 100644 libs/trakit/data/config.json create mode 100644 libs/trakit/language.py create mode 100644 libs/trakit/patterns.py create mode 100644 libs/trakit/words.py diff --git a/libs/knowit/__init__.py b/libs/knowit/__init__.py index eda706779..bf225e195 100644 --- a/libs/knowit/__init__.py +++ 
b/libs/knowit/__init__.py @@ -1,10 +1,9 @@ """Know your media files better.""" __title__ = 'knowit' -__version__ = '0.4.0' -__short_version__ = '.'.join(__version__.split('.')[:2]) +__version__ = '0.5.2' +__short_version__ = '0.5' __author__ = 'Rato AQ2' __license__ = 'MIT' -__copyright__ = 'Copyright 2016-2021, Rato AQ2' __url__ = 'https://github.com/ratoaq2/knowit' #: Video extensions diff --git a/libs/knowit/__main__.py b/libs/knowit/__main__.py index c30148421..d9255ffd4 100644 --- a/libs/knowit/__main__.py +++ b/libs/knowit/__main__.py @@ -169,7 +169,7 @@ def dumps( return convert(info, context) -def main(args: typing.List[str] = None) -> None: +def main(args: typing.Optional[typing.List[str]] = None) -> None: """Execute main function for entry point.""" argument_parser = build_argument_parser() args = args or sys.argv[1:] diff --git a/libs/knowit/api.py b/libs/knowit/api.py index 4df780605..c6ebd3bd6 100644 --- a/libs/knowit/api.py +++ b/libs/knowit/api.py @@ -65,7 +65,7 @@ def know( raise KnowitException(debug_info(context=context, exc_info=True)) -def dependencies(context: typing.Mapping = None) -> typing.Mapping: +def dependencies(context: typing.Optional[typing.Mapping] = None) -> typing.Mapping: """Return all dependencies detected by knowit.""" deps = {} try: diff --git a/libs/knowit/core.py b/libs/knowit/core.py index 9736d7ba2..ede307dad 100644 --- a/libs/knowit/core.py +++ b/libs/knowit/core.py @@ -63,6 +63,17 @@ class Property(Reportable[T]): # Used to detect duplicated values. 
e.g.: en / en or High@L4.0 / High@L4.0 or Progressive / Progressive self.delimiter = delimiter + @classmethod + def _extract_value(cls, + track: typing.Mapping, + name: str, + names: typing.List[str]): + if len(names) == 2: + parent_value = track.get(names[0], track.get(names[0].upper(), {})) + return parent_value.get(names[1], parent_value.get(names[1].upper())) + + return track.get(name, track.get(name.upper())) + def extract_value( self, track: typing.Mapping, @@ -71,7 +82,7 @@ class Property(Reportable[T]): """Extract the property value from a given track.""" for name in self.names: names = name.split('.') - value = track.get(names[0], {}).get(names[1]) if len(names) == 2 else track.get(name) + value = self._extract_value(track, name, names) if value is None: if self.default is None: continue @@ -216,9 +227,10 @@ class MultiValue(Property): class Rule(Reportable[T]): """Rule abstract class.""" - def __init__(self, name: str, override=False, **kwargs): + def __init__(self, name: str, private=False, override=False, **kwargs): """Initialize the object.""" super().__init__(name, **kwargs) + self.private = private self.override = override def execute(self, props, pv_props, context: typing.Mapping): diff --git a/libs/knowit/defaults.yml b/libs/knowit/defaults.yml index 9dd7e46db..af6b79c32 100644 --- a/libs/knowit/defaults.yml +++ b/libs/knowit/defaults.yml @@ -455,46 +455,46 @@ profiles: VideoProfileLevel: L1: - default: "1" + default: '1' technical: Level 1 L11: - default: "1.1" + default: '1.1' technical: Level 1.1 L13: - default: "1.3" + default: '1.3' technical: Level 1.3 L2: - default: "2" + default: '2' technical: Level 2 L21: - default: "2.1" + default: '2.1' technical: Level 2.1 L22: - default: "2.2" + default: '2.2' technical: Level 2.2 L3: - default: "3" + default: '3' technical: Level 3 L31: - default: "3.1" + default: '3.1' technical: Level 3.1 L32: - default: "3.2" + default: '3.2' technical: Level 3.2 L4: - default: "4" + default: '4' technical: Level 
4 L41: - default: "4.1" + default: '4.1' technical: Level 4.1 L42: - default: "4.2" + default: '4.2' technical: Level 4.2 L5: - default: "5" + default: '5' technical: Level 5 L51: - default: "5.1" + default: '5.1' technical: Level 5.1 LOW: default: Low diff --git a/libs/knowit/properties/video.py b/libs/knowit/properties/video.py index e1b293d01..60c5b8264 100644 --- a/libs/knowit/properties/video.py +++ b/libs/knowit/properties/video.py @@ -106,11 +106,12 @@ class Ratio(Property[Decimal]): if (width, height) == ('0', '1'): # identity return Decimal('1.0') - result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3) - if self.unit: - result *= self.unit + if height: + result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3) + if self.unit: + result *= self.unit - return result + return result self.report(value, context) return None diff --git a/libs/knowit/provider.py b/libs/knowit/provider.py old mode 100644 new mode 100755 index f8c29f5f3..5306d8388 --- a/libs/knowit/provider.py +++ b/libs/knowit/provider.py @@ -103,10 +103,7 @@ class Provider: value = prop.extract_value(track, context) if value is not None: - if not prop.private: - which = props - else: - which = pv_props + which = props if not prop.private else pv_props which[name] = value for name, rule in self.rules.get(track_type, {}).items(): @@ -116,8 +113,9 @@ class Provider: value = rule.execute(props, pv_props, context) if value is not None: - props[name] = value - elif name in props and not rule.override: + which = props if not rule.private else pv_props + which[name] = value + elif name in props and (not rule.override or props[name] is None): del props[name] return props diff --git a/libs/knowit/providers/enzyme.py b/libs/knowit/providers/enzyme.py index 5dd3d8cef..6a06599d4 100644 --- a/libs/knowit/providers/enzyme.py +++ b/libs/knowit/providers/enzyme.py @@ -26,6 +26,7 @@ from knowit.rules import ( LanguageRule, ResolutionRule, ) +from 
knowit.rules.general import GuessTitleRule from knowit.serializer import get_json_encoder from knowit.units import units from knowit.utils import to_dict @@ -83,17 +84,20 @@ class EnzymeProvider(Provider): }, }, { 'video': { - 'language': LanguageRule('video language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('video language', override=True), 'resolution': ResolutionRule('video resolution'), }, 'audio': { - 'language': LanguageRule('audio language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('audio language', override=True), 'channels': AudioChannelsRule('audio channels'), }, 'subtitle': { - 'language': LanguageRule('subtitle language'), - 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), - 'closed_caption': ClosedCaptionRule('closed caption'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('subtitle language', override=True), + 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True), + 'closed_caption': ClosedCaptionRule('closed caption', override=True), } }) @@ -130,7 +134,8 @@ class EnzymeProvider(Provider): if logger.level == logging.DEBUG: logger.debug('Video {video_path} scanned using Enzyme {version} has raw data:\n{data}', - video_path=video_path, version=enzyme.__version__, data=json.dumps(data)) + video_path=video_path, version=enzyme.__version__, + data=json.dumps(data, cls=get_json_encoder(context), indent=4, ensure_ascii=False)) result = self._describe_tracks(video_path, data.get('info', {}), data.get('video_tracks'), data.get('audio_tracks'), data.get('subtitle_tracks'), context) diff --git a/libs/knowit/providers/ffmpeg.py b/libs/knowit/providers/ffmpeg.py index 2474408cc..f19cea90b 100644 --- a/libs/knowit/providers/ffmpeg.py +++ b/libs/knowit/providers/ffmpeg.py @@ -34,6 +34,7 @@ from knowit.rules import ( LanguageRule, ResolutionRule, ) +from 
knowit.rules.general import GuessTitleRule from knowit.serializer import get_json_encoder from knowit.units import units from knowit.utils import ( @@ -77,7 +78,7 @@ class FFmpegExecutor: def extract_info(self, filename): """Extract media info.""" json_dump = self._execute(filename) - return json.loads(json_dump) + return json.loads(json_dump) if json_dump else {} def _execute(self, filename): raise NotImplementedError @@ -144,7 +145,7 @@ class FFmpegProvider(Provider): 'id': Basic('index', data_type=int, allow_fallback=True, description='video track number'), 'name': Property('tags.title', description='video track name'), 'language': Language('tags.language', description='video language'), - 'duration': Duration('duration', description='video duration'), + 'duration': Duration('duration', 'tags.duration', description='video duration'), 'width': Quantity('width', unit=units.pixel), 'height': Quantity('height', unit=units.pixel), 'scan_type': ScanType(config, 'field_order', default='Progressive', description='video scan type'), @@ -153,7 +154,7 @@ class FFmpegProvider(Provider): 'resolution': None, # populated with ResolutionRule 'frame_rate': Ratio('r_frame_rate', unit=units.FPS, description='video frame rate'), # frame_rate_mode - 'bit_rate': Quantity('bit_rate', unit=units.bps, description='video bit rate'), + 'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='video bit rate'), 'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='video bit depth'), 'codec': VideoCodec(config, 'codec_name', description='video codec'), 'profile': VideoProfile(config, 'profile', description='video codec profile'), @@ -166,13 +167,13 @@ class FFmpegProvider(Provider): 'id': Basic('index', data_type=int, allow_fallback=True, description='audio track number'), 'name': Property('tags.title', description='audio track name'), 'language': Language('tags.language', description='audio language'), - 'duration': Duration('duration', 
description='audio duration'), + 'duration': Duration('duration', 'tags.duration', description='audio duration'), 'codec': AudioCodec(config, 'profile', 'codec_name', description='audio codec'), 'profile': AudioProfile(config, 'profile', description='audio codec profile'), 'channels_count': AudioChannels('channels', description='audio channels count'), 'channels': None, # populated with AudioChannelsRule 'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='audio bit depth'), - 'bit_rate': Quantity('bit_rate', unit=units.bps, description='audio bit rate'), + 'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='audio bit rate'), 'sampling_rate': Quantity('sample_rate', unit=units.Hz, description='audio sampling rate'), 'forced': YesNo('disposition.forced', hide_value=False, description='audio track forced'), 'default': YesNo('disposition.default', hide_value=False, description='audio track default'), @@ -190,17 +191,20 @@ class FFmpegProvider(Provider): }, }, { 'video': { - 'language': LanguageRule('video language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('video language', override=True), 'resolution': ResolutionRule('video resolution'), }, 'audio': { - 'language': LanguageRule('audio language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('audio language', override=True), 'channels': AudioChannelsRule('audio channels'), }, 'subtitle': { - 'language': LanguageRule('subtitle language'), - 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), - 'closed_caption': ClosedCaptionRule('closed caption'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('subtitle language', override=True), + 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True), + 'closed_caption': ClosedCaptionRule('closed caption', override=True), }, }) self.executor = 
FFmpegExecutor.get_executor_instance(suggested_path) diff --git a/libs/knowit/providers/mediainfo.py b/libs/knowit/providers/mediainfo.py index 39fd403ed..a19301bc6 100644 --- a/libs/knowit/providers/mediainfo.py +++ b/libs/knowit/providers/mediainfo.py @@ -1,5 +1,6 @@ - +import ctypes import json +import os import re from ctypes import c_void_p, c_wchar_p from decimal import Decimal @@ -43,6 +44,7 @@ from knowit.rules import ( LanguageRule, ResolutionRule, ) +from knowit.rules.general import GuessTitleRule from knowit.units import units from knowit.utils import ( define_candidate, @@ -77,7 +79,7 @@ class MediaInfoExecutor: locations = { 'unix': ('/usr/local/mediainfo/lib', '/usr/local/mediainfo/bin', '__PATH__'), - 'windows': ('__PATH__', ), + 'windows': ('C:\\Program Files\\MediaInfo', 'C:\\Program Files (x86)\\MediaInfo', '__PATH__'), 'macos': ('__PATH__', ), } @@ -121,12 +123,28 @@ class MediaInfoCliExecutor(MediaInfoExecutor): } def _execute(self, filename): - return json.loads(check_output([self.location, '--Output=JSON', '--Full', filename]).decode()) + data = check_output([self.location, '--Output=JSON', '--Full', filename]).decode() + + return json.loads(data) if data else {} + + @classmethod + def _is_gui_exe(cls, candidate: str): + if not candidate.endswith('MediaInfo.exe') or not os.path.isfile(candidate): + return False + + try: + shell32 = ctypes.WinDLL('shell32', use_last_error=True) # type: ignore + return bool(shell32.ExtractIconExW(candidate, 0, None, None, 1)) + except Exception: + return False @classmethod def create(cls, os_family=None, suggested_path=None): """Create the executor instance.""" for candidate in define_candidate(cls.locations, cls.names, os_family, suggested_path): + if cls._is_gui_exe(candidate): + continue + try: output = check_output([candidate, '--version']).decode() version = cls._get_version(output) @@ -154,7 +172,9 @@ class MediaInfoCTypesExecutor(MediaInfoExecutor): def _execute(self, filename): # Create a MediaInfo 
handle - return json.loads(MediaInfo.parse(filename, library_file=self.location, output='JSON')) + data = MediaInfo.parse(filename, library_file=self.location, output='JSON') + + return json.loads(data) if data else {} @classmethod def create(cls, os_family=None, suggested_path=None): @@ -254,19 +274,22 @@ class MediaInfoProvider(Provider): }, }, { 'video': { - 'language': LanguageRule('video language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('video language', override=True), 'resolution': ResolutionRule('video resolution'), }, 'audio': { - 'language': LanguageRule('audio language'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('audio language', override=True), 'channels': AudioChannelsRule('audio channels'), - '_atmosrule': AtmosRule(config, 'atmos rule'), - '_dtshdrule': DtsHdRule(config, 'dts-hd rule'), + 'atmos': AtmosRule(config, 'atmos rule', private=True), + 'dtshd': DtsHdRule(config, 'dts-hd rule', private=True), }, 'subtitle': { - 'language': LanguageRule('subtitle language'), - 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), - 'closed_caption': ClosedCaptionRule('closed caption'), + 'guessed': GuessTitleRule('guessed properties', private=True), + 'language': LanguageRule('subtitle language', override=True), + 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True), + 'closed_caption': ClosedCaptionRule('closed caption', override=True), } }) self.executor = MediaInfoExecutor.get_executor_instance(suggested_path) diff --git a/libs/knowit/providers/mkvmerge.py b/libs/knowit/providers/mkvmerge.py index e5aca1550..ff422f8b4 100644 --- a/libs/knowit/providers/mkvmerge.py +++ b/libs/knowit/providers/mkvmerge.py @@ -28,6 +28,7 @@ from knowit.rules import ( LanguageRule, ResolutionRule, ) +from knowit.rules.general import GuessTitleRule from knowit.serializer import get_json_encoder from knowit.units import units 
from knowit.utils import define_candidate, detect_os @@ -67,7 +68,7 @@ class MkvMergeExecutor: def extract_info(self, filename): """Extract media info.""" json_dump = self._execute(filename) - return json.loads(json_dump) + return json.loads(json_dump) if json_dump else {} def _execute(self, filename): raise NotImplementedError @@ -166,17 +167,20 @@ class MkvMergeProvider(Provider): }, }, { 'video': { + 'guessed': GuessTitleRule('guessed properties', private=True), 'language': LanguageRule('video language', override=True), 'resolution': ResolutionRule('video resolution'), }, 'audio': { + 'guessed': GuessTitleRule('guessed properties', private=True), 'language': LanguageRule('audio language', override=True), 'channels': AudioChannelsRule('audio channels'), }, 'subtitle': { + 'guessed': GuessTitleRule('guessed properties', private=True), 'language': LanguageRule('subtitle language', override=True), - 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'), - 'closed_caption': ClosedCaptionRule('closed caption'), + 'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True), + 'closed_caption': ClosedCaptionRule('closed caption', override=True), } }) self.executor = MkvMergeExecutor.get_executor_instance(suggested_path) diff --git a/libs/knowit/rules/general.py b/libs/knowit/rules/general.py index b492c03a5..ad2c7734f 100644 --- a/libs/knowit/rules/general.py +++ b/libs/knowit/rules/general.py @@ -1,8 +1,6 @@ - -import re from logging import NullHandler, getLogger -import babelfish +from trakit.api import trakit from knowit.core import Rule @@ -10,22 +8,27 @@ logger = getLogger(__name__) logger.addHandler(NullHandler()) -class LanguageRule(Rule): - """Language rules.""" - - name_re = re.compile(r'(?P\w+)\b', re.IGNORECASE) +class GuessTitleRule(Rule): + """Guess properties from track title.""" def execute(self, props, pv_props, context): """Language detection using name.""" - if 'language' in props: + if 'name' in props: + language 
= props.get('language') + options = {'expected_language': language} if language else {} + guessed = trakit(props['name'], options) + if guessed: + return guessed + + +class LanguageRule(Rule): + """Language rules.""" + + def execute(self, props, pv_props, context): + """Language detection using name.""" + if 'guessed' not in pv_props: return - if 'name' in props: - name = props.get('name', '') - match = self.name_re.match(name) - if match: - try: - return babelfish.Language.fromname(match.group('name')) - except babelfish.Error: - pass - logger.info('Invalid %s: %r', self.description, name) + guess = pv_props['guessed'] + if 'language' in guess: + return guess['language'] diff --git a/libs/knowit/rules/subtitle.py b/libs/knowit/rules/subtitle.py index fa16fdbc1..704109f99 100644 --- a/libs/knowit/rules/subtitle.py +++ b/libs/knowit/rules/subtitle.py @@ -10,18 +10,19 @@ class ClosedCaptionRule(Rule): def execute(self, props, pv_props, context): """Execute closed caption rule.""" - for name in (pv_props.get('_closed_caption'), props.get('name')): - if name and self.cc_re.search(name): - return True + if '_closed_caption' in pv_props and self.cc_re.search(pv_props['_closed_caption']): + return True + + if 'guessed' in pv_props: + guessed = pv_props['guessed'] + return guessed.get('closed_caption') class HearingImpairedRule(Rule): """Hearing Impaired rule.""" - hi_re = re.compile(r'(\bsdh\b)', re.IGNORECASE) - def execute(self, props, pv_props, context): """Hearing Impaired.""" - name = props.get('name') - if name and self.hi_re.search(name): - return True + if 'guessed' in pv_props: + guessed = pv_props['guessed'] + return guessed.get('hearing_impaired') diff --git a/libs/knowit/units.py b/libs/knowit/units.py index 73ec16a5a..51e6cae73 100644 --- a/libs/knowit/units.py +++ b/libs/knowit/units.py @@ -1,10 +1,5 @@ import typing -try: - import pint -except ImportError: - pint = False - class NullRegistry: """A NullRegistry that masquerades as a pint.UnitRegistry.""" @@ 
-25,9 +20,18 @@ class NullRegistry: def _build_unit_registry(): - registry = pint.UnitRegistry() if pint else NullRegistry() - registry.define('FPS = 1 * hertz') - return registry + try: + import pint + + registry = pint.UnitRegistry() + registry.define('FPS = 1 * hertz') + + pint.set_application_registry(registry) + return registry + except ModuleNotFoundError: + pass + + return NullRegistry() units = _build_unit_registry() diff --git a/libs/pymediainfo/__init__.py b/libs/pymediainfo/__init__.py index 9c186798b..840ec18c3 100644 --- a/libs/pymediainfo/__init__.py +++ b/libs/pymediainfo/__init__.py @@ -386,7 +386,7 @@ class MediaInfo: A higher value will yield more precise results in some cases but will also increase parsing time. :param bool full: display additional tags, including computer-readable values - for sizes and durations. + for sizes and durations, corresponds to the CLI's ``--Full``/``-f`` parameter. :param bool legacy_stream_display: display additional information about streams. :param dict mediainfo_options: additional options that will be passed to the `MediaInfo_Option` function, for example: ``{"Language": "raw"}``. 
diff --git a/libs/trakit/__init__.py b/libs/trakit/__init__.py new file mode 100644 index 000000000..b134ad1a0 --- /dev/null +++ b/libs/trakit/__init__.py @@ -0,0 +1,8 @@ +__title__ = 'trakit' +__version__ = '0.2.1' +__short_version__ = '0.2' +__author__ = 'RatoAQ' +__license__ = 'MIT' +__url__ = 'https://github.com/ratoaq2/trakit' + +from .api import TrakItApi, trakit diff --git a/libs/trakit/__main__.py b/libs/trakit/__main__.py new file mode 100644 index 000000000..61f07324a --- /dev/null +++ b/libs/trakit/__main__.py @@ -0,0 +1,108 @@ +import argparse +import json +import logging +import sys +import typing + +import babelfish + +from trakit import TrakItApi, __version__ + +logging.basicConfig(stream=sys.stdout, format='%(message)s') +logging.getLogger('CONSOLE').setLevel(logging.INFO) +logging.getLogger('trakit').setLevel(logging.WARNING) + +console = logging.getLogger('CONSOLE') +logger = logging.getLogger('trakit') + + +def build_argument_parser() -> argparse.ArgumentParser: + """Build the argument parser.""" + opts = argparse.ArgumentParser() + opts.add_argument( + dest='value', + help='track title to guess', + type=str, + ) + + conf_opts = opts.add_argument_group('Configuration') + conf_opts.add_argument( + '-l', + '--expected-language', + dest='expected_language', + help='The expected language to be guessed', + type=str, + ) + + output_opts = opts.add_argument_group('Output') + output_opts.add_argument( + '--debug', + action='store_true', + dest='debug', + help='Print information for debugging trakit and for reporting bugs.' 
+ ) + output_opts.add_argument( + '-y', + '--yaml', + action='store_true', + dest='yaml', + help='Display output in yaml format' + ) + + information_opts = opts.add_argument_group('Information') + information_opts.add_argument('--version', action='version', version=__version__) + + return opts + + +def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str: + """Convert info to string using YAML format.""" + import yaml + + def default_representer(r: yaml.representer.SafeRepresenter, data: typing.Any): + return r.represent_scalar('tag:yaml.org,2002:str', str(data)) + + yaml.representer.SafeRepresenter.add_representer(babelfish.Language, default_representer) + + return yaml.safe_dump({value: dict(info)}, allow_unicode=True, sort_keys=False) + + +def _as_json(info: typing.Mapping[str, typing.Any]) -> str: + """Convert info to string using JSON format.""" + return json.dumps(info, ensure_ascii=False, indent=2, default=str) + + +def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str: + """Convert info to string using json or yaml format.""" + if opts.yaml: + return _as_yaml(value, info) + + return _as_json(info) + + +def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping: + """Extract video metadata.""" + if not opts.yaml: + console.info('Parsing: %s', value) + options = {k: v for k, v in vars(opts).items() if v is not None} + info = TrakItApi().trakit(value, options) + console.info('TrakIt %s found: ', __version__) + console.info(dump(value, info, opts)) + return info + + +def main(args: typing.Optional[typing.List[str]] = None): + """Execute main function for entry point.""" + argument_parser = build_argument_parser() + args = args or sys.argv[1:] + opts = argument_parser.parse_args(args) + + if opts.debug: + logger.setLevel(logging.DEBUG) + logging.getLogger('rebulk').setLevel(logging.DEBUG) + + return trakit(opts.value, opts) + + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git 
a/libs/trakit/api.py b/libs/trakit/api.py new file mode 100644 index 000000000..286207aa4 --- /dev/null +++ b/libs/trakit/api.py @@ -0,0 +1,24 @@ +import typing + +from trakit.config import Config +from trakit.context import Context +from trakit.patterns import configure + + +class TrakItApi: + + def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None): + self.rebulk = configure(Config(config)) + + def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None): + """Return a mapping of extracted information.""" + matches = self.rebulk.matches(string, Context(options)) + guess: typing.Mapping[str, typing.Any] = matches.to_dict() + return guess + + +default_api = TrakItApi() + + +def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None): + return default_api.trakit(string, options) diff --git a/libs/trakit/config.py b/libs/trakit/config.py new file mode 100644 index 000000000..6458b4bbd --- /dev/null +++ b/libs/trakit/config.py @@ -0,0 +1,19 @@ +import json +import typing + +from pkg_resources import resource_stream + + +class Config: + def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]): + with resource_stream('trakit', 'data/config.json') as f: + cfg: typing.Dict[str, typing.Any] = json.load(f) + if config: + cfg.update(config) + + self.ignored: typing.Set[str] = set(cfg.get('ignored', [])) + self.countries: typing.Mapping[str, str] = cfg.get('countries', {}) + self.languages: typing.Mapping[str, str] = cfg.get('languages', {}) + self.scripts: typing.Mapping[str, str] = cfg.get('scripts', {}) + self.regions: typing.Mapping[str, str] = cfg.get('regions', {}) + self.implicit_languages: typing.Mapping[str, str] = cfg.get('implicit-languages', {}) diff --git a/libs/trakit/context.py b/libs/trakit/context.py new file mode 100644 index 000000000..9a023ce08 --- /dev/null +++ b/libs/trakit/context.py @@ -0,0 +1,22 @@ +import typing + +import babelfish + + 
+class Context(dict): + def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None): + super().__init__(options or {}) + language = self['expected_language'] if 'expected_language' in self else None + if language and not isinstance(language, babelfish.Language): + language = babelfish.Language.fromietf(str(language)) + self.expected_language: typing.Optional[babelfish.Language] = language + + def accept(self, lang: babelfish.Language): + if self.expected_language is None: + return True + if self.expected_language.alpha3 != lang.alpha3: + return False + if self.expected_language.script and self.expected_language != lang.script: + return False + + return not self.expected_language.country or self.expected_language == lang.country diff --git a/libs/trakit/converters/__init__.py b/libs/trakit/converters/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/libs/trakit/converters/country.py b/libs/trakit/converters/country.py new file mode 100644 index 000000000..5bfd6908d --- /dev/null +++ b/libs/trakit/converters/country.py @@ -0,0 +1,32 @@ +import typing + +from babelfish import Country, CountryReverseConverter, CountryReverseError +from babelfish.converters import CaseInsensitiveDict + + +class GuessCountryConverter(CountryReverseConverter): + def __init__(self, config: typing.Mapping[str, str]): + self.synonyms = CaseInsensitiveDict(config) + + def convert(self, alpha2): + return str(Country(alpha2)) + + def reverse(self, name: str): + try: + return self.synonyms[name] + except KeyError: + pass + + if name.isupper() and len(name) == 2: + try: + return Country(name).alpha2 + except ValueError: + pass + + for conv in (Country.fromname,): + try: + return conv(name).alpha2 + except CountryReverseError: + pass + + raise CountryReverseError(name) diff --git a/libs/trakit/converters/language.py b/libs/trakit/converters/language.py new file mode 100644 index 000000000..0309a642a --- /dev/null +++ 
b/libs/trakit/converters/language.py @@ -0,0 +1,30 @@ +import typing + +from babelfish import Language, LanguageReverseConverter, LanguageReverseError +from babelfish.converters import CaseInsensitiveDict + + +class GuessLanguageConverter(LanguageReverseConverter): + def __init__(self, config: typing.Mapping[str, str]): + self.synonyms = CaseInsensitiveDict() + for synonym, code in config.items(): + lang = Language.fromietf(code) if '-' in code else Language(code) + self.synonyms[synonym] = (lang.alpha3, lang.country.alpha2 if lang.country else None, lang.script) + + def convert(self, alpha3: str, country=None, script=None): + return str(Language(alpha3, country, script)) + + def reverse(self, name: str): + try: + return self.synonyms[name] + except KeyError: + pass + + for conv in (Language.fromname,): + try: + reverse = conv(name) + return reverse.alpha3, reverse.country, reverse.script + except (ValueError, LanguageReverseError): + pass + + raise LanguageReverseError(name) diff --git a/libs/trakit/data/config.json b/libs/trakit/data/config.json new file mode 100644 index 000000000..aa7138042 --- /dev/null +++ b/libs/trakit/data/config.json @@ -0,0 +1,860 @@ +{ + "countries": { + "Afghan": "AF", + "Aforika Borwa": "ZA", + "Afrika Borwa": "ZA", + "Afrika Dzonga": "ZA", + "Afurika Tshipembe": "ZA", + "Aland": "AX", + "Alandish": "AX", + "Albanian": "AL", + "Algerian": "DZ", + "American": "US", + "American Islander": "UM", + "American Samoan": "AS", + "American Virgin Islander": "VI", + "Andorran": "AD", + "Angolan": "AO", + "Anguillian": "AI", + "Antarctican": "AQ", + "Antiguan Barbudan": "AG", + "Ao Men": "MO", + "Aotearoa": "NZ", + "Argentine": "AR", + "Armenian": "AM", + "Aruban": "AW", + "Australian": "AU", + "Austrian": "AT", + "Ayiti": "HT", + "Azerbaidzhan": "AZ", + "Azerbaijani": "AZ", + "Azərbaycan": "AZ", + "Bahamian": "BS", + "Bahraini": "BH", + "Bangladeshi": "BD", + "Barbadian": "BB", + "Beafrika": "CF", + "Belarusian": "BY", + "Belau": "PW", + 
"Belgian": "BE", + "Belgie": "BE", + "Belgien": "BE", + "Belgique": "BE", + "België": "BE", + "Belice": "BZ", + "Belizean": "BZ", + "Beninese": "BJ", + "Bermudian": "BM", + "Bhutanese": "BT", + "Blgariia": "BG", + "Bolivia": "BO", + "Bolivian": "BO", + "Boneiru Sint Eustatius y Saba": "BQ", + "Bosna i Hercegovina": "BA", + "Bosna i Khertsegovina": "BA", + "Bosnian Herzegovinian": "BA", + "Bouvetoya": "BV", + "Bouvetøya": "BV", + "Brasil": "BR", + "Brazilian": "BR", + "British": "GB", + "British Virgin Islander": "VG", + "British Virgin Islands": "VG", + "Bruneian": "BN", + "Bulgarian": "BG", + "Buliwya": "BO", + "Burkinabe": "BF", + "Burmese": "MM", + "Burundian": "BI", + "Bénin": "BJ", + "Bêafrîka": "CF", + "Cabo Verde": "CV", + "Cambodian": "KH", + "Cameroonian": "CM", + "Cameroun": "CM", + "Canadian": "CA", + "Cape Verdian": "CV", + "Caribisch Nederland": "BQ", + "Caymanian": "KY", + "Central African": "CF", + "Cesko": "CZ", + "Chadian": "TD", + "Channel Islander": "JE", + "Chilean": "CL", + "Chinese": "CN", + "Christmas Islander": "CX", + "Cocos Islander": "CC", + "Cocos Keeling Islands": "CC", + "Colombian": "CO", + "Comoran": "KM", + "Comores": "KM", + "Congolese": "CD", + "Cook Islander": "CK", + "Costa Rican": "CR", + "Cote dIvoire": "CI", + "Croatian": "HR", + "Cuban": "CU", + "Curacao": "CW", + "Curacaoan": "CW", + "Curaçaoan": "CW", + "Cypriot": "CY", + "Czech": "CZ", + "Côte dIvoire": "CI", + "Danish": "DK", + "Danmark": "DK", + "Deutschland": "DE", + "Dgernesiais": "GG", + "Dgèrnésiais": "GG", + "Ditunga dia Kongu wa Mungalaata": "CD", + "Dominican": "DO", + "Dutch": "NL", + "East Timorese": "TL", + "Ecuadorean": "EC", + "Eesti": "EE", + "Egyptian": "EG", + "Eire": "IE", + "Ellada": "GR", + "Emirati": "AE", + "Equatorial Guinean": "GQ", + "Eritrean": "ER", + "Espana": "ES", + "España": "ES", + "Estados Unidos": "US", + "Estonian": "EE", + "Eswatini": "SZ", + "Ethiopian": "ET", + "Faereyjar": "FO", + "Faeroerne": "FO", + "Falkland Islander": "FK", + 
"Falkland Islands": "FK", + "Faroese": "FO", + "Fijian": "FJ", + "Filipino": "PH", + "Finnish": "FI", + "Foroyar": "FO", + "French": "FR", + "French Polynesian": "PF", + "Færeyjar": "FO", + "Færøerne": "FO", + "Føroyar": "FO", + "Gabonese": "GA", + "Gambian": "GM", + "Georgian": "GE", + "German": "DE", + "Ghanaian": "GH", + "Greek": "GR", + "Greenlandic": "GL", + "Grenadian": "GD", + "Guadeloupian": "GP", + "Guahan": "GU", + "Guamanian": "GU", + "Guatemalan": "GT", + "Guernesey": "GG", + "Guianan": "GF", + "Guine Bissau": "GW", + "Guine Equatorial": "GQ", + "Guinea Bissauan": "GW", + "Guinea Ecuatorial": "GQ", + "Guinean": "GN", + "Guinee": "GN", + "Guinee equatoriale": "GQ", + "Guiné Bissau": "GW", + "Guiné Equatorial": "GQ", + "Guinée": "GN", + "Guinée équatoriale": "GQ", + "Guyane francaise": "GF", + "Guyane française": "GF", + "Guyanese": "GY", + "Guåhån": "GU", + "Haitian": "HT", + "Hayastan": "AM", + "Haïti": "HT", + "Heard and McDonald Islander": "HM", + "Honduran": "HN", + "Hong Konger": "HK", + "Hrvatska": "HR", + "Hungarian": "HU", + "I Kiribati": "KI", + "Icelander": "IS", + "Indian": "IN", + "Indonesian": "ID", + "Iranian": "IR", + "Iraqi": "IQ", + "Irish": "IE", + "Island": "IS", + "Israeli": "IL", + "Italia": "IT", + "Italian": "IT", + "Ivorian": "CI", + "Jamaican": "JM", + "Jamhuri ya Kidemokrasia ya Kongo": "CD", + "Japanese": "JP", + "Jerri": "JE", + "Jordanian": "JO", + "Jèrri": "JE", + "Kalaallit Nunaat": "GL", + "Kampuchea": "KH", + "Kazakhstani": "KZ", + "Kazakstan": "KZ", + "Kenyan": "KE", + "Kibris": "CY", + "Kirghiz": "KG", + "Kirgiziia": "KG", + "Kittitian or Nevisian": "KN", + "Komori": "KM", + "Kuki Airani": "CK", + "Kupros": "CY", + "Kuwaiti": "KW", + "Kâmpŭchéa": "KH", + "Kıbrıs": "CY", + "Kūki Āirani": "CK", + "La Reunion": "RE", + "La Réunion": "RE", + "Laotian": "LA", + "Latvian": "LV", + "Latvija": "LV", + "Lebanese": "LB", + "Letzebuerg": "LU", + "Liban": "LB", + "Liberian": "LR", + "Libyan": "LY", + "Liechtensteiner": "LI", + 
"Lietuva": "LT", + "Lithuanian": "LT", + "Luxembourger": "LU", + "Luxemburg": "LU", + "Lëtzebuerg": "LU", + "Macanese": "MO", + "Macau": "MO", + "Macedonian": "MK", + "Madagasikara": "MG", + "Magyarorszag": "HU", + "Magyarország": "HU", + "Mahoran": "YT", + "Majel": "MH", + "Makedonija": "MK", + "Makedonski": "MK", + "Malagasy": "MG", + "Malawian": "MW", + "Malaysian": "MY", + "Malaŵi": "MW", + "Maldivan": "MV", + "Malian": "ML", + "Maltese": "MT", + "Mannin": "IM", + "Manx": "IM", + "Marshallese": "MH", + "Martinican": "MQ", + "Maurice": "MU", + "Mauritanian": "MR", + "Mauritian": "MU", + "Mexican": "MX", + "Micronesia": "FM", + "Micronesian": "FM", + "Mocambique": "MZ", + "Moldova": "MD", + "Moldovan": "MD", + "Monegasque": "MC", + "Mongol uls": "MN", + "Mongolian": "MN", + "Montenegrin": "ME", + "Montserratian": "MS", + "Moris": "MU", + "Moroccan": "MA", + "Mosotho": "LS", + "Motswana": "BW", + "Mozambican": "MZ", + "Moçambique": "MZ", + "Mzantsi Afrika": "ZA", + "México": "MX", + "M̧ajeļ": "MH", + "Na Islas Marianas": "MP", + "Na Islas Mariånas": "MP", + "Namibian": "NA", + "Namibie": "NA", + "Namibië": "NA", + "Nauruan": "NR", + "Nederland": "NL", + "Negara Brunei Darussalam": "BN", + "Nepalese": "NP", + "New Caledonian": "NC", + "New Zealander": "NZ", + "Ni Vanuatu": "VU", + "Nicaraguan": "NI", + "Nigerian": "NG", + "Nigerien": "NE", + "Ningizimu Afrika": "ZA", + "Niuean": "NU", + "Niuē": "NU", + "Noreg": "NO", + "Norfk Ailen": "NF", + "Norfolk Islander": "NF", + "Norge": "NO", + "Norgga": "NO", + "North Korean": "KP", + "Norwegian": "NO", + "Nouvelle Caledonie": "NC", + "Nouvelle Calédonie": "NC", + "Omani": "OM", + "Osterreich": "AT", + "Owganystan": "AF", + "Ozbekiston": "UZ", + "O‘zbekiston": "UZ", + "Pais Korsou": "CW", + "Pais Kòrsou": "CW", + "Pakistani": "PK", + "Palauan": "PW", + "Palestinian": "PS", + "Panamanian": "PA", + "Panamá": "PA", + "Papua New Guinean": "PG", + "Papua Niu Gini": "PG", + "Papua Niugini": "PG", + "Paraguai": "PY", + 
"Paraguayan": "PY", + "Paraguái": "PY", + "Peruvian": "PE", + "Perú": "PE", + "Pilipinas": "PH", + "Piruw": "PE", + "Pitcairn Islander": "PN", + "Pitcairn Islands": "PN", + "Polish": "PL", + "Polska": "PL", + "Polynesie francaise": "PF", + "Polynésie française": "PF", + "Portuguese": "PT", + "Puerto Rican": "PR", + "Qatari": "QA", + "RD Congo": "CD", + "Repubilika ya Kongo": "CG", + "Repubilika ya Kongo Demokratiki": "CD", + "Republica Dominicana": "DO", + "Republiki ya Kongo": "CG", + "Republiki ya Kongo Demokratiki": "CD", + "Republiki ya Kongó Demokratiki": "CD", + "Republique centrafricaine": "CF", + "Republique du Congo": "CG", + "Republíki ya Kongó": "CG", + "República Dominicana": "DO", + "Reunionese": "RE", + "Ri Ben": "JP", + "Romanian": "RO", + "România": "RO", + "Rossiia": "RU", + "Russian": "RU", + "Rwandan": "RW", + "République centrafricaine": "CF", + "République du Congo": "CG", + "Réunionese": "RE", + "Sahara Occidental": "EH", + "Sahrawi": "EH", + "Saint Barthelemy": "BL", + "Saint Barthelemy Islander": "BL", + "Saint Barthélemy Islander": "BL", + "Saint Helena Ascension and Tristan da Cunha": "SH", + "Saint Helenian": "SH", + "Saint Lucian": "LC", + "Saint Martin": "MF", + "Saint Martin Islander": "MF", + "Saint Pierrais Miquelonnais": "PM", + "Saint Pierre et Miquelon": "PM", + "Saint Vincentian": "VC", + "Salvadoran": "SV", + "Sammarinese": "SM", + "Samoa Amelika": "AS", + "Samoan": "WS", + "Sao Tome e Principe": "ST", + "Sao Tomean": "ST", + "Saudi Arabian": "SA", + "Schweiz": "CH", + "Senegalese": "SN", + "Serbian": "RS", + "Sesel": "SC", + "Sewula Afrika": "ZA", + "Seychellois": "SC", + "Shqiperia": "AL", + "Shqipëria": "AL", + "Sierra Leonean": "SL", + "Singaporean": "SG", + "Singapura": "SG", + "Sint Maarten": "SX", + "Slovak": "SK", + "Slovene": "SI", + "Slovenija": "SI", + "Slovensko": "SK", + "Solomon Islander": "SB", + "Somali": "SO", + "Soomaaliya": "SO", + "South African": "ZA", + "South Georgia": "GS", + "South Georgian South 
Sandwich Islander": "GS", + "South Korean": "KR", + "South Sudanese": "SS", + "Spanish": "ES", + "Srbija": "RS", + "Sri Lankan": "LK", + "St Maartener": "SX", + "Sudanese": "SD", + "Suisse": "CH", + "Suomi": "FI", + "Surinamer": "SR", + "Svalbard og Jan Mayen": "SJ", + "Sverige": "SE", + "Svizra": "CH", + "Svizzera": "CH", + "Swazi": "SZ", + "Swedish": "SE", + "Swiss": "CH", + "Syrian": "SY", + "São Tomé e Príncipe": "ST", + "Sénégal": "SN", + "Sāmoa": "WS", + "Sāmoa Amelika": "AS", + "Tadzhik": "TJ", + "Tadzhikistan": "TJ", + "Tai Wan": "TW", + "Taiwanese": "TW", + "Tanzania": "TZ", + "Tanzanian": "TZ", + "Tchad": "TD", + "Terres australes et antarctiques francaises": "TF", + "Terres australes et antarctiques françaises": "TF", + "Thai": "TH", + "Timor Leste": "TL", + "Timór Leste": "TL", + "Tochikiston": "TJ", + "Togolese": "TG", + "Tokelauan": "TK", + "Tongan": "TO", + "Trinidadian": "TT", + "Tsrna Gora": "ME", + "Tunisian": "TN", + "Turkish": "TR", + "Turkiye": "TR", + "Turkmen": "TM", + "Turkmeniia": "TM", + "Turks and Caicos Islander": "TC", + "Tuvaluan": "TV", + "Türkiye": "TR", + "Türkmenistan": "TM", + "UK": "GB", + "US": "US", + "Uburundi": "BI", + "Ugandan": "UG", + "Ukrainian": "UA", + "Ukrayina": "UA", + "United States Virgin Islands": "VI", + "Uruguayan": "UY", + "Uzbekistani": "UZ", + "Vatican": "VA", + "Vaticanae": "VA", + "Vaticano": "VA", + "Vaticanæ": "VA", + "Venezuela": "VE", + "Venezuelan": "VE", + "Vietnam": "VN", + "Vietnamese": "VN", + "Viti": "FJ", + "Việt Nam": "VN", + "Volivia": "BO", + "Volívia": "BO", + "Wallis and Futuna Islander": "WF", + "Wallis et Futuna": "WF", + "Wuliwya": "BO", + "Xiang Gang": "HK", + "Xin Jia Po": "SG", + "Yemeni": "YE", + "Zambian": "ZM", + "Zhong Guo": "CN", + "Zhong Guo Da Lu": "CN", + "Zimbabwean": "ZW", + "`mn": "OM", + "baaNlaadesh": "BD", + "bbaart nuuN": "IN", + "bhaart": "IN", + "brug-yul-": "BT", + "canadien": "CA", + "cingkppuur": "SG", + "dhivehiraajeyge": "MV", + "eSwatini": "SZ", + "eereteraa": 
"ER", + "fGnstn": "AF", + "flsTyn": "PS", + "hangug": "KR", + "ilngkai": "LK", + "intiyaa": "IN", + "joseon": "KP", + "jybwty": "DJ", + "khoemry": "IQ", + "lSwml": "SO", + "l`rq": "IQ", + "lbHryn": "BH", + "lbnn": "LB", + "ljzyr": "DZ", + "lkwyt": "KW", + "lmGrb": "MA", + "lqmr": "KM", + "lrdn": "JO", + "lswdn": "SD", + "lyaman": "YE", + "lyby": "LY", + "mSr": "EG", + "mlysy": "MY", + "mnmaa": "MM", + "mwrytny": "MR", + "nepaal": "NP", + "phijii": "FJ", + "pkstn": "PK", + "praethsaithy": "TH", + "qTr": "QA", + "qwutnA": "IQ", + "rtry": "ER", + "sak`art`velo": "GE", + "shrii lNkaav": "LK", + "spplaaw": "LA", + "sryyl": "IL", + "swry": "SY", + "teyopheyaa": "ET", + "tshd": "TD", + "twns": "TN", + "ySHrAl": "IL", + "yrn": "IR", + "Åland": "AX", + "Ålandish": "AX", + "Éire": "IE", + "Ísland": "IS", + "Österreich": "AT", + "Česko": "CZ", + "Ελλάδα": "GR", + "Κύπρος": "CY", + "Азербайджан": "AZ", + "Белару́сь": "BY", + "Беларусь": "BY", + "Боснa и Херцеговина": "BA", + "България": "BG", + "Казахстан": "KZ", + "Киргизия": "KG", + "Кыргызстан": "KG", + "Македонија": "MK", + "Македонски": "MK", + "Монгол улс": "MN", + "Россия": "RU", + "Србија": "RS", + "Таджикистан": "TJ", + "Тоҷикистон": "TJ", + "Туркмения": "TM", + "Узбекистан": "UZ", + "Україна": "UA", + "Црна Гора": "ME", + "Қазақстан": "KZ", + "Հայաստան": "AM", + "ישראל": "IL", + "إرتريا‎": "ER", + "إسرائيل": "IL", + "افغانستان": "AF", + "الأردن": "JO", + "البحرين": "BH", + "الجزائر": "DZ", + "السعودية": "SA", + "السودان": "SD", + "الصحراء الغربية": "EH", + "الصومال‎‎": "SO", + "العراق": "IQ", + "العربية السعودية": "SA", + "القمر‎": "KM", + "الكويت": "KW", + "المغرب": "MA", + "اليَمَن": "YE", + "ایران": "IR", + "تشاد‎": "TD", + "تونس": "TN", + "جيبوتي‎": "DJ", + "دولة الإمارات العربية المتحدة": "AE", + "سوريا": "SY", + "عمان": "OM", + "فلسطين": "PS", + "قطر": "QA", + "لبنان": "LB", + "ليبيا": "LY", + "مصر": "EG", + "مليسيا": "MY", + "موريتانيا": "MR", + "پاكستان": "PK", + "کۆماری": "IQ", + "ܩܘܼܛܢܵܐ": "IQ", + 
"ދިވެހިރާއްޖޭގެ": "MV", + "नेपाल": "NP", + "फिजी": "FJ", + "भारत": "IN", + "বাংলাদেশ": "BD", + "ভারত": "IN", + "ਭਾਰਤ ਨੂੰ": "IN", + "இந்தியா": "IN", + "இலங்கை": "LK", + "சிங்கப்பூர்": "SG", + "ශ්‍රී ලංකාව": "LK", + "ประเทศไทย": "TH", + "ສປປລາວ": "LA", + "འབྲུག་ཡུལ་": "BT", + "မြန်မာ": "MM", + "საქართველო": "GE", + "ኢትዮጵያ": "ET", + "ኤርትራ": "ER", + "ⵍⵎⴰⵖⵔⵉⴱ": "MA", + "中国": "CN", + "中国大陆": "CN", + "台灣": "TW", + "新加坡": "SG", + "日本": "JP", + "澳门": "MO", + "香港": "HK", + "조선": "KP", + "한국": "KR" + }, + "ignored": [ + "bit", + "cc", + "ch", + "dan", + "day", + "gun", + "hr", + "jordan", + "la", + "ma", + "na", + "the", + "to" + ], + "implicit-languages": { + "419": "es-419", + "BR": "pt-BR", + "CA": "fr-CA", + "Cantonese": "zh", + "Castilian": "es", + "FR": "fr-FR", + "GR": "ell", + "HK": "zh-HK", + "ID": "id-ID", + "Mandarin": "zh", + "Parisian": "fr-FR", + "Simplified": "zh-Hans", + "Traditional": "zh-Hant", + "UA": "uk-UA", + "UK": "en-GB", + "US": "en-US", + "VFF": "fr-FR", + "VFQ": "fr-CA", + "VN": "vie", + "cant": "zh", + "eng": "en", + "ita": "it", + "简体双语": "zh-Hans", + "繁体双语": "zh-Hant" + }, + "languages": { + "Adygebze": "ady", + "Avanee": "grn", + "Avañeẽ": "grn", + "Aymar aru": "aym", + "Azərbaycan dili": "aze", + "Bahasa Indonesia": "ind", + "Bahasa Melayu": "msa", + "Basa Jawa": "jav", + "Basa Sunda": "sun", + "Belaruskaia": "bel", + "Blgarski": "bul", + "Bosanski": "bos", + "Brezhoneg": "bre", + "Catala": "cat", + "Català": "cat", + "Cestina": "ces", + "Cymraeg": "cym", + "Dansk": "dan", + "Davvisamegiella": "sme", + "Davvisámegiella": "sme", + "Deutsch": "deu", + "Dolnoserbscina": "dsb", + "Dolnoserbšćina": "dsb", + "Eesti": "est", + "Ellenika": "ell", + "Espanol": "spa", + "Espanol Latinoamerica": "es-419", + "Español": "spa", + "Español Latinoamérica": "es-419", + "Euskara": "eus", + "Foroyskt": "fao", + "Francais": "fra", + "Français": "fra", + "Frysk": "fry", + "Føroyskt": "fao", + "Gaeilge": "gle", + "Gaelg": "glv", + "Gaidhlig": "gla", + "Galego": 
"glg", + "Greek": "ell", + "Guang Dong Hua ": "zho", + "Gàidhlig": "gla", + "Hayeren": "hye", + "Hornjoserbscina": "hsb", + "Hornjoserbšćina": "hsb", + "Hrvatski": "hrv", + "Islenska": "isl", + "Italiano": "ita", + "Kazaksha": "kaz", + "Kernewek": "cor", + "Kiswahili": "swa", + "Kreyol": "hat", + "Kreyòl": "hat", + "Kurdi": "kur", + "Kurdî": "kur", + "Latviesu": "lav", + "Latviešu": "lav", + "Lemborgs": "lim", + "Letzebuergesch": "ltz", + "Lietuviu": "lit", + "Lietuvių": "lit", + "Lwo": "ach", + "Lèmbörgs": "lim", + "Lëtzebuergesch": "ltz", + "Magyar": "hun", + "Makedonski": "mkd", + "Malay": "msa", + "Malti": "mlt", + "Maya Kaqchikel": "cak", + "Melayu": "msa", + "Mongol": "mon", + "Nederlands": "nld", + "Norsk": "nor", + "Norsk bokmal": "nob", + "Norsk bokmål": "nob", + "Norsk nynorsk": "nno", + "Occitan": "oci", + "Ozbek": "uzb", + "Polski": "pol", + "Portugues": "por", + "Português": "por", + "Qhichwa": "que", + "Ri Ben Yu": "jpn", + "Romana": "ron", + "Română": "ron", + "Rumantsch": "roh", + "Russkii": "rus", + "Shqip": "sqi", + "Slovencina": "slk", + "Slovenscina": "slv", + "Slovenčina": "slk", + "Slovenščina": "slv", + "Soomaaliga": "som", + "Srpski": "srp", + "Suomi": "fin", + "Svenska": "swe", + "Taqbaylit": "kab", + "TcYi": "aka", + "Tieng Viet": "vie", + "Tiếng Việt": "vie", + "Turkce": "tur", + "Türkçe": "tur", + "Tɕɥi": "aka", + "Ukrayinska": "ukr", + "Zhong Wen": "zho", + "Zhong Wen Fan Ti": "zh-Hant", + "Zhong Wen Jian Ti": "zh-Hans", + "`bryt": "heb", + "aithy": "tha", + "baaNlaa": "ben", + "bhaasaakhmaer": "khm", + "bmaackaa": "mya", + "eesti keel": "est", + "frsy": "fas", + "gujraatii": "guj", + "hangugeo": "kor", + "hindii": "hin", + "isiXhosa": "xho", + "isiZulu": "zul", + "k`art`uli": "kat", + "knndd": "kan", + "maithilii maithilii": "mai", + "mlyaallN": "mal", + "mraatthii": "mar", + "nepaalii": "nep", + "oddiaa": "ori", + "pNjaabii": "pan", + "pStw": "pus", + "phaasaaaithy": "tha", + "rdw": "urd", + "sNskRtm": "san", + "siNhl": "sin", + 
"srpskokhrvatski": "hbs", + "tatarcha": "tat", + "telugu": "tel", + "tlhIngan Hol": "tlh", + "tmilll": "tam", + "tochiki": "tgk", + "yyidySH": "yid", + "zaboni tochiki": "tgk", + "Íslenska": "isl", + "Čeština": "ces", + "Ελληνικά": "ell", + "Адыгэбзэ": "ady", + "Беларуская": "bel", + "Български": "bul", + "Македонски": "mkd", + "Монгол": "mon", + "Русский": "rus", + "Српски": "srp", + "Українська": "ukr", + "забо́ни тоҷикӣ́": "tgk", + "српскохрватски": "hbs", + "татарча": "tat", + "тоҷикӣ": "tgk", + "Қазақша": "kaz", + "Հայերեն": "hye", + "ייִדיש": "yid", + "עברית": "heb", + "اردو": "urd", + "العربية": "ara", + "فارسی": "fas", + "پښتو": "pus", + "नेपाली": "nep", + "मराठी": "mar", + "मैथिली মৈথিলী": "mai", + "संस्कृतम्": "san", + "हिन्दी": "hin", + "বাংলা": "ben", + "ਪੰਜਾਬੀ": "pan", + "ગુજરાતી": "guj", + "ଓଡ଼ିଆ": "ori", + "தமிழ்": "tam", + "తెలుగు": "tel", + "ಕನ್ನಡ": "kan", + "മലയാളം": "mal", + "සිංහල": "sin", + "ภาษาไทย": "tha", + "ไทย": "tha", + "ဗမာစကာ": "mya", + "ქართული": "kat", + "ភាសាខ្មែរ": "khm", + "中文": "zho", + "中文简体": "zh-Hans", + "中文繁體": "zh-Hant", + "廣東話": "zho", + "日本語": "jpn", + "한국어": "kor" + }, + "regions": { + "Latin": "419", + "Latinoamerica": "419", + "Latinoamericano": "419", + "Latinoamérica": "419" + }, + "scripts": { + "Fan Ti ": "Hant", + "Jian Ti ": "Hans", + "Simplified": "Hans", + "Traditional": "Hant", + "简体": "Hans", + "繁體": "Hant" + } +} \ No newline at end of file diff --git a/libs/trakit/language.py b/libs/trakit/language.py new file mode 100644 index 000000000..e1a621745 --- /dev/null +++ b/libs/trakit/language.py @@ -0,0 +1,169 @@ +import typing + +from babelfish import ( + COUNTRIES, + Country, + CountryReverseError, + LANGUAGE_MATRIX, + Language, + LanguageReverseError, + SCRIPTS, + Script, + country_converters, + language_converters +) +from babelfish.converters import CaseInsensitiveDict + +from rebulk import Rebulk +from rebulk.match import Match + +from trakit.config import Config +from trakit.context import Context +from 
class LanguageFinder:
    """Find a language, optionally refined by country, region or script, in text.

    Built from a ``Config``; ``find`` is the callable handed to rebulk by
    ``language()`` below.
    """

    def __init__(self, config: Config):
        """Compute the combination sizes and build the lookup tables.

        Besides setting instance attributes, this mutates objects imported from
        babelfish: it adds a ``'419'`` entry to ``SCRIPTS`` and stores a
        ``'guess'`` entry in both ``country_converters`` and
        ``language_converters``.
        """
        # NOTE(review): count(' ') is the number of spaces, i.e. one less than
        # the number of space-separated words. Each *_max_words value below is
        # therefore "largest space count, but at least 1" - confirm intended.
        self.country_max_words = 1
        for k, v in COUNTRIES.items():
            self.country_max_words = max(self.country_max_words, v.count(' '))

        self.language_max_words = 1
        for v in LANGUAGE_MATRIX:
            self.language_max_words = max(self.language_max_words, v.name.count(' '))

        self.script_max_words = 1
        for v in config.scripts.keys():
            self.script_max_words = max(self.script_max_words, v.count(' '))

        self.region_max_words = 1
        for v in config.regions.keys():
            self.region_max_words = max(self.region_max_words, v.count(' '))

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish support UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)
        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        # Only the keys matter here; every ignored word maps to 0.
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _find_country(self, value: str):
        """Return a match for the first word combination that is a country.

        Falls off the end (returns ``None``) when no combination is accepted by
        ``Country.fromguess``.
        """
        combinations = to_combinations(to_words(value), self.country_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Country.fromguess(code))
            except CountryReverseError:
                continue

    def _find_script(self, value: str):
        """Return a match for the first word combination that is a script.

        The sentence is first mapped through ``self.scripts`` (falling back to
        the sentence itself) and then handed to ``Script``; ``ValueError``
        means "not a script" and moves on to the next combination.
        """
        combinations = to_combinations(to_words(value), self.script_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Script(self.scripts.get(code, code)))
            except ValueError:
                continue

    def _find_region(self, value: str):
        """Same as ``_find_script`` but mapping through ``self.regions``.

        The result value is still a ``Script`` instance.
        """
        combinations = to_combinations(to_words(value), self.region_max_words)
        for c in combinations:
            code = to_sentence(c)
            try:
                return to_match(c, Script(self.regions.get(code, code)))
            except ValueError:
                continue

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a language match implied by a keyword, region or country.

        For each combination, in order: the sentence itself, then the code of a
        region found in it, then the alpha2 of a country guessed from it are
        looked up in ``self.implicit``. Returns ``None`` when nothing matches.
        """
        for c in combinations:
            sentence = to_sentence(c)
            if sentence in self.implicit:
                return to_match(c, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                lang = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=lang, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    lang = Language.fromietf(self.implicit[country.alpha2])
                    # When the sentence is the language's own name, use the
                    # language resolved from that name instead of the
                    # configured implicit tag.
                    if lang.name.lower() == sentence.lower():
                        lang = Language.fromname(sentence)

                    return to_match(c, lang)
            except CountryReverseError:
                pass

    def accept_word(self, string: str):
        """Return True unless *string* is an ignored word or purely numeric."""
        return string.lower() not in self.common_words and not string.isnumeric()

    def find_language(self, value: str, context: Context):
        """Return a match for the language found in *value*, or ``None``.

        Outline of the steps, in the order they run:

        1. Release-name-like runs are blanked and the remaining words are
           filtered with ``accept_word``.
        2. An implicit language is looked up. If the context accepts it and
           its script code is numeric it is returned at once; if it exists but
           is rejected, its span is blanked and the words are recomputed.
        3. Every word combination is tried with ``Language.fromguess``; on
           success the rest of the text is scanned for a country, region or
           script to refine the language.
        4. The accepted implicit match wins over an explicit language with the
           same alpha3 that carries neither country nor script; otherwise the
           first language accepted by the context is returned.
        5. If no combination is returned, the accepted implicit match (if any)
           is the result.
        """
        value = blank_release_names(value)
        all_words = to_words(value, predicate=self.accept_word)
        combinations = to_combinations(all_words, self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted and implicit_lang.value.script and implicit_lang.value.script.code.isnumeric():
            return implicit_lang
        elif implicit_lang and not implicit_accepted:
            # Hide the rejected implicit match so it is not considered again.
            value = blank_match(implicit_lang)
            all_words = to_words(value, predicate=self.accept_word)
            combinations = to_combinations(all_words, self.language_max_words)

        for c in combinations:
            language_sentence = to_sentence(c)
            try:
                lang = Language.fromguess(language_sentence)
            except LanguageReverseError:
                continue

            match_lang = to_match(c, lang)
            # Look for refinements only in the text outside the language match.
            remaining_sentence = blank_match(match_lang)
            for combination in to_combinations(to_words(remaining_sentence), self.country_max_words):
                sentence = to_sentence(combination)
                country = self._find_country(sentence)
                if country:
                    try:
                        # discard country if value is actually the language name
                        Language.fromguess(country.raw)
                    except LanguageReverseError:
                        lang = Language(lang.alpha3, country=country.value, script=lang.script)
                        break

                region = self._find_region(sentence)
                if region:
                    lang = Language(lang.alpha3, country=lang.country, script=region.value)
                    break

                script = self._find_script(sentence)
                if script:
                    lang = Language(lang.alpha3, country=lang.country, script=script.value)
                    break

            if implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3 and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(c, lang)

        if implicit_accepted:
            return implicit_lang

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for rebulk, or ``None``."""
        match = self.find_language(value, context)
        if match:
            return match.start, match.end, {'value': match.value}


def language(config: Config):
    """Return a ``Rebulk`` with one functional pattern named ``'language'``."""
    rebulk = Rebulk()
    rebulk.functional(LanguageFinder(config).find, name='language')

    return rebulk


def configure(config: Config):
    """Build the complete ``Rebulk``: the language finder plus keyword patterns.

    The keyword patterns (``forced``, ``commentary``, ``external``, ``sdh``,
    ``alternate``, ``descriptive``, ``cc`` / ``closed-caption(s)``) ignore case
    and are validated with ``chars_surround`` bound to ``seps``.
    """
    seps_surround = partial(chars_surround, seps)

    others = Rebulk()
    others.defaults(ignore_case=True, validator=seps_surround)
    # In regex patterns a literal '-' is expanded to "any single separator".
    others.regex_defaults(flags=re.IGNORECASE,
                          abbreviations=[(r'-', rf'[{re.escape("".join(seps))}]')],
                          validator=seps_surround)
    for name in ('forced', 'commentary', 'external'):
        others.string(name, name=name, value=True)

    others.string('sdh', name='hearing_impaired', value=True)
    others.string('alternate', name='version', value='alternate')
    others.string('descriptive', name='descriptive', value=True)
    others.regex('cc', 'closed-captions?', name='closed_caption', value=True)

    rebulk = Rebulk()
    rebulk.rebulk(language(config))
    rebulk.rebulk(others)

    return rebulk
# Characters that end a word: ASCII separators plus the fullwidth parentheses.
seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')
# Characters dropped from words without ending them.
suppress_chars = frozenset("'")
# Three or more dot-separated chunks without whitespace. The group must be
# named 'release' because blank_release_names() addresses it by that name.
release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')


def to_words(value: str,
             separators: typing.FrozenSet[str] = seps,
             ignore_chars: typing.FrozenSet[str] = suppress_chars,
             predicate: typing.Callable[[str], bool] = lambda x: True):
    """Split *value* into word matches carrying their positions.

    Characters in *ignore_chars* are left out of the word text but still count
    for positions. Words rejected by *predicate* (which receives the raw slice
    of *value*) are not returned and are blanked in the ``input_string``
    attached to every returned match.
    """
    input_string = value
    start = 0
    i = 0  # position just after the character being examined
    word = ''
    words: typing.List['Match'] = []
    for c in value:
        i += 1
        if c in ignore_chars:
            continue

        if c not in separators:
            word += c
            continue

        if not word:
            # Separator with no pending word: the next word starts after it.
            start = i
            continue

        end = i - 1
        if not predicate(value[start:end]):
            input_string = blank(input_string, start, end)
        else:
            words.append(Match(start, end, value=word))

        word = ''
        start = i

    if word:
        # Trailing word that was not closed by a separator.
        if not predicate(value[start:]):
            input_string = blank(input_string, start, len(input_string))
        else:
            words.append(Match(start, i, value=word))

    for w in words:
        w.input_string = input_string

    return words


def to_combinations(words: typing.List['Match'], max_items: int):
    """Return every run of consecutive words, longest runs first.

    Run lengths go from ``min(max_items, len(words))`` down to 1; within one
    length the runs are ordered left to right.
    """
    n_words = len(words)
    results: typing.List[typing.List['Match']] = []
    for size in range(min(max_items, n_words), 0, -1):
        for start in range(n_words - size + 1):
            results.append(words[start:start + size])

    return results


def to_sentence(combination: typing.List['Match']):
    """Join the values of the matches with single spaces."""
    return ' '.join(c.value for c in combination)


def to_match(combination: typing.List['Match'], value: typing.Any):
    """Return one match spanning *combination* and carrying *value*."""
    start = combination[0].start
    end = combination[-1].end
    input_string = combination[0].input_string

    return Match(start, end, value=value, input_string=input_string)


def blank(string: str, start: int, end: int):
    """Return *string* with ``[start, end)`` replaced by spaces (same length)."""
    return string[:start] + ' ' * (end - start) + string[end:]


def blank_match(match: 'Match'):
    """Return the match's input string with the matched span blanked."""
    return blank(match.input_string, match.start, match.end)


def blank_release_names(value: str):
    """Return *value* with every release-name-like run replaced by spaces."""
    result = value
    for match in release_name_re.finditer(value):
        result = blank(result, match.start('release'), match.end('release'))

    return result