mirror of https://github.com/morpheus65535/bazarr
Try to fix the segmentation fault caused by mediainfo in a Docker container. #2098
This commit is contained in:
parent
7136383098
commit
7455496c4c
|
@ -1,10 +1,9 @@
|
|||
"""Know your media files better."""
|
||||
__title__ = 'knowit'
|
||||
__version__ = '0.4.0'
|
||||
__short_version__ = '.'.join(__version__.split('.')[:2])
|
||||
__version__ = '0.5.2'
|
||||
__short_version__ = '0.5'
|
||||
__author__ = 'Rato AQ2'
|
||||
__license__ = 'MIT'
|
||||
__copyright__ = 'Copyright 2016-2021, Rato AQ2'
|
||||
__url__ = 'https://github.com/ratoaq2/knowit'
|
||||
|
||||
#: Video extensions
|
||||
|
|
|
@ -169,7 +169,7 @@ def dumps(
|
|||
return convert(info, context)
|
||||
|
||||
|
||||
def main(args: typing.List[str] = None) -> None:
|
||||
def main(args: typing.Optional[typing.List[str]] = None) -> None:
|
||||
"""Execute main function for entry point."""
|
||||
argument_parser = build_argument_parser()
|
||||
args = args or sys.argv[1:]
|
||||
|
|
|
@ -65,7 +65,7 @@ def know(
|
|||
raise KnowitException(debug_info(context=context, exc_info=True))
|
||||
|
||||
|
||||
def dependencies(context: typing.Mapping = None) -> typing.Mapping:
|
||||
def dependencies(context: typing.Optional[typing.Mapping] = None) -> typing.Mapping:
|
||||
"""Return all dependencies detected by knowit."""
|
||||
deps = {}
|
||||
try:
|
||||
|
|
|
@ -63,6 +63,17 @@ class Property(Reportable[T]):
|
|||
# Used to detect duplicated values. e.g.: en / en or High@L4.0 / High@L4.0 or Progressive / Progressive
|
||||
self.delimiter = delimiter
|
||||
|
||||
@classmethod
def _extract_value(cls,
                   track: typing.Mapping,
                   name: str,
                   names: typing.List[str]):
    """Look up a raw value in ``track``, retrying each key in upper case.

    :param track: mapping holding the raw track data.
    :param name: full property name, used as-is for a flat lookup.
    :param names: the parts of ``name``; when there are exactly two, the
        first selects a nested mapping and the second a key inside it.
    :return: the value found, or ``None`` when neither spelling of the key exists.
    """
    if len(names) != 2:
        # Flat lookup: exact key first, upper-cased key as the fallback.
        return track.get(name, track.get(name.upper()))

    parent_key, child_key = names
    # A missing parent degrades to an empty mapping so the child lookup yields None.
    parent_value = track.get(parent_key, track.get(parent_key.upper(), {}))
    return parent_value.get(child_key, parent_value.get(child_key.upper()))
|
||||
|
||||
def extract_value(
|
||||
self,
|
||||
track: typing.Mapping,
|
||||
|
@ -71,7 +82,7 @@ class Property(Reportable[T]):
|
|||
"""Extract the property value from a given track."""
|
||||
for name in self.names:
|
||||
names = name.split('.')
|
||||
value = track.get(names[0], {}).get(names[1]) if len(names) == 2 else track.get(name)
|
||||
value = self._extract_value(track, name, names)
|
||||
if value is None:
|
||||
if self.default is None:
|
||||
continue
|
||||
|
@ -216,9 +227,10 @@ class MultiValue(Property):
|
|||
class Rule(Reportable[T]):
|
||||
"""Rule abstract class."""
|
||||
|
||||
def __init__(self, name: str, override=False, **kwargs):
|
||||
def __init__(self, name: str, private=False, override=False, **kwargs):
|
||||
"""Initialize the object."""
|
||||
super().__init__(name, **kwargs)
|
||||
self.private = private
|
||||
self.override = override
|
||||
|
||||
def execute(self, props, pv_props, context: typing.Mapping):
|
||||
|
|
|
@ -455,46 +455,46 @@ profiles:
|
|||
|
||||
VideoProfileLevel:
|
||||
L1:
|
||||
default: "1"
|
||||
default: '1'
|
||||
technical: Level 1
|
||||
L11:
|
||||
default: "1.1"
|
||||
default: '1.1'
|
||||
technical: Level 1.1
|
||||
L13:
|
||||
default: "1.3"
|
||||
default: '1.3'
|
||||
technical: Level 1.3
|
||||
L2:
|
||||
default: "2"
|
||||
default: '2'
|
||||
technical: Level 2
|
||||
L21:
|
||||
default: "2.1"
|
||||
default: '2.1'
|
||||
technical: Level 2.1
|
||||
L22:
|
||||
default: "2.2"
|
||||
default: '2.2'
|
||||
technical: Level 2.2
|
||||
L3:
|
||||
default: "3"
|
||||
default: '3'
|
||||
technical: Level 3
|
||||
L31:
|
||||
default: "3.1"
|
||||
default: '3.1'
|
||||
technical: Level 3.1
|
||||
L32:
|
||||
default: "3.2"
|
||||
default: '3.2'
|
||||
technical: Level 3.2
|
||||
L4:
|
||||
default: "4"
|
||||
default: '4'
|
||||
technical: Level 4
|
||||
L41:
|
||||
default: "4.1"
|
||||
default: '4.1'
|
||||
technical: Level 4.1
|
||||
L42:
|
||||
default: "4.2"
|
||||
default: '4.2'
|
||||
technical: Level 4.2
|
||||
L5:
|
||||
default: "5"
|
||||
default: '5'
|
||||
technical: Level 5
|
||||
L51:
|
||||
default: "5.1"
|
||||
default: '5.1'
|
||||
technical: Level 5.1
|
||||
LOW:
|
||||
default: Low
|
||||
|
|
|
@ -106,11 +106,12 @@ class Ratio(Property[Decimal]):
|
|||
if (width, height) == ('0', '1'): # identity
|
||||
return Decimal('1.0')
|
||||
|
||||
result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3)
|
||||
if self.unit:
|
||||
result *= self.unit
|
||||
if height:
|
||||
result = round_decimal(Decimal(width) / Decimal(height), min_digits=1, max_digits=3)
|
||||
if self.unit:
|
||||
result *= self.unit
|
||||
|
||||
return result
|
||||
return result
|
||||
|
||||
self.report(value, context)
|
||||
return None
|
||||
|
|
|
@ -103,10 +103,7 @@ class Provider:
|
|||
|
||||
value = prop.extract_value(track, context)
|
||||
if value is not None:
|
||||
if not prop.private:
|
||||
which = props
|
||||
else:
|
||||
which = pv_props
|
||||
which = props if not prop.private else pv_props
|
||||
which[name] = value
|
||||
|
||||
for name, rule in self.rules.get(track_type, {}).items():
|
||||
|
@ -116,8 +113,9 @@ class Provider:
|
|||
|
||||
value = rule.execute(props, pv_props, context)
|
||||
if value is not None:
|
||||
props[name] = value
|
||||
elif name in props and not rule.override:
|
||||
which = props if not rule.private else pv_props
|
||||
which[name] = value
|
||||
elif name in props and (not rule.override or props[name] is None):
|
||||
del props[name]
|
||||
|
||||
return props
|
||||
|
|
|
@ -26,6 +26,7 @@ from knowit.rules import (
|
|||
LanguageRule,
|
||||
ResolutionRule,
|
||||
)
|
||||
from knowit.rules.general import GuessTitleRule
|
||||
from knowit.serializer import get_json_encoder
|
||||
from knowit.units import units
|
||||
from knowit.utils import to_dict
|
||||
|
@ -83,17 +84,20 @@ class EnzymeProvider(Provider):
|
|||
},
|
||||
}, {
|
||||
'video': {
|
||||
'language': LanguageRule('video language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('video language', override=True),
|
||||
'resolution': ResolutionRule('video resolution'),
|
||||
},
|
||||
'audio': {
|
||||
'language': LanguageRule('audio language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('audio language', override=True),
|
||||
'channels': AudioChannelsRule('audio channels'),
|
||||
},
|
||||
'subtitle': {
|
||||
'language': LanguageRule('subtitle language'),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
|
||||
'closed_caption': ClosedCaptionRule('closed caption'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('subtitle language', override=True),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
|
||||
'closed_caption': ClosedCaptionRule('closed caption', override=True),
|
||||
}
|
||||
})
|
||||
|
||||
|
@ -130,7 +134,8 @@ class EnzymeProvider(Provider):
|
|||
|
||||
if logger.level == logging.DEBUG:
|
||||
logger.debug('Video {video_path} scanned using Enzyme {version} has raw data:\n{data}',
|
||||
video_path=video_path, version=enzyme.__version__, data=json.dumps(data))
|
||||
video_path=video_path, version=enzyme.__version__,
|
||||
data=json.dumps(data, cls=get_json_encoder(context), indent=4, ensure_ascii=False))
|
||||
|
||||
result = self._describe_tracks(video_path, data.get('info', {}), data.get('video_tracks'),
|
||||
data.get('audio_tracks'), data.get('subtitle_tracks'), context)
|
||||
|
|
|
@ -34,6 +34,7 @@ from knowit.rules import (
|
|||
LanguageRule,
|
||||
ResolutionRule,
|
||||
)
|
||||
from knowit.rules.general import GuessTitleRule
|
||||
from knowit.serializer import get_json_encoder
|
||||
from knowit.units import units
|
||||
from knowit.utils import (
|
||||
|
@ -77,7 +78,7 @@ class FFmpegExecutor:
|
|||
def extract_info(self, filename):
|
||||
"""Extract media info."""
|
||||
json_dump = self._execute(filename)
|
||||
return json.loads(json_dump)
|
||||
return json.loads(json_dump) if json_dump else {}
|
||||
|
||||
def _execute(self, filename):
|
||||
raise NotImplementedError
|
||||
|
@ -144,7 +145,7 @@ class FFmpegProvider(Provider):
|
|||
'id': Basic('index', data_type=int, allow_fallback=True, description='video track number'),
|
||||
'name': Property('tags.title', description='video track name'),
|
||||
'language': Language('tags.language', description='video language'),
|
||||
'duration': Duration('duration', description='video duration'),
|
||||
'duration': Duration('duration', 'tags.duration', description='video duration'),
|
||||
'width': Quantity('width', unit=units.pixel),
|
||||
'height': Quantity('height', unit=units.pixel),
|
||||
'scan_type': ScanType(config, 'field_order', default='Progressive', description='video scan type'),
|
||||
|
@ -153,7 +154,7 @@ class FFmpegProvider(Provider):
|
|||
'resolution': None, # populated with ResolutionRule
|
||||
'frame_rate': Ratio('r_frame_rate', unit=units.FPS, description='video frame rate'),
|
||||
# frame_rate_mode
|
||||
'bit_rate': Quantity('bit_rate', unit=units.bps, description='video bit rate'),
|
||||
'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='video bit rate'),
|
||||
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='video bit depth'),
|
||||
'codec': VideoCodec(config, 'codec_name', description='video codec'),
|
||||
'profile': VideoProfile(config, 'profile', description='video codec profile'),
|
||||
|
@ -166,13 +167,13 @@ class FFmpegProvider(Provider):
|
|||
'id': Basic('index', data_type=int, allow_fallback=True, description='audio track number'),
|
||||
'name': Property('tags.title', description='audio track name'),
|
||||
'language': Language('tags.language', description='audio language'),
|
||||
'duration': Duration('duration', description='audio duration'),
|
||||
'duration': Duration('duration', 'tags.duration', description='audio duration'),
|
||||
'codec': AudioCodec(config, 'profile', 'codec_name', description='audio codec'),
|
||||
'profile': AudioProfile(config, 'profile', description='audio codec profile'),
|
||||
'channels_count': AudioChannels('channels', description='audio channels count'),
|
||||
'channels': None, # populated with AudioChannelsRule
|
||||
'bit_depth': Quantity('bits_per_raw_sample', unit=units.bit, description='audio bit depth'),
|
||||
'bit_rate': Quantity('bit_rate', unit=units.bps, description='audio bit rate'),
|
||||
'bit_rate': Quantity('bit_rate', 'tags.bps', unit=units.bps, description='audio bit rate'),
|
||||
'sampling_rate': Quantity('sample_rate', unit=units.Hz, description='audio sampling rate'),
|
||||
'forced': YesNo('disposition.forced', hide_value=False, description='audio track forced'),
|
||||
'default': YesNo('disposition.default', hide_value=False, description='audio track default'),
|
||||
|
@ -190,17 +191,20 @@ class FFmpegProvider(Provider):
|
|||
},
|
||||
}, {
|
||||
'video': {
|
||||
'language': LanguageRule('video language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('video language', override=True),
|
||||
'resolution': ResolutionRule('video resolution'),
|
||||
},
|
||||
'audio': {
|
||||
'language': LanguageRule('audio language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('audio language', override=True),
|
||||
'channels': AudioChannelsRule('audio channels'),
|
||||
},
|
||||
'subtitle': {
|
||||
'language': LanguageRule('subtitle language'),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
|
||||
'closed_caption': ClosedCaptionRule('closed caption'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('subtitle language', override=True),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
|
||||
'closed_caption': ClosedCaptionRule('closed caption', override=True),
|
||||
},
|
||||
})
|
||||
self.executor = FFmpegExecutor.get_executor_instance(suggested_path)
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
|
||||
import ctypes
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from ctypes import c_void_p, c_wchar_p
|
||||
from decimal import Decimal
|
||||
|
@ -43,6 +44,7 @@ from knowit.rules import (
|
|||
LanguageRule,
|
||||
ResolutionRule,
|
||||
)
|
||||
from knowit.rules.general import GuessTitleRule
|
||||
from knowit.units import units
|
||||
from knowit.utils import (
|
||||
define_candidate,
|
||||
|
@ -77,7 +79,7 @@ class MediaInfoExecutor:
|
|||
|
||||
locations = {
|
||||
'unix': ('/usr/local/mediainfo/lib', '/usr/local/mediainfo/bin', '__PATH__'),
|
||||
'windows': ('__PATH__', ),
|
||||
'windows': ('C:\\Program Files\\MediaInfo', 'C:\\Program Files (x86)\\MediaInfo', '__PATH__'),
|
||||
'macos': ('__PATH__', ),
|
||||
}
|
||||
|
||||
|
@ -121,12 +123,28 @@ class MediaInfoCliExecutor(MediaInfoExecutor):
|
|||
}
|
||||
|
||||
def _execute(self, filename):
|
||||
return json.loads(check_output([self.location, '--Output=JSON', '--Full', filename]).decode())
|
||||
data = check_output([self.location, '--Output=JSON', '--Full', filename]).decode()
|
||||
|
||||
return json.loads(data) if data else {}
|
||||
|
||||
@classmethod
def _is_gui_exe(cls, candidate: str):
    """Tell whether ``candidate`` should be treated as the MediaInfo GUI binary.

    Only an existing file whose path ends with ``MediaInfo.exe`` is inspected;
    anything else is reported as not being the GUI executable.

    :param candidate: path of the executable under consideration.
    :return: ``True`` when ``shell32.ExtractIconExW`` returns a non-zero value
        for the file, ``False`` otherwise, including when the call fails
        (for instance where ``ctypes.WinDLL`` is unavailable).
    """
    is_existing_exe = candidate.endswith('MediaInfo.exe') and os.path.isfile(candidate)
    if not is_existing_exe:
        return False

    try:
        shell32 = ctypes.WinDLL('shell32', use_last_error=True)  # type: ignore
        # NOTE(review): presumably the GUI build embeds an icon while the CLI
        # build does not - confirm against real MediaInfo installations.
        extracted = shell32.ExtractIconExW(candidate, 0, None, None, 1)
        return bool(extracted)
    except Exception:
        # Best effort: any failure means "do not classify as GUI".
        return False
|
||||
|
||||
@classmethod
|
||||
def create(cls, os_family=None, suggested_path=None):
|
||||
"""Create the executor instance."""
|
||||
for candidate in define_candidate(cls.locations, cls.names, os_family, suggested_path):
|
||||
if cls._is_gui_exe(candidate):
|
||||
continue
|
||||
|
||||
try:
|
||||
output = check_output([candidate, '--version']).decode()
|
||||
version = cls._get_version(output)
|
||||
|
@ -154,7 +172,9 @@ class MediaInfoCTypesExecutor(MediaInfoExecutor):
|
|||
|
||||
def _execute(self, filename):
|
||||
# Create a MediaInfo handle
|
||||
return json.loads(MediaInfo.parse(filename, library_file=self.location, output='JSON'))
|
||||
data = MediaInfo.parse(filename, library_file=self.location, output='JSON')
|
||||
|
||||
return json.loads(data) if data else {}
|
||||
|
||||
@classmethod
|
||||
def create(cls, os_family=None, suggested_path=None):
|
||||
|
@ -254,19 +274,22 @@ class MediaInfoProvider(Provider):
|
|||
},
|
||||
}, {
|
||||
'video': {
|
||||
'language': LanguageRule('video language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('video language', override=True),
|
||||
'resolution': ResolutionRule('video resolution'),
|
||||
},
|
||||
'audio': {
|
||||
'language': LanguageRule('audio language'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('audio language', override=True),
|
||||
'channels': AudioChannelsRule('audio channels'),
|
||||
'_atmosrule': AtmosRule(config, 'atmos rule'),
|
||||
'_dtshdrule': DtsHdRule(config, 'dts-hd rule'),
|
||||
'atmos': AtmosRule(config, 'atmos rule', private=True),
|
||||
'dtshd': DtsHdRule(config, 'dts-hd rule', private=True),
|
||||
},
|
||||
'subtitle': {
|
||||
'language': LanguageRule('subtitle language'),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
|
||||
'closed_caption': ClosedCaptionRule('closed caption'),
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('subtitle language', override=True),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
|
||||
'closed_caption': ClosedCaptionRule('closed caption', override=True),
|
||||
}
|
||||
})
|
||||
self.executor = MediaInfoExecutor.get_executor_instance(suggested_path)
|
||||
|
|
|
@ -28,6 +28,7 @@ from knowit.rules import (
|
|||
LanguageRule,
|
||||
ResolutionRule,
|
||||
)
|
||||
from knowit.rules.general import GuessTitleRule
|
||||
from knowit.serializer import get_json_encoder
|
||||
from knowit.units import units
|
||||
from knowit.utils import define_candidate, detect_os
|
||||
|
@ -67,7 +68,7 @@ class MkvMergeExecutor:
|
|||
def extract_info(self, filename):
|
||||
"""Extract media info."""
|
||||
json_dump = self._execute(filename)
|
||||
return json.loads(json_dump)
|
||||
return json.loads(json_dump) if json_dump else {}
|
||||
|
||||
def _execute(self, filename):
|
||||
raise NotImplementedError
|
||||
|
@ -166,17 +167,20 @@ class MkvMergeProvider(Provider):
|
|||
},
|
||||
}, {
|
||||
'video': {
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('video language', override=True),
|
||||
'resolution': ResolutionRule('video resolution'),
|
||||
},
|
||||
'audio': {
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('audio language', override=True),
|
||||
'channels': AudioChannelsRule('audio channels'),
|
||||
},
|
||||
'subtitle': {
|
||||
'guessed': GuessTitleRule('guessed properties', private=True),
|
||||
'language': LanguageRule('subtitle language', override=True),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired'),
|
||||
'closed_caption': ClosedCaptionRule('closed caption'),
|
||||
'hearing_impaired': HearingImpairedRule('subtitle hearing impaired', override=True),
|
||||
'closed_caption': ClosedCaptionRule('closed caption', override=True),
|
||||
}
|
||||
})
|
||||
self.executor = MkvMergeExecutor.get_executor_instance(suggested_path)
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
|
||||
import re
|
||||
from logging import NullHandler, getLogger
|
||||
|
||||
import babelfish
|
||||
from trakit.api import trakit
|
||||
|
||||
from knowit.core import Rule
|
||||
|
||||
|
@ -10,22 +8,27 @@ logger = getLogger(__name__)
|
|||
logger.addHandler(NullHandler())
|
||||
|
||||
|
||||
class LanguageRule(Rule):
|
||||
"""Language rules."""
|
||||
|
||||
name_re = re.compile(r'(?P<name>\w+)\b', re.IGNORECASE)
|
||||
class GuessTitleRule(Rule):
|
||||
"""Guess properties from track title."""
|
||||
|
||||
def execute(self, props, pv_props, context):
|
||||
"""Language detection using name."""
|
||||
if 'language' in props:
|
||||
if 'name' in props:
|
||||
language = props.get('language')
|
||||
options = {'expected_language': language} if language else {}
|
||||
guessed = trakit(props['name'], options)
|
||||
if guessed:
|
||||
return guessed
|
||||
|
||||
|
||||
class LanguageRule(Rule):
|
||||
"""Language rules."""
|
||||
|
||||
def execute(self, props, pv_props, context):
|
||||
"""Language detection using name."""
|
||||
if 'guessed' not in pv_props:
|
||||
return
|
||||
|
||||
if 'name' in props:
|
||||
name = props.get('name', '')
|
||||
match = self.name_re.match(name)
|
||||
if match:
|
||||
try:
|
||||
return babelfish.Language.fromname(match.group('name'))
|
||||
except babelfish.Error:
|
||||
pass
|
||||
logger.info('Invalid %s: %r', self.description, name)
|
||||
guess = pv_props['guessed']
|
||||
if 'language' in guess:
|
||||
return guess['language']
|
||||
|
|
|
@ -10,18 +10,19 @@ class ClosedCaptionRule(Rule):
|
|||
|
||||
def execute(self, props, pv_props, context):
|
||||
"""Execute closed caption rule."""
|
||||
for name in (pv_props.get('_closed_caption'), props.get('name')):
|
||||
if name and self.cc_re.search(name):
|
||||
return True
|
||||
if '_closed_caption' in pv_props and self.cc_re.search(pv_props['_closed_caption']):
|
||||
return True
|
||||
|
||||
if 'guessed' in pv_props:
|
||||
guessed = pv_props['guessed']
|
||||
return guessed.get('closed_caption')
|
||||
|
||||
|
||||
class HearingImpairedRule(Rule):
|
||||
"""Hearing Impaired rule."""
|
||||
|
||||
hi_re = re.compile(r'(\bsdh\b)', re.IGNORECASE)
|
||||
|
||||
def execute(self, props, pv_props, context):
|
||||
"""Hearing Impaired."""
|
||||
name = props.get('name')
|
||||
if name and self.hi_re.search(name):
|
||||
return True
|
||||
if 'guessed' in pv_props:
|
||||
guessed = pv_props['guessed']
|
||||
return guessed.get('hearing_impaired')
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
import typing
|
||||
|
||||
try:
|
||||
import pint
|
||||
except ImportError:
|
||||
pint = False
|
||||
|
||||
|
||||
class NullRegistry:
|
||||
"""A NullRegistry that masquerades as a pint.UnitRegistry."""
|
||||
|
@ -25,9 +20,18 @@ class NullRegistry:
|
|||
|
||||
|
||||
def _build_unit_registry():
|
||||
registry = pint.UnitRegistry() if pint else NullRegistry()
|
||||
registry.define('FPS = 1 * hertz')
|
||||
return registry
|
||||
try:
|
||||
import pint
|
||||
|
||||
registry = pint.UnitRegistry()
|
||||
registry.define('FPS = 1 * hertz')
|
||||
|
||||
pint.set_application_registry(registry)
|
||||
return registry
|
||||
except ModuleNotFoundError:
|
||||
pass
|
||||
|
||||
return NullRegistry()
|
||||
|
||||
|
||||
units = _build_unit_registry()
|
||||
|
|
|
@ -386,7 +386,7 @@ class MediaInfo:
|
|||
A higher value will yield more precise results in some cases
|
||||
but will also increase parsing time.
|
||||
:param bool full: display additional tags, including computer-readable values
|
||||
for sizes and durations.
|
||||
for sizes and durations, corresponds to the CLI's ``--Full``/``-f`` parameter.
|
||||
:param bool legacy_stream_display: display additional information about streams.
|
||||
:param dict mediainfo_options: additional options that will be passed to the
|
||||
`MediaInfo_Option` function, for example: ``{"Language": "raw"}``.
|
||||
|
|
|
@ -0,0 +1,8 @@
|
|||
__title__ = 'trakit'
|
||||
__version__ = '0.2.1'
|
||||
__short_version__ = '0.2'
|
||||
__author__ = 'RatoAQ'
|
||||
__license__ = 'MIT'
|
||||
__url__ = 'https://github.com/ratoaq2/trakit'
|
||||
|
||||
from .api import TrakItApi, trakit
|
|
@ -0,0 +1,108 @@
|
|||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import typing
|
||||
|
||||
import babelfish
|
||||
|
||||
from trakit import TrakItApi, __version__
|
||||
|
||||
# Emit bare messages on stdout so the tool's output can be piped.
logging.basicConfig(stream=sys.stdout, format='%(message)s')

# 'CONSOLE' carries the user-facing output of the command line tool.
console = logging.getLogger('CONSOLE')
console.setLevel(logging.INFO)

# Library logger: only warnings and above unless --debug raises the verbosity.
logger = logging.getLogger('trakit')
logger.setLevel(logging.WARNING)
|
||||
|
||||
|
||||
def build_argument_parser() -> argparse.ArgumentParser:
    """Create the command line parser of the trakit entry point.

    The parser takes one positional ``value`` (the track title), an optional
    ``-l/--expected-language``, the ``--debug`` and ``-y/--yaml`` switches
    and a ``--version`` action.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(dest='value', type=str, help='track title to guess')

    configuration = parser.add_argument_group('Configuration')
    configuration.add_argument(
        '-l', '--expected-language',
        dest='expected_language',
        type=str,
        help='The expected language to be guessed',
    )

    # Both output switches are plain boolean flags.
    output = parser.add_argument_group('Output')
    switches = (
        (('--debug',), 'debug', 'Print information for debugging trakit and for reporting bugs.'),
        (('-y', '--yaml'), 'yaml', 'Display output in yaml format'),
    )
    for option_strings, dest, help_text in switches:
        output.add_argument(*option_strings, action='store_true', dest=dest, help=help_text)

    information = parser.add_argument_group('Information')
    information.add_argument('--version', action='version', version=__version__)

    return parser
|
||||
|
||||
|
||||
def _as_yaml(value: str, info: typing.Mapping[str, typing.Any]) -> str:
    """Render ``info`` as a YAML document keyed by the parsed ``value``."""
    import yaml

    def _language_as_text(representer: yaml.representer.SafeRepresenter, data: typing.Any):
        # Languages are written as plain YAML strings using their str() form.
        return representer.represent_scalar('tag:yaml.org,2002:str', str(data))

    # Registered on the safe representer class, so the registration persists
    # beyond this call.
    yaml.representer.SafeRepresenter.add_representer(babelfish.Language, _language_as_text)

    document = {value: dict(info)}
    return yaml.safe_dump(document, allow_unicode=True, sort_keys=False)


def _as_json(info: typing.Mapping[str, typing.Any]) -> str:
    """Render ``info`` as indented JSON; non-serializable values fall back to ``str``."""
    json_options = {'ensure_ascii': False, 'indent': 2, 'default': str}
    return json.dumps(info, **json_options)


def dump(value: str, info: typing.Mapping[str, typing.Any], opts: argparse.Namespace) -> str:
    """Serialize ``info`` for display.

    :param value: the parsed title; only used as the top-level key of the YAML output.
    :param info: the guessed properties.
    :param opts: parsed command line options; ``opts.yaml`` selects YAML over JSON.
    :return: the formatted text.
    """
    return _as_yaml(value, info) if opts.yaml else _as_json(info)
|
||||
|
||||
|
||||
def trakit(value: str, opts: argparse.Namespace) -> typing.Mapping:
    """Guess the properties of a track title and print them on the console.

    :param value: the track title to analyse.
    :param opts: parsed command line options; every option that is not
        ``None`` is forwarded to the API as a guessing option.
    :return: the mapping of guessed properties.
    """
    if not opts.yaml:
        console.info('Parsing: %s', value)

    # Drop unset options so that the API only sees explicit choices.
    options = {}
    for key, option in vars(opts).items():
        if option is not None:
            options[key] = option

    info = TrakItApi().trakit(value, options)
    console.info('TrakIt %s found: ', __version__)
    console.info(dump(value, info, opts))
    return info
|
||||
|
||||
|
||||
def main(args: typing.Optional[typing.List[str]] = None):
    """Run the command line interface.

    :param args: command line arguments; when empty or ``None`` the
        arguments of the current process (``sys.argv[1:]``) are used instead.
    :return: the mapping of guessed properties.
    """
    parser = build_argument_parser()
    opts = parser.parse_args(args or sys.argv[1:])

    if opts.debug:
        # --debug raises the verbosity of trakit and of its rebulk engine.
        for logger_name in ('trakit', 'rebulk'):
            logging.getLogger(logger_name).setLevel(logging.DEBUG)

    return trakit(opts.value, opts)


if __name__ == '__main__':
    main(sys.argv[1:])
|
|
@ -0,0 +1,24 @@
|
|||
import typing
|
||||
|
||||
from trakit.config import Config
|
||||
from trakit.context import Context
|
||||
from trakit.patterns import configure
|
||||
|
||||
|
||||
class TrakItApi:
    """Guess track properties from a title using a matcher configured once."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Build the matcher from ``config`` wrapped in a :class:`Config`."""
        settings = Config(config)
        # NOTE(review): presumably a rebulk matcher object - confirm in trakit.patterns.
        self.rebulk = configure(settings)

    def trakit(self, string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Return a mapping of the information extracted from ``string``.

        :param string: the text to analyse.
        :param options: guessing options, wrapped in a :class:`Context`.
        """
        context = Context(options)
        found = self.rebulk.matches(string, context)
        guess: typing.Mapping[str, typing.Any] = found.to_dict()
        return guess
|
||||
|
||||
|
||||
# Shared instance built at import time and reused by the module-level helper.
default_api = TrakItApi()


def trakit(string: str, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
    """Guess the properties of ``string`` with the shared ``default_api``.

    :param string: the text to analyse.
    :param options: optional guessing options, passed through unchanged.
    :return: whatever ``default_api.trakit`` returns for these arguments.
    """
    guess = default_api.trakit(string, options)
    return guess
|
|
@ -0,0 +1,19 @@
|
|||
import json
|
||||
import typing
|
||||
|
||||
from pkg_resources import resource_stream
|
||||
|
||||
|
||||
class Config:
    """Settings read from the bundled ``data/config.json`` plus caller overrides."""

    def __init__(self, config: typing.Optional[typing.Mapping[str, typing.Any]]):
        """Load the packaged defaults and overlay ``config`` on top of them.

        :param config: optional overrides; each top-level key present here
            replaces the bundled value for that key as a whole.
        """
        with resource_stream('trakit', 'data/config.json') as stream:
            settings: typing.Dict[str, typing.Any] = json.load(stream)
        if config:
            settings.update(config)

        # Every section is optional and defaults to an empty collection.
        self.ignored: typing.Set[str] = set(settings.get('ignored', []))
        self.countries: typing.Mapping[str, str] = settings.get('countries', {})
        self.languages: typing.Mapping[str, str] = settings.get('languages', {})
        self.scripts: typing.Mapping[str, str] = settings.get('scripts', {})
        self.regions: typing.Mapping[str, str] = settings.get('regions', {})
        self.implicit_languages: typing.Mapping[str, str] = settings.get('implicit-languages', {})
|
|
@ -0,0 +1,22 @@
|
|||
import typing
|
||||
|
||||
import babelfish
|
||||
|
||||
|
||||
class Context(dict):
    """Guessing options, with the expected language (if any) parsed up front.

    The instance is a plain ``dict`` of the options it was built from; the
    ``expected_language`` option is additionally exposed as an attribute.
    """

    def __init__(self, options: typing.Optional[typing.Mapping[str, typing.Any]] = None):
        """Store ``options`` and resolve the ``expected_language`` entry.

        :param options: guessing options; ``None`` is treated as no options.
        """
        super().__init__(options or {})
        language = self.get('expected_language')
        if language and not isinstance(language, babelfish.Language):
            # Anything that is not already a Language is parsed from its IETF text form.
            language = babelfish.Language.fromietf(str(language))
        self.expected_language: typing.Optional[babelfish.Language] = language

    def accept(self, lang: 'babelfish.Language'):
        """Tell whether ``lang`` is compatible with the expected language.

        Without an expected language every candidate is accepted. Otherwise
        the alpha3 codes must match, and the script and country must match
        too whenever the expected language defines them.

        :param lang: the candidate language.
        :return: ``True`` when the candidate is acceptable, ``False`` otherwise.
        """
        expected = self.expected_language
        if expected is None:
            return True
        if expected.alpha3 != lang.alpha3:
            return False
        # Compare script with script and country with country. Comparing the
        # whole expected language against one of those parts never matches,
        # which used to reject even an identical candidate.
        if expected.script and expected.script != lang.script:
            return False

        return not expected.country or expected.country == lang.country
|
|
@ -0,0 +1,32 @@
|
|||
import typing
|
||||
|
||||
from babelfish import Country, CountryReverseConverter, CountryReverseError
|
||||
from babelfish.converters import CaseInsensitiveDict
|
||||
|
||||
|
||||
class GuessCountryConverter(CountryReverseConverter):
    """Country converter that resolves configured synonyms before country names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Index the configured synonyms (synonym -> alpha2) case-insensitively."""
        self.synonyms = CaseInsensitiveDict(config)

    def convert(self, alpha2):
        """Return the string form of the country identified by ``alpha2``."""
        country = Country(alpha2)
        return str(country)

    def reverse(self, name: str):
        """Return the alpha2 code matching ``name``.

        Lookup order: configured synonyms, then a two-letter upper-case code,
        then the country name.

        :raises CountryReverseError: when nothing matches.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass  # not a configured synonym, keep looking

        looks_like_alpha2 = len(name) == 2 and name.isupper()
        if looks_like_alpha2:
            try:
                return Country(name).alpha2
            except ValueError:
                pass  # not a known alpha2 code

        try:
            return Country.fromname(name).alpha2
        except CountryReverseError:
            pass

        raise CountryReverseError(name)
|
|
@ -0,0 +1,30 @@
|
|||
import typing
|
||||
|
||||
from babelfish import Language, LanguageReverseConverter, LanguageReverseError
|
||||
from babelfish.converters import CaseInsensitiveDict
|
||||
|
||||
|
||||
class GuessLanguageConverter(LanguageReverseConverter):
    """Language converter that resolves configured synonyms before language names."""

    def __init__(self, config: typing.Mapping[str, str]):
        """Pre-compute ``(alpha3, country alpha2, script)`` for every synonym.

        :param config: synonym -> language code; codes containing ``-`` are
            parsed as IETF tags, the others as alpha3 codes.
        """
        self.synonyms = CaseInsensitiveDict()
        for synonym, code in config.items():
            language = Language.fromietf(code) if '-' in code else Language(code)
            country_code = language.country.alpha2 if language.country else None
            self.synonyms[synonym] = (language.alpha3, country_code, language.script)

    def convert(self, alpha3: str, country=None, script=None):
        """Return the string form of the language described by the arguments."""
        language = Language(alpha3, country, script)
        return str(language)

    def reverse(self, name: str):
        """Return the ``(alpha3, country, script)`` triple matching ``name``.

        Configured synonyms are tried first, then the language name.

        :raises LanguageReverseError: when nothing matches.
        """
        try:
            return self.synonyms[name]
        except KeyError:
            pass  # not a configured synonym, keep looking

        try:
            language = Language.fromname(name)
            return language.alpha3, language.country, language.script
        except (ValueError, LanguageReverseError):
            pass

        raise LanguageReverseError(name)
|
|
@ -0,0 +1,860 @@
|
|||
{
|
||||
"countries": {
|
||||
"Afghan": "AF",
|
||||
"Aforika Borwa": "ZA",
|
||||
"Afrika Borwa": "ZA",
|
||||
"Afrika Dzonga": "ZA",
|
||||
"Afurika Tshipembe": "ZA",
|
||||
"Aland": "AX",
|
||||
"Alandish": "AX",
|
||||
"Albanian": "AL",
|
||||
"Algerian": "DZ",
|
||||
"American": "US",
|
||||
"American Islander": "UM",
|
||||
"American Samoan": "AS",
|
||||
"American Virgin Islander": "VI",
|
||||
"Andorran": "AD",
|
||||
"Angolan": "AO",
|
||||
"Anguillian": "AI",
|
||||
"Antarctican": "AQ",
|
||||
"Antiguan Barbudan": "AG",
|
||||
"Ao Men": "MO",
|
||||
"Aotearoa": "NZ",
|
||||
"Argentine": "AR",
|
||||
"Armenian": "AM",
|
||||
"Aruban": "AW",
|
||||
"Australian": "AU",
|
||||
"Austrian": "AT",
|
||||
"Ayiti": "HT",
|
||||
"Azerbaidzhan": "AZ",
|
||||
"Azerbaijani": "AZ",
|
||||
"Azərbaycan": "AZ",
|
||||
"Bahamian": "BS",
|
||||
"Bahraini": "BH",
|
||||
"Bangladeshi": "BD",
|
||||
"Barbadian": "BB",
|
||||
"Beafrika": "CF",
|
||||
"Belarusian": "BY",
|
||||
"Belau": "PW",
|
||||
"Belgian": "BE",
|
||||
"Belgie": "BE",
|
||||
"Belgien": "BE",
|
||||
"Belgique": "BE",
|
||||
"België": "BE",
|
||||
"Belice": "BZ",
|
||||
"Belizean": "BZ",
|
||||
"Beninese": "BJ",
|
||||
"Bermudian": "BM",
|
||||
"Bhutanese": "BT",
|
||||
"Blgariia": "BG",
|
||||
"Bolivia": "BO",
|
||||
"Bolivian": "BO",
|
||||
"Boneiru Sint Eustatius y Saba": "BQ",
|
||||
"Bosna i Hercegovina": "BA",
|
||||
"Bosna i Khertsegovina": "BA",
|
||||
"Bosnian Herzegovinian": "BA",
|
||||
"Bouvetoya": "BV",
|
||||
"Bouvetøya": "BV",
|
||||
"Brasil": "BR",
|
||||
"Brazilian": "BR",
|
||||
"British": "GB",
|
||||
"British Virgin Islander": "VG",
|
||||
"British Virgin Islands": "VG",
|
||||
"Bruneian": "BN",
|
||||
"Bulgarian": "BG",
|
||||
"Buliwya": "BO",
|
||||
"Burkinabe": "BF",
|
||||
"Burmese": "MM",
|
||||
"Burundian": "BI",
|
||||
"Bénin": "BJ",
|
||||
"Bêafrîka": "CF",
|
||||
"Cabo Verde": "CV",
|
||||
"Cambodian": "KH",
|
||||
"Cameroonian": "CM",
|
||||
"Cameroun": "CM",
|
||||
"Canadian": "CA",
|
||||
"Cape Verdian": "CV",
|
||||
"Caribisch Nederland": "BQ",
|
||||
"Caymanian": "KY",
|
||||
"Central African": "CF",
|
||||
"Cesko": "CZ",
|
||||
"Chadian": "TD",
|
||||
"Channel Islander": "JE",
|
||||
"Chilean": "CL",
|
||||
"Chinese": "CN",
|
||||
"Christmas Islander": "CX",
|
||||
"Cocos Islander": "CC",
|
||||
"Cocos Keeling Islands": "CC",
|
||||
"Colombian": "CO",
|
||||
"Comoran": "KM",
|
||||
"Comores": "KM",
|
||||
"Congolese": "CD",
|
||||
"Cook Islander": "CK",
|
||||
"Costa Rican": "CR",
|
||||
"Cote dIvoire": "CI",
|
||||
"Croatian": "HR",
|
||||
"Cuban": "CU",
|
||||
"Curacao": "CW",
|
||||
"Curacaoan": "CW",
|
||||
"Curaçaoan": "CW",
|
||||
"Cypriot": "CY",
|
||||
"Czech": "CZ",
|
||||
"Côte dIvoire": "CI",
|
||||
"Danish": "DK",
|
||||
"Danmark": "DK",
|
||||
"Deutschland": "DE",
|
||||
"Dgernesiais": "GG",
|
||||
"Dgèrnésiais": "GG",
|
||||
"Ditunga dia Kongu wa Mungalaata": "CD",
|
||||
"Dominican": "DO",
|
||||
"Dutch": "NL",
|
||||
"East Timorese": "TL",
|
||||
"Ecuadorean": "EC",
|
||||
"Eesti": "EE",
|
||||
"Egyptian": "EG",
|
||||
"Eire": "IE",
|
||||
"Ellada": "GR",
|
||||
"Emirati": "AE",
|
||||
"Equatorial Guinean": "GQ",
|
||||
"Eritrean": "ER",
|
||||
"Espana": "ES",
|
||||
"España": "ES",
|
||||
"Estados Unidos": "US",
|
||||
"Estonian": "EE",
|
||||
"Eswatini": "SZ",
|
||||
"Ethiopian": "ET",
|
||||
"Faereyjar": "FO",
|
||||
"Faeroerne": "FO",
|
||||
"Falkland Islander": "FK",
|
||||
"Falkland Islands": "FK",
|
||||
"Faroese": "FO",
|
||||
"Fijian": "FJ",
|
||||
"Filipino": "PH",
|
||||
"Finnish": "FI",
|
||||
"Foroyar": "FO",
|
||||
"French": "FR",
|
||||
"French Polynesian": "PF",
|
||||
"Færeyjar": "FO",
|
||||
"Færøerne": "FO",
|
||||
"Føroyar": "FO",
|
||||
"Gabonese": "GA",
|
||||
"Gambian": "GM",
|
||||
"Georgian": "GE",
|
||||
"German": "DE",
|
||||
"Ghanaian": "GH",
|
||||
"Greek": "GR",
|
||||
"Greenlandic": "GL",
|
||||
"Grenadian": "GD",
|
||||
"Guadeloupian": "GP",
|
||||
"Guahan": "GU",
|
||||
"Guamanian": "GU",
|
||||
"Guatemalan": "GT",
|
||||
"Guernesey": "GG",
|
||||
"Guianan": "GF",
|
||||
"Guine Bissau": "GW",
|
||||
"Guine Equatorial": "GQ",
|
||||
"Guinea Bissauan": "GW",
|
||||
"Guinea Ecuatorial": "GQ",
|
||||
"Guinean": "GN",
|
||||
"Guinee": "GN",
|
||||
"Guinee equatoriale": "GQ",
|
||||
"Guiné Bissau": "GW",
|
||||
"Guiné Equatorial": "GQ",
|
||||
"Guinée": "GN",
|
||||
"Guinée équatoriale": "GQ",
|
||||
"Guyane francaise": "GF",
|
||||
"Guyane française": "GF",
|
||||
"Guyanese": "GY",
|
||||
"Guåhån": "GU",
|
||||
"Haitian": "HT",
|
||||
"Hayastan": "AM",
|
||||
"Haïti": "HT",
|
||||
"Heard and McDonald Islander": "HM",
|
||||
"Honduran": "HN",
|
||||
"Hong Konger": "HK",
|
||||
"Hrvatska": "HR",
|
||||
"Hungarian": "HU",
|
||||
"I Kiribati": "KI",
|
||||
"Icelander": "IS",
|
||||
"Indian": "IN",
|
||||
"Indonesian": "ID",
|
||||
"Iranian": "IR",
|
||||
"Iraqi": "IQ",
|
||||
"Irish": "IE",
|
||||
"Island": "IS",
|
||||
"Israeli": "IL",
|
||||
"Italia": "IT",
|
||||
"Italian": "IT",
|
||||
"Ivorian": "CI",
|
||||
"Jamaican": "JM",
|
||||
"Jamhuri ya Kidemokrasia ya Kongo": "CD",
|
||||
"Japanese": "JP",
|
||||
"Jerri": "JE",
|
||||
"Jordanian": "JO",
|
||||
"Jèrri": "JE",
|
||||
"Kalaallit Nunaat": "GL",
|
||||
"Kampuchea": "KH",
|
||||
"Kazakhstani": "KZ",
|
||||
"Kazakstan": "KZ",
|
||||
"Kenyan": "KE",
|
||||
"Kibris": "CY",
|
||||
"Kirghiz": "KG",
|
||||
"Kirgiziia": "KG",
|
||||
"Kittitian or Nevisian": "KN",
|
||||
"Komori": "KM",
|
||||
"Kuki Airani": "CK",
|
||||
"Kupros": "CY",
|
||||
"Kuwaiti": "KW",
|
||||
"Kâmpŭchéa": "KH",
|
||||
"Kıbrıs": "CY",
|
||||
"Kūki Āirani": "CK",
|
||||
"La Reunion": "RE",
|
||||
"La Réunion": "RE",
|
||||
"Laotian": "LA",
|
||||
"Latvian": "LV",
|
||||
"Latvija": "LV",
|
||||
"Lebanese": "LB",
|
||||
"Letzebuerg": "LU",
|
||||
"Liban": "LB",
|
||||
"Liberian": "LR",
|
||||
"Libyan": "LY",
|
||||
"Liechtensteiner": "LI",
|
||||
"Lietuva": "LT",
|
||||
"Lithuanian": "LT",
|
||||
"Luxembourger": "LU",
|
||||
"Luxemburg": "LU",
|
||||
"Lëtzebuerg": "LU",
|
||||
"Macanese": "MO",
|
||||
"Macau": "MO",
|
||||
"Macedonian": "MK",
|
||||
"Madagasikara": "MG",
|
||||
"Magyarorszag": "HU",
|
||||
"Magyarország": "HU",
|
||||
"Mahoran": "YT",
|
||||
"Majel": "MH",
|
||||
"Makedonija": "MK",
|
||||
"Makedonski": "MK",
|
||||
"Malagasy": "MG",
|
||||
"Malawian": "MW",
|
||||
"Malaysian": "MY",
|
||||
"Malaŵi": "MW",
|
||||
"Maldivan": "MV",
|
||||
"Malian": "ML",
|
||||
"Maltese": "MT",
|
||||
"Mannin": "IM",
|
||||
"Manx": "IM",
|
||||
"Marshallese": "MH",
|
||||
"Martinican": "MQ",
|
||||
"Maurice": "MU",
|
||||
"Mauritanian": "MR",
|
||||
"Mauritian": "MU",
|
||||
"Mexican": "MX",
|
||||
"Micronesia": "FM",
|
||||
"Micronesian": "FM",
|
||||
"Mocambique": "MZ",
|
||||
"Moldova": "MD",
|
||||
"Moldovan": "MD",
|
||||
"Monegasque": "MC",
|
||||
"Mongol uls": "MN",
|
||||
"Mongolian": "MN",
|
||||
"Montenegrin": "ME",
|
||||
"Montserratian": "MS",
|
||||
"Moris": "MU",
|
||||
"Moroccan": "MA",
|
||||
"Mosotho": "LS",
|
||||
"Motswana": "BW",
|
||||
"Mozambican": "MZ",
|
||||
"Moçambique": "MZ",
|
||||
"Mzantsi Afrika": "ZA",
|
||||
"México": "MX",
|
||||
"M̧ajeļ": "MH",
|
||||
"Na Islas Marianas": "MP",
|
||||
"Na Islas Mariånas": "MP",
|
||||
"Namibian": "NA",
|
||||
"Namibie": "NA",
|
||||
"Namibië": "NA",
|
||||
"Nauruan": "NR",
|
||||
"Nederland": "NL",
|
||||
"Negara Brunei Darussalam": "BN",
|
||||
"Nepalese": "NP",
|
||||
"New Caledonian": "NC",
|
||||
"New Zealander": "NZ",
|
||||
"Ni Vanuatu": "VU",
|
||||
"Nicaraguan": "NI",
|
||||
"Nigerian": "NG",
|
||||
"Nigerien": "NE",
|
||||
"Ningizimu Afrika": "ZA",
|
||||
"Niuean": "NU",
|
||||
"Niuē": "NU",
|
||||
"Noreg": "NO",
|
||||
"Norfk Ailen": "NF",
|
||||
"Norfolk Islander": "NF",
|
||||
"Norge": "NO",
|
||||
"Norgga": "NO",
|
||||
"North Korean": "KP",
|
||||
"Norwegian": "NO",
|
||||
"Nouvelle Caledonie": "NC",
|
||||
"Nouvelle Calédonie": "NC",
|
||||
"Omani": "OM",
|
||||
"Osterreich": "AT",
|
||||
"Owganystan": "AF",
|
||||
"Ozbekiston": "UZ",
|
||||
"O‘zbekiston": "UZ",
|
||||
"Pais Korsou": "CW",
|
||||
"Pais Kòrsou": "CW",
|
||||
"Pakistani": "PK",
|
||||
"Palauan": "PW",
|
||||
"Palestinian": "PS",
|
||||
"Panamanian": "PA",
|
||||
"Panamá": "PA",
|
||||
"Papua New Guinean": "PG",
|
||||
"Papua Niu Gini": "PG",
|
||||
"Papua Niugini": "PG",
|
||||
"Paraguai": "PY",
|
||||
"Paraguayan": "PY",
|
||||
"Paraguái": "PY",
|
||||
"Peruvian": "PE",
|
||||
"Perú": "PE",
|
||||
"Pilipinas": "PH",
|
||||
"Piruw": "PE",
|
||||
"Pitcairn Islander": "PN",
|
||||
"Pitcairn Islands": "PN",
|
||||
"Polish": "PL",
|
||||
"Polska": "PL",
|
||||
"Polynesie francaise": "PF",
|
||||
"Polynésie française": "PF",
|
||||
"Portuguese": "PT",
|
||||
"Puerto Rican": "PR",
|
||||
"Qatari": "QA",
|
||||
"RD Congo": "CD",
|
||||
"Repubilika ya Kongo": "CG",
|
||||
"Repubilika ya Kongo Demokratiki": "CD",
|
||||
"Republica Dominicana": "DO",
|
||||
"Republiki ya Kongo": "CG",
|
||||
"Republiki ya Kongo Demokratiki": "CD",
|
||||
"Republiki ya Kongó Demokratiki": "CD",
|
||||
"Republique centrafricaine": "CF",
|
||||
"Republique du Congo": "CG",
|
||||
"Republíki ya Kongó": "CG",
|
||||
"República Dominicana": "DO",
|
||||
"Reunionese": "RE",
|
||||
"Ri Ben": "JP",
|
||||
"Romanian": "RO",
|
||||
"România": "RO",
|
||||
"Rossiia": "RU",
|
||||
"Russian": "RU",
|
||||
"Rwandan": "RW",
|
||||
"République centrafricaine": "CF",
|
||||
"République du Congo": "CG",
|
||||
"Réunionese": "RE",
|
||||
"Sahara Occidental": "EH",
|
||||
"Sahrawi": "EH",
|
||||
"Saint Barthelemy": "BL",
|
||||
"Saint Barthelemy Islander": "BL",
|
||||
"Saint Barthélemy Islander": "BL",
|
||||
"Saint Helena Ascension and Tristan da Cunha": "SH",
|
||||
"Saint Helenian": "SH",
|
||||
"Saint Lucian": "LC",
|
||||
"Saint Martin": "MF",
|
||||
"Saint Martin Islander": "MF",
|
||||
"Saint Pierrais Miquelonnais": "PM",
|
||||
"Saint Pierre et Miquelon": "PM",
|
||||
"Saint Vincentian": "VC",
|
||||
"Salvadoran": "SV",
|
||||
"Sammarinese": "SM",
|
||||
"Samoa Amelika": "AS",
|
||||
"Samoan": "WS",
|
||||
"Sao Tome e Principe": "ST",
|
||||
"Sao Tomean": "ST",
|
||||
"Saudi Arabian": "SA",
|
||||
"Schweiz": "CH",
|
||||
"Senegalese": "SN",
|
||||
"Serbian": "RS",
|
||||
"Sesel": "SC",
|
||||
"Sewula Afrika": "ZA",
|
||||
"Seychellois": "SC",
|
||||
"Shqiperia": "AL",
|
||||
"Shqipëria": "AL",
|
||||
"Sierra Leonean": "SL",
|
||||
"Singaporean": "SG",
|
||||
"Singapura": "SG",
|
||||
"Sint Maarten": "SX",
|
||||
"Slovak": "SK",
|
||||
"Slovene": "SI",
|
||||
"Slovenija": "SI",
|
||||
"Slovensko": "SK",
|
||||
"Solomon Islander": "SB",
|
||||
"Somali": "SO",
|
||||
"Soomaaliya": "SO",
|
||||
"South African": "ZA",
|
||||
"South Georgia": "GS",
|
||||
"South Georgian South Sandwich Islander": "GS",
|
||||
"South Korean": "KR",
|
||||
"South Sudanese": "SS",
|
||||
"Spanish": "ES",
|
||||
"Srbija": "RS",
|
||||
"Sri Lankan": "LK",
|
||||
"St Maartener": "SX",
|
||||
"Sudanese": "SD",
|
||||
"Suisse": "CH",
|
||||
"Suomi": "FI",
|
||||
"Surinamer": "SR",
|
||||
"Svalbard og Jan Mayen": "SJ",
|
||||
"Sverige": "SE",
|
||||
"Svizra": "CH",
|
||||
"Svizzera": "CH",
|
||||
"Swazi": "SZ",
|
||||
"Swedish": "SE",
|
||||
"Swiss": "CH",
|
||||
"Syrian": "SY",
|
||||
"São Tomé e Príncipe": "ST",
|
||||
"Sénégal": "SN",
|
||||
"Sāmoa": "WS",
|
||||
"Sāmoa Amelika": "AS",
|
||||
"Tadzhik": "TJ",
|
||||
"Tadzhikistan": "TJ",
|
||||
"Tai Wan": "TW",
|
||||
"Taiwanese": "TW",
|
||||
"Tanzania": "TZ",
|
||||
"Tanzanian": "TZ",
|
||||
"Tchad": "TD",
|
||||
"Terres australes et antarctiques francaises": "TF",
|
||||
"Terres australes et antarctiques françaises": "TF",
|
||||
"Thai": "TH",
|
||||
"Timor Leste": "TL",
|
||||
"Timór Leste": "TL",
|
||||
"Tochikiston": "TJ",
|
||||
"Togolese": "TG",
|
||||
"Tokelauan": "TK",
|
||||
"Tongan": "TO",
|
||||
"Trinidadian": "TT",
|
||||
"Tsrna Gora": "ME",
|
||||
"Tunisian": "TN",
|
||||
"Turkish": "TR",
|
||||
"Turkiye": "TR",
|
||||
"Turkmen": "TM",
|
||||
"Turkmeniia": "TM",
|
||||
"Turks and Caicos Islander": "TC",
|
||||
"Tuvaluan": "TV",
|
||||
"Türkiye": "TR",
|
||||
"Türkmenistan": "TM",
|
||||
"UK": "GB",
|
||||
"US": "US",
|
||||
"Uburundi": "BI",
|
||||
"Ugandan": "UG",
|
||||
"Ukrainian": "UA",
|
||||
"Ukrayina": "UA",
|
||||
"United States Virgin Islands": "VI",
|
||||
"Uruguayan": "UY",
|
||||
"Uzbekistani": "UZ",
|
||||
"Vatican": "VA",
|
||||
"Vaticanae": "VA",
|
||||
"Vaticano": "VA",
|
||||
"Vaticanæ": "VA",
|
||||
"Venezuela": "VE",
|
||||
"Venezuelan": "VE",
|
||||
"Vietnam": "VN",
|
||||
"Vietnamese": "VN",
|
||||
"Viti": "FJ",
|
||||
"Việt Nam": "VN",
|
||||
"Volivia": "BO",
|
||||
"Volívia": "BO",
|
||||
"Wallis and Futuna Islander": "WF",
|
||||
"Wallis et Futuna": "WF",
|
||||
"Wuliwya": "BO",
|
||||
"Xiang Gang": "HK",
|
||||
"Xin Jia Po": "SG",
|
||||
"Yemeni": "YE",
|
||||
"Zambian": "ZM",
|
||||
"Zhong Guo": "CN",
|
||||
"Zhong Guo Da Lu": "CN",
|
||||
"Zimbabwean": "ZW",
|
||||
"`mn": "OM",
|
||||
"baaNlaadesh": "BD",
|
||||
"bbaart nuuN": "IN",
|
||||
"bhaart": "IN",
|
||||
"brug-yul-": "BT",
|
||||
"canadien": "CA",
|
||||
"cingkppuur": "SG",
|
||||
"dhivehiraajeyge": "MV",
|
||||
"eSwatini": "SZ",
|
||||
"eereteraa": "ER",
|
||||
"fGnstn": "AF",
|
||||
"flsTyn": "PS",
|
||||
"hangug": "KR",
|
||||
"ilngkai": "LK",
|
||||
"intiyaa": "IN",
|
||||
"joseon": "KP",
|
||||
"jybwty": "DJ",
|
||||
"khoemry": "IQ",
|
||||
"lSwml": "SO",
|
||||
"l`rq": "IQ",
|
||||
"lbHryn": "BH",
|
||||
"lbnn": "LB",
|
||||
"ljzyr": "DZ",
|
||||
"lkwyt": "KW",
|
||||
"lmGrb": "MA",
|
||||
"lqmr": "KM",
|
||||
"lrdn": "JO",
|
||||
"lswdn": "SD",
|
||||
"lyaman": "YE",
|
||||
"lyby": "LY",
|
||||
"mSr": "EG",
|
||||
"mlysy": "MY",
|
||||
"mnmaa": "MM",
|
||||
"mwrytny": "MR",
|
||||
"nepaal": "NP",
|
||||
"phijii": "FJ",
|
||||
"pkstn": "PK",
|
||||
"praethsaithy": "TH",
|
||||
"qTr": "QA",
|
||||
"qwutnA": "IQ",
|
||||
"rtry": "ER",
|
||||
"sak`art`velo": "GE",
|
||||
"shrii lNkaav": "LK",
|
||||
"spplaaw": "LA",
|
||||
"sryyl": "IL",
|
||||
"swry": "SY",
|
||||
"teyopheyaa": "ET",
|
||||
"tshd": "TD",
|
||||
"twns": "TN",
|
||||
"ySHrAl": "IL",
|
||||
"yrn": "IR",
|
||||
"Åland": "AX",
|
||||
"Ålandish": "AX",
|
||||
"Éire": "IE",
|
||||
"Ísland": "IS",
|
||||
"Österreich": "AT",
|
||||
"Česko": "CZ",
|
||||
"Ελλάδα": "GR",
|
||||
"Κύπρος": "CY",
|
||||
"Азербайджан": "AZ",
|
||||
"Белару́сь": "BY",
|
||||
"Беларусь": "BY",
|
||||
"Боснa и Херцеговина": "BA",
|
||||
"България": "BG",
|
||||
"Казахстан": "KZ",
|
||||
"Киргизия": "KG",
|
||||
"Кыргызстан": "KG",
|
||||
"Македонија": "MK",
|
||||
"Македонски": "MK",
|
||||
"Монгол улс": "MN",
|
||||
"Россия": "RU",
|
||||
"Србија": "RS",
|
||||
"Таджикистан": "TJ",
|
||||
"Тоҷикистон": "TJ",
|
||||
"Туркмения": "TM",
|
||||
"Узбекистан": "UZ",
|
||||
"Україна": "UA",
|
||||
"Црна Гора": "ME",
|
||||
"Қазақстан": "KZ",
|
||||
"Հայաստան": "AM",
|
||||
"ישראל": "IL",
|
||||
"إرتريا": "ER",
|
||||
"إسرائيل": "IL",
|
||||
"افغانستان": "AF",
|
||||
"الأردن": "JO",
|
||||
"البحرين": "BH",
|
||||
"الجزائر": "DZ",
|
||||
"السعودية": "SA",
|
||||
"السودان": "SD",
|
||||
"الصحراء الغربية": "EH",
|
||||
"الصومال": "SO",
|
||||
"العراق": "IQ",
|
||||
"العربية السعودية": "SA",
|
||||
"القمر": "KM",
|
||||
"الكويت": "KW",
|
||||
"المغرب": "MA",
|
||||
"اليَمَن": "YE",
|
||||
"ایران": "IR",
|
||||
"تشاد": "TD",
|
||||
"تونس": "TN",
|
||||
"جيبوتي": "DJ",
|
||||
"دولة الإمارات العربية المتحدة": "AE",
|
||||
"سوريا": "SY",
|
||||
"عمان": "OM",
|
||||
"فلسطين": "PS",
|
||||
"قطر": "QA",
|
||||
"لبنان": "LB",
|
||||
"ليبيا": "LY",
|
||||
"مصر": "EG",
|
||||
"مليسيا": "MY",
|
||||
"موريتانيا": "MR",
|
||||
"پاكستان": "PK",
|
||||
"کۆماری": "IQ",
|
||||
"ܩܘܼܛܢܵܐ": "IQ",
|
||||
"ދިވެހިރާއްޖޭގެ": "MV",
|
||||
"नेपाल": "NP",
|
||||
"फिजी": "FJ",
|
||||
"भारत": "IN",
|
||||
"বাংলাদেশ": "BD",
|
||||
"ভারত": "IN",
|
||||
"ਭਾਰਤ ਨੂੰ": "IN",
|
||||
"இந்தியா": "IN",
|
||||
"இலங்கை": "LK",
|
||||
"சிங்கப்பூர்": "SG",
|
||||
"ශ්රී ලංකාව": "LK",
|
||||
"ประเทศไทย": "TH",
|
||||
"ສປປລາວ": "LA",
|
||||
"འབྲུག་ཡུལ་": "BT",
|
||||
"မြန်မာ": "MM",
|
||||
"საქართველო": "GE",
|
||||
"ኢትዮጵያ": "ET",
|
||||
"ኤርትራ": "ER",
|
||||
"ⵍⵎⴰⵖⵔⵉⴱ": "MA",
|
||||
"中国": "CN",
|
||||
"中国大陆": "CN",
|
||||
"台灣": "TW",
|
||||
"新加坡": "SG",
|
||||
"日本": "JP",
|
||||
"澳门": "MO",
|
||||
"香港": "HK",
|
||||
"조선": "KP",
|
||||
"한국": "KR"
|
||||
},
|
||||
"ignored": [
|
||||
"bit",
|
||||
"cc",
|
||||
"ch",
|
||||
"dan",
|
||||
"day",
|
||||
"gun",
|
||||
"hr",
|
||||
"jordan",
|
||||
"la",
|
||||
"ma",
|
||||
"na",
|
||||
"the",
|
||||
"to"
|
||||
],
|
||||
"implicit-languages": {
|
||||
"419": "es-419",
|
||||
"BR": "pt-BR",
|
||||
"CA": "fr-CA",
|
||||
"Cantonese": "zh",
|
||||
"Castilian": "es",
|
||||
"FR": "fr-FR",
|
||||
"GR": "ell",
|
||||
"HK": "zh-HK",
|
||||
"ID": "id-ID",
|
||||
"Mandarin": "zh",
|
||||
"Parisian": "fr-FR",
|
||||
"Simplified": "zh-Hans",
|
||||
"Traditional": "zh-Hant",
|
||||
"UA": "uk-UA",
|
||||
"UK": "en-GB",
|
||||
"US": "en-US",
|
||||
"VFF": "fr-FR",
|
||||
"VFQ": "fr-CA",
|
||||
"VN": "vie",
|
||||
"cant": "zh",
|
||||
"eng": "en",
|
||||
"ita": "it",
|
||||
"简体双语": "zh-Hans",
|
||||
"繁体双语": "zh-Hant"
|
||||
},
|
||||
"languages": {
|
||||
"Adygebze": "ady",
|
||||
"Avanee": "grn",
|
||||
"Avañeẽ": "grn",
|
||||
"Aymar aru": "aym",
|
||||
"Azərbaycan dili": "aze",
|
||||
"Bahasa Indonesia": "ind",
|
||||
"Bahasa Melayu": "msa",
|
||||
"Basa Jawa": "jav",
|
||||
"Basa Sunda": "sun",
|
||||
"Belaruskaia": "bel",
|
||||
"Blgarski": "bul",
|
||||
"Bosanski": "bos",
|
||||
"Brezhoneg": "bre",
|
||||
"Catala": "cat",
|
||||
"Català": "cat",
|
||||
"Cestina": "ces",
|
||||
"Cymraeg": "cym",
|
||||
"Dansk": "dan",
|
||||
"Davvisamegiella": "sme",
|
||||
"Davvisámegiella": "sme",
|
||||
"Deutsch": "deu",
|
||||
"Dolnoserbscina": "dsb",
|
||||
"Dolnoserbšćina": "dsb",
|
||||
"Eesti": "est",
|
||||
"Ellenika": "ell",
|
||||
"Espanol": "spa",
|
||||
"Espanol Latinoamerica": "es-419",
|
||||
"Español": "spa",
|
||||
"Español Latinoamérica": "es-419",
|
||||
"Euskara": "eus",
|
||||
"Foroyskt": "fao",
|
||||
"Francais": "fra",
|
||||
"Français": "fra",
|
||||
"Frysk": "fry",
|
||||
"Føroyskt": "fao",
|
||||
"Gaeilge": "gle",
|
||||
"Gaelg": "glv",
|
||||
"Gaidhlig": "gla",
|
||||
"Galego": "glg",
|
||||
"Greek": "ell",
|
||||
"Guang Dong Hua ": "zho",
|
||||
"Gàidhlig": "gla",
|
||||
"Hayeren": "hye",
|
||||
"Hornjoserbscina": "hsb",
|
||||
"Hornjoserbšćina": "hsb",
|
||||
"Hrvatski": "hrv",
|
||||
"Islenska": "isl",
|
||||
"Italiano": "ita",
|
||||
"Kazaksha": "kaz",
|
||||
"Kernewek": "cor",
|
||||
"Kiswahili": "swa",
|
||||
"Kreyol": "hat",
|
||||
"Kreyòl": "hat",
|
||||
"Kurdi": "kur",
|
||||
"Kurdî": "kur",
|
||||
"Latviesu": "lav",
|
||||
"Latviešu": "lav",
|
||||
"Lemborgs": "lim",
|
||||
"Letzebuergesch": "ltz",
|
||||
"Lietuviu": "lit",
|
||||
"Lietuvių": "lit",
|
||||
"Lwo": "ach",
|
||||
"Lèmbörgs": "lim",
|
||||
"Lëtzebuergesch": "ltz",
|
||||
"Magyar": "hun",
|
||||
"Makedonski": "mkd",
|
||||
"Malay": "msa",
|
||||
"Malti": "mlt",
|
||||
"Maya Kaqchikel": "cak",
|
||||
"Melayu": "msa",
|
||||
"Mongol": "mon",
|
||||
"Nederlands": "nld",
|
||||
"Norsk": "nor",
|
||||
"Norsk bokmal": "nob",
|
||||
"Norsk bokmål": "nob",
|
||||
"Norsk nynorsk": "nno",
|
||||
"Occitan": "oci",
|
||||
"Ozbek": "uzb",
|
||||
"Polski": "pol",
|
||||
"Portugues": "por",
|
||||
"Português": "por",
|
||||
"Qhichwa": "que",
|
||||
"Ri Ben Yu": "jpn",
|
||||
"Romana": "ron",
|
||||
"Română": "ron",
|
||||
"Rumantsch": "roh",
|
||||
"Russkii": "rus",
|
||||
"Shqip": "sqi",
|
||||
"Slovencina": "slk",
|
||||
"Slovenscina": "slv",
|
||||
"Slovenčina": "slk",
|
||||
"Slovenščina": "slv",
|
||||
"Soomaaliga": "som",
|
||||
"Srpski": "srp",
|
||||
"Suomi": "fin",
|
||||
"Svenska": "swe",
|
||||
"Taqbaylit": "kab",
|
||||
"TcYi": "aka",
|
||||
"Tieng Viet": "vie",
|
||||
"Tiếng Việt": "vie",
|
||||
"Turkce": "tur",
|
||||
"Türkçe": "tur",
|
||||
"Tɕɥi": "aka",
|
||||
"Ukrayinska": "ukr",
|
||||
"Zhong Wen": "zho",
|
||||
"Zhong Wen Fan Ti": "zh-Hant",
|
||||
"Zhong Wen Jian Ti": "zh-Hans",
|
||||
"`bryt": "heb",
|
||||
"aithy": "tha",
|
||||
"baaNlaa": "ben",
|
||||
"bhaasaakhmaer": "khm",
|
||||
"bmaackaa": "mya",
|
||||
"eesti keel": "est",
|
||||
"frsy": "fas",
|
||||
"gujraatii": "guj",
|
||||
"hangugeo": "kor",
|
||||
"hindii": "hin",
|
||||
"isiXhosa": "xho",
|
||||
"isiZulu": "zul",
|
||||
"k`art`uli": "kat",
|
||||
"knndd": "kan",
|
||||
"maithilii maithilii": "mai",
|
||||
"mlyaallN": "mal",
|
||||
"mraatthii": "mar",
|
||||
"nepaalii": "nep",
|
||||
"oddiaa": "ori",
|
||||
"pNjaabii": "pan",
|
||||
"pStw": "pus",
|
||||
"phaasaaaithy": "tha",
|
||||
"rdw": "urd",
|
||||
"sNskRtm": "san",
|
||||
"siNhl": "sin",
|
||||
"srpskokhrvatski": "hbs",
|
||||
"tatarcha": "tat",
|
||||
"telugu": "tel",
|
||||
"tlhIngan Hol": "tlh",
|
||||
"tmilll": "tam",
|
||||
"tochiki": "tgk",
|
||||
"yyidySH": "yid",
|
||||
"zaboni tochiki": "tgk",
|
||||
"Íslenska": "isl",
|
||||
"Čeština": "ces",
|
||||
"Ελληνικά": "ell",
|
||||
"Адыгэбзэ": "ady",
|
||||
"Беларуская": "bel",
|
||||
"Български": "bul",
|
||||
"Македонски": "mkd",
|
||||
"Монгол": "mon",
|
||||
"Русский": "rus",
|
||||
"Српски": "srp",
|
||||
"Українська": "ukr",
|
||||
"забо́ни тоҷикӣ́": "tgk",
|
||||
"српскохрватски": "hbs",
|
||||
"татарча": "tat",
|
||||
"тоҷикӣ": "tgk",
|
||||
"Қазақша": "kaz",
|
||||
"Հայերեն": "hye",
|
||||
"ייִדיש": "yid",
|
||||
"עברית": "heb",
|
||||
"اردو": "urd",
|
||||
"العربية": "ara",
|
||||
"فارسی": "fas",
|
||||
"پښتو": "pus",
|
||||
"नेपाली": "nep",
|
||||
"मराठी": "mar",
|
||||
"मैथिली মৈথিলী": "mai",
|
||||
"संस्कृतम्": "san",
|
||||
"हिन्दी": "hin",
|
||||
"বাংলা": "ben",
|
||||
"ਪੰਜਾਬੀ": "pan",
|
||||
"ગુજરાતી": "guj",
|
||||
"ଓଡ଼ିଆ": "ori",
|
||||
"தமிழ்": "tam",
|
||||
"తెలుగు": "tel",
|
||||
"ಕನ್ನಡ": "kan",
|
||||
"മലയാളം": "mal",
|
||||
"සිංහල": "sin",
|
||||
"ภาษาไทย": "tha",
|
||||
"ไทย": "tha",
|
||||
"ဗမာစကာ": "mya",
|
||||
"ქართული": "kat",
|
||||
"ភាសាខ្មែរ": "khm",
|
||||
"中文": "zho",
|
||||
"中文简体": "zh-Hans",
|
||||
"中文繁體": "zh-Hant",
|
||||
"廣東話": "zho",
|
||||
"日本語": "jpn",
|
||||
"한국어": "kor"
|
||||
},
|
||||
"regions": {
|
||||
"Latin": "419",
|
||||
"Latinoamerica": "419",
|
||||
"Latinoamericano": "419",
|
||||
"Latinoamérica": "419"
|
||||
},
|
||||
"scripts": {
|
||||
"Fan Ti ": "Hant",
|
||||
"Jian Ti ": "Hans",
|
||||
"Simplified": "Hans",
|
||||
"Traditional": "Hant",
|
||||
"简体": "Hans",
|
||||
"繁體": "Hant"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,169 @@
|
|||
import typing
|
||||
|
||||
from babelfish import (
|
||||
COUNTRIES,
|
||||
Country,
|
||||
CountryReverseError,
|
||||
LANGUAGE_MATRIX,
|
||||
Language,
|
||||
LanguageReverseError,
|
||||
SCRIPTS,
|
||||
Script,
|
||||
country_converters,
|
||||
language_converters
|
||||
)
|
||||
from babelfish.converters import CaseInsensitiveDict
|
||||
|
||||
from rebulk import Rebulk
|
||||
from rebulk.match import Match
|
||||
|
||||
from trakit.config import Config
|
||||
from trakit.context import Context
|
||||
from trakit.converters.country import GuessCountryConverter
|
||||
from trakit.converters.language import GuessLanguageConverter
|
||||
from trakit.words import blank_match, blank_release_names, to_combinations, to_match, to_sentence, to_words
|
||||
|
||||
|
||||
class LanguageFinder:
    """Find a language, optionally refined by country, region or script, in text."""

    def __init__(self, config: Config):
        """Compute the word-window sizes and install the ``guess`` converters.

        Besides setting instance attributes this mutates babelfish module
        state: it adds a ``419`` entry to ``SCRIPTS`` and registers the
        ``guess`` country and language converters.
        """
        # NOTE(review): each size is the largest number of *spaces* in a name
        # (minimum 1), i.e. one less than the word count - confirm intended.
        self.country_max_words = max([1] + [name.count(' ') for name in COUNTRIES.values()])
        self.language_max_words = max([1] + [row.name.count(' ') for row in LANGUAGE_MATRIX])
        self.script_max_words = max([1] + [key.count(' ') for key in config.scripts.keys()])
        self.region_max_words = max([1] + [key.count(' ') for key in config.regions.keys()])

        SCRIPTS['419'] = 'Latin America and the Caribbean'  # Until babelfish support UN.M49
        country_converters['guess'] = GuessCountryConverter(config.countries)
        language_converters['guess'] = GuessLanguageConverter(config.languages)
        self.regions = CaseInsensitiveDict(config.regions)
        self.scripts = CaseInsensitiveDict(config.scripts)
        self.common_words = CaseInsensitiveDict(dict.fromkeys(config.ignored, 0))
        self.implicit = CaseInsensitiveDict(config.implicit_languages)

    def _find_country(self, value: str):
        """Return a match for the first word window guessed as a country, else None."""
        for window in to_combinations(to_words(value), self.country_max_words):
            candidate = to_sentence(window)
            try:
                return to_match(window, Country.fromguess(candidate))
            except CountryReverseError:
                pass
        return None

    def _find_script(self, value: str):
        """Return a match for the first word window that is a valid script, else None.

        A window is translated through the configured script synonyms before
        being handed to ``Script``.
        """
        for window in to_combinations(to_words(value), self.script_max_words):
            candidate = to_sentence(window)
            try:
                return to_match(window, Script(self.scripts.get(candidate, candidate)))
            except ValueError:
                pass
        return None

    def _find_region(self, value: str):
        """Return a match for the first word window that is a known region, else None.

        Regions are represented as ``Script`` objects; ``__init__`` registers
        ``419`` in ``SCRIPTS`` for that purpose.
        """
        for window in to_combinations(to_words(value), self.region_max_words):
            candidate = to_sentence(window)
            try:
                return to_match(window, Script(self.regions.get(candidate, candidate)))
            except ValueError:
                pass
        return None

    def _find_implicit_language(self, combinations: typing.List[typing.List[Match]]):
        """Return a match for a language implied by a keyword, region or country.

        For each word window, in order: a direct hit in the implicit table, a
        region whose code is in the table, then a guessed country whose alpha2
        code is in the table. Returns None when nothing applies.
        """
        for window in combinations:
            sentence = to_sentence(window)
            if sentence in self.implicit:
                return to_match(window, Language.fromietf(self.implicit[sentence]))

            region = self._find_region(sentence)
            if region and region.value.code in self.implicit:
                implied = Language.fromietf(self.implicit[region.value.code])
                return Match(region.start, region.end, value=implied, input_string=region.input_string)

            try:
                country = Country.fromguess(sentence)
                if country.alpha2 in self.implicit:
                    implied = Language.fromietf(self.implicit[country.alpha2])
                    # When the text is the language name itself, keep the
                    # plain language rather than the country-qualified one.
                    if implied.name.lower() == sentence.lower():
                        implied = Language.fromname(sentence)

                    return to_match(window, implied)
            except CountryReverseError:
                pass
        return None

    def accept_word(self, string: str):
        """Return False for ignored words and for purely numeric words."""
        if string.lower() in self.common_words:
            return False
        return not string.isnumeric()

    def _qualify(self, lang, remaining_sentence: str):
        """Return ``lang`` refined by the first country, region or script found.

        ``remaining_sentence`` is the input with the language words blanked.
        ``lang`` is returned unchanged when no qualifier is found.
        """
        for window in to_combinations(to_words(remaining_sentence), self.country_max_words):
            sentence = to_sentence(window)
            country = self._find_country(sentence)
            if country:
                try:
                    # discard country if value is actually the language name
                    Language.fromguess(country.raw)
                except LanguageReverseError:
                    return Language(lang.alpha3, country=country.value, script=lang.script)

            region = self._find_region(sentence)
            if region:
                return Language(lang.alpha3, country=lang.country, script=region.value)

            script = self._find_script(sentence)
            if script:
                return Language(lang.alpha3, country=lang.country, script=script.value)
        return lang

    def find_language(self, value: str, context: Context):
        """Return a match holding the language detected in ``value``, or None.

        Release-name-like tokens are blanked first. An implicit language is
        looked up, then every word window is tried with ``Language.fromguess``
        and refined with a country, region or script taken from the remaining
        text. Candidates must pass ``context.accept``.
        """
        value = blank_release_names(value)
        combinations = to_combinations(to_words(value, predicate=self.accept_word), self.language_max_words)
        implicit_lang = self._find_implicit_language(combinations)
        implicit_accepted = implicit_lang and context.accept(implicit_lang.value)

        if implicit_accepted:
            implicit_script = implicit_lang.value.script
            # A numeric script code (e.g. the 419 region) is returned at once.
            if implicit_script and implicit_script.code.isnumeric():
                return implicit_lang
        elif implicit_lang:
            # The implicit language was rejected: blank it and tokenize again.
            value = blank_match(implicit_lang)
            combinations = to_combinations(to_words(value, predicate=self.accept_word), self.language_max_words)

        for window in combinations:
            try:
                lang = Language.fromguess(to_sentence(window))
            except LanguageReverseError:
                continue

            lang = self._qualify(lang, blank_match(to_match(window, lang)))

            # Prefer the implicit match when the explicit one adds nothing.
            same_language = implicit_accepted and implicit_lang.value.alpha3 == lang.alpha3
            if same_language and not lang.country and not lang.script:
                return implicit_lang

            if context.accept(lang):
                return to_match(window, lang)

        if implicit_accepted:
            return implicit_lang
        return None

    def find(self, value: str, context: Context):
        """Return ``(start, end, {'value': language})`` for ``value``, or None."""
        match = self.find_language(value, context)
        if not match:
            return None
        return match.start, match.end, {'value': match.value}
|
||||
|
||||
|
||||
def language(config: Config):
    """Return a ``Rebulk`` with one functional pattern named ``language``.

    The pattern delegates to ``LanguageFinder(config).find``.
    """
    builder = Rebulk()
    finder = LanguageFinder(config)
    builder.functional(finder.find, name='language')
    return builder
|
|
@ -0,0 +1,32 @@
|
|||
import re
|
||||
from functools import partial
|
||||
|
||||
from rebulk import Rebulk
|
||||
from rebulk.validators import chars_surround
|
||||
|
||||
from trakit.config import Config
|
||||
from trakit.language import language
|
||||
from trakit.words import seps
|
||||
|
||||
|
||||
def configure(config: Config):
    """Assemble the top-level ``Rebulk`` from the language and flag patterns.

    The flag patterns are registered case-insensitively with a
    ``chars_surround`` validator bound to ``seps``.
    """
    surrounded_by_seps = partial(chars_surround, seps)
    # In regex patterns a literal '-' stands for any single separator.
    any_separator = rf'[{re.escape("".join(seps))}]'

    markers = Rebulk()
    markers.defaults(ignore_case=True, validator=surrounded_by_seps)
    markers.regex_defaults(
        flags=re.IGNORECASE,
        abbreviations=[(r'-', any_separator)],
        validator=surrounded_by_seps,
    )

    # Flags whose property name equals the word itself.
    for word in ('forced', 'commentary', 'external'):
        markers.string(word, name=word, value=True)

    markers.string('sdh', name='hearing_impaired', value=True)
    markers.string('alternate', name='version', value='alternate')
    markers.string('descriptive', name='descriptive', value=True)
    markers.regex('cc', 'closed-captions?', name='closed_caption', value=True)

    combined = Rebulk()
    combined.rebulk(language(config))
    combined.rebulk(markers)

    return combined
|
|
@ -0,0 +1,99 @@
|
|||
import re
|
||||
import typing
|
||||
|
||||
from rebulk.match import Match
|
||||
|
||||
# Characters that end a word in to_words(); includes the full-width
# parentheses U+FF08 and U+FF09.
seps = frozenset(r' [](){}+*|=-_~#/\\.,;:' + '\uff08\uff09')

# Characters dropped from a word's text without ending the word.
suppress_chars = frozenset("'")

# Three or more dot-joined segments containing no whitespace.
release_name_re = re.compile(r'(?P<release>[^\.\s]+(?:\.[^\.\s]+){2,})')
|
||||
|
||||
|
||||
def to_words(value: str,
             separators: typing.FrozenSet[str] = seps,
             ignore_chars: typing.FrozenSet[str] = suppress_chars,
             predicate: typing.Callable[[str], bool] = lambda x: True):
    """Split ``value`` into word matches.

    :param value: text to split.
    :param separators: characters that end a word.
    :param ignore_chars: characters left out of a word's text; they still
        count towards the match offsets.
    :param predicate: called with the raw slice of ``value`` covering a word
        (ignored characters included). Rejected words are not returned and
        are replaced by spaces in the ``input_string`` shared by the result.
    :return: list of ``Match`` objects, one per accepted word.
    """
    blanked = value
    found: typing.List[Match] = []
    letters: typing.List[str] = []
    word_start = 0
    position = 0
    for position, char in enumerate(value, start=1):
        if char in ignore_chars:
            continue

        if char not in separators:
            letters.append(char)
            continue

        # A separator closes the current word, if there is one.
        word_end = position - 1
        if letters:
            if predicate(value[word_start:word_end]):
                found.append(Match(word_start, word_end, value=''.join(letters)))
            else:
                blanked = blank(blanked, word_start, word_end)
            letters = []
        word_start = position

    # Trailing word that was not followed by a separator.
    if letters:
        if predicate(value[word_start:]):
            found.append(Match(word_start, position, value=''.join(letters)))
        else:
            blanked = blank(blanked, word_start, len(blanked))

    # Every returned match shares the same (possibly blanked) input string.
    for item in found:
        item.input_string = blanked

    return found
|
||||
|
||||
|
||||
def to_combinations(words: typing.List[Match], max_items: int):
    """Return every contiguous window of ``words``, longest windows first.

    Window sizes run from ``min(max_items, len(words))`` down to 1; windows
    of the same size are ordered left to right. The result is empty when
    ``words`` is empty or ``max_items`` is not positive.
    """
    total = len(words)
    windows: typing.List[typing.List[Match]] = []
    for size in range(min(max_items, total), 0, -1):
        for offset in range(total - size + 1):
            windows.append(words[offset:offset + size])

    return windows
|
||||
|
||||
|
||||
def to_sentence(combination: typing.List[Match]):
    """Return the ``value`` of every item in ``combination`` joined by single spaces."""
    return ' '.join(item.value for item in combination)
|
||||
|
||||
|
||||
def to_match(combination: typing.List[Match], value: typing.Any):
    """Return one ``Match`` spanning ``combination`` and carrying ``value``.

    The span runs from the first item's start to the last item's end; the
    input string is taken from the first item.
    """
    first = combination[0]
    last = combination[-1]
    return Match(first.start, last.end, value=value, input_string=first.input_string)
|
||||
|
||||
|
||||
def blank(string: str, start: int, end: int):
    """Return ``string`` with the characters in ``[start, end)`` replaced by spaces."""
    padding = ' ' * (end - start)
    return string[:start] + padding + string[end:]
|
||||
|
||||
|
||||
def blank_match(match: Match):
    """Return ``match.input_string`` with the span covered by ``match`` blanked out."""
    text = match.input_string
    return blank(text, match.start, match.end)
|
||||
|
||||
|
||||
def blank_release_names(value: str):
    """Return ``value`` with every ``release_name_re`` match replaced by spaces.

    Matches are located in the original ``value``; blanking keeps the string
    length, so their offsets stay valid while the result is built up.
    """
    result = value
    for found in release_name_re.finditer(value):
        result = blank(result, found.start('release'), found.end('release'))

    return result
|
|
@ -17,7 +17,7 @@ ga4mp==2.0.4
|
|||
guess_language-spirit==0.5.3
|
||||
guessit==3.5.0
|
||||
jsonschema==4.17.0
|
||||
knowit==0.4.0
|
||||
knowit==0.5.2
|
||||
peewee==3.15.3
|
||||
py-pretty==1
|
||||
pycountry==22.3.5
|
||||
|
@ -80,8 +80,9 @@ zipp==3.10.0
|
|||
markupsafe==2.1.1
|
||||
|
||||
# Required-by: knowit
|
||||
pymediainfo==5.1.0
|
||||
pymediainfo==6.0.1
|
||||
pyyaml==6.0
|
||||
trakit==0.2.1
|
||||
|
||||
# Required-by: python-socketio
|
||||
bidict==0.22.0
|
||||
|
|
Loading…
Reference in New Issue