From 1548263a6c960354ef61445a5206c2190eaa79e9 Mon Sep 17 00:00:00 2001 From: Xavier Xiong Date: Fri, 12 Feb 2021 20:15:19 +0100 Subject: [PATCH] Adding simplified and traditional Chinese subtitles support (#1236) --- bazarr/embedded_subs_reader.py | 5 +++ bazarr/get_languages.py | 8 +++++ bazarr/get_subtitle.py | 14 ++++++++ bazarr/list_subtitles.py | 43 +++++++++++++++++++++++ bazarr/utils.py | 2 ++ libs/subliminal_patch/converters/assrt.py | 10 +++--- libs/subliminal_patch/core.py | 25 ++++++++++--- tests/test_assrt.py | 8 ++--- 8 files changed, 102 insertions(+), 13 deletions(-) diff --git a/bazarr/embedded_subs_reader.py b/bazarr/embedded_subs_reader.py index 8eb0f0fe5..b70487ba1 100644 --- a/bazarr/embedded_subs_reader.py +++ b/bazarr/embedded_subs_reader.py @@ -20,10 +20,15 @@ class EmbeddedSubsReader: api.initialize({'provider': 'ffmpeg', 'ffmpeg': self.ffprobe}) data = api.know(file) + traditional_chinese = ["cht", "tc", "traditional", "zht", "hant", "big5", u"繁", u"雙語"] + if 'subtitle' in data: for detected_language in data['subtitle']: if 'language' in detected_language: language = detected_language['language'].alpha3 + if language == 'zho' and 'name' in detected_language: + if any (ext in (detected_language['name'].lower()) for ext in traditional_chinese): + language = 'zht' forced = detected_language['forced'] if 'forced' in detected_language else False hearing_impaired = detected_language['hearing_impaired'] if 'hearing_impaired' in \ detected_language else False diff --git a/bazarr/get_languages.py b/bazarr/get_languages.py index 9d96c4620..a38a90215 100644 --- a/bazarr/get_languages.py +++ b/bazarr/get_languages.py @@ -19,6 +19,9 @@ def load_language_in_db(): database.execute("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) " "VALUES ('pob', 'pb', 'Brazilian Portuguese')") + database.execute("INSERT OR IGNORE INTO table_settings_languages (code3, code2, name) " + "VALUES ('zht', 'zt', 'Chinese Traditional')") + langs = [[lang.bibliographic, lang.alpha_3] for lang in pycountry.languages if hasattr(lang, 'alpha_2') and hasattr(lang, 'bibliographic')] @@ -32,6 +35,9 @@ def load_language_in_db(): def create_languages_dict(): global languages_dict + #replace chinese by chinese simplified + database.execute("UPDATE table_settings_languages SET name = 'Chinese Simplified' WHERE code3 = 'zho'") + languages_dict = database.execute("SELECT name, code2, code3, code3b FROM table_settings_languages") @@ -69,6 +75,8 @@ def get_language_set(): for lang in languages: if lang['code3'] == 'pob': language_set.add(Language('por', 'BR')) + elif lang['code3'] == 'zht': + language_set.add(Language('zho', 'TW')) else: language_set.add(Language(lang['code3'])) diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py index 11fa32ebb..520e60fb6 100644 --- a/bazarr/get_subtitle.py +++ b/bazarr/get_subtitle.py @@ -135,6 +135,12 @@ def download_subtitle(path, language, audio_language, hi, forced, providers, pro lang_obj = Language.rebuild(lang_obj, forced=True) if hi == "force HI": lang_obj = Language.rebuild(lang_obj, hi=True) + elif l == 'zht': + lang_obj = Language('zho', 'TW') + if forced == "True": + lang_obj = Language.rebuild(lang_obj, forced=True) + if hi == "force HI": + lang_obj = Language.rebuild(lang_obj, hi=True) else: lang_obj = Language(l) if forced == "True": @@ -214,6 +220,8 @@ def download_subtitle(path, language, audio_language, hi, forced, providers, pro downloaded_provider = subtitle.provider_name if subtitle.language == 'pt-BR': downloaded_language_code3 = 'pob' + elif subtitle.language == 'zh-TW': + downloaded_language_code3 = 'zht' else: downloaded_language_code3 = subtitle.language.alpha3 downloaded_language = language_from_alpha3(downloaded_language_code3) @@ -323,6 +331,8 @@ def manual_search(path, profileId, providers, providers_auth, sceneName, title, if lang == 'pob': lang_obj = Language('por', 'BR') + elif lang == 'zht': + lang_obj = Language('zho', 'TW') else: lang_obj = Language(lang) @@ -530,6 +540,8 @@ def manual_download_subtitle(path, language, audio_language, hi, forced, subtitl downloaded_provider = saved_subtitle.provider_name if saved_subtitle.language == 'pt-BR': downloaded_language_code3 = 'pob' + elif saved_subtitle.language == 'zh-TW': + downloaded_language_code3 = 'zht' else: downloaded_language_code3 = subtitle.language.alpha3 downloaded_language = language_from_alpha3(downloaded_language_code3) @@ -631,6 +643,8 @@ def manual_upload_subtitle(path, language, forced, title, scene_name, media_type if language == 'pob': lang_obj = Language('por', 'BR') + elif language == 'zht': + lang_obj = Language('zho', 'TW') else: lang_obj = Language(language) diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py index bb997ad22..cc2521feb 100644 --- a/bazarr/list_subtitles.py +++ b/bazarr/list_subtitles.py @@ -58,6 +58,10 @@ def store_subtitles(original_path, reversed_path): brazilian_portuguese = [".pt-br", ".pob", "pb"] brazilian_portuguese_forced = [".pt-br.forced", ".pob.forced", "pb.forced"] + simplified_chinese = [".chs", ".sc", ".zhs", ".hans", ".gb", u"简", u"双语"] + simplified_chinese_forced = [".chs.forced", ".sc.forced", ".zhs.forced", ".hans.forced", ".gb.forced", u"简体中文.forced", u"双语.forced"] + traditional_chinese = [".cht", ".tc", ".zht", ".hant", ".big5", u"繁", u"雙語"] + traditional_chinese_forced = [".cht.forced", ".tc.forced", ".zht.forced",".hant.forced", ".big5.forced", u"繁體中文.forced", u"雙語.forced"] try: dest_folder = get_subtitle_destination_folder() core.CUSTOM_PATHS = [dest_folder] if dest_folder else [] @@ -84,6 +88,22 @@ def store_subtitles(original_path, reversed_path): logging.debug("BAZARR external subtitles detected: " + "pb:forced") actual_subtitles.append( [str("pb:forced"), path_mappings.path_replace_reverse(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in simplified_chinese): + logging.debug("BAZARR external subtitles detected: " + "zh") + actual_subtitles.append( + [str("zh"), path_mappings.path_replace_reverse(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in simplified_chinese_forced): + logging.debug("BAZARR external subtitles detected: " + "zh:forced") + actual_subtitles.append( + [str("zh:forced"), path_mappings.path_replace_reverse(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in traditional_chinese): + logging.debug("BAZARR external subtitles detected: " + "zt") + actual_subtitles.append( + [str("zt"), path_mappings.path_replace_reverse(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in traditional_chinese_forced): + logging.debug("BAZARR external subtitles detected: " + "zt:forced") + actual_subtitles.append( + [str("zt:forced"), path_mappings.path_replace_reverse(subtitle_path)]) elif not language: continue elif str(language) != 'und': @@ -149,6 +169,10 @@ def store_subtitles_movie(original_path, reversed_path): brazilian_portuguese = [".pt-br", ".pob", "pb"] brazilian_portuguese_forced = [".pt-br.forced", ".pob.forced", "pb.forced"] + simplified_chinese = [".chs", ".sc", ".zhs", ".hans", ".gb", u"简", u"双语"] + simplified_chinese_forced = [".chs.forced", ".sc.forced", ".zhs.forced", ".hans.forced", ".gb.forced", u"简体中文.forced", u"双语.forced"] + traditional_chinese = [".cht", ".tc", ".zht", ".hant", ".big5", u"繁", u"雙語", "zh-tw"] + traditional_chinese_forced = [".cht.forced", ".tc.forced", ".zht.forced",".hant.forced", ".big5.forced", u"繁體中文.forced", u"雙語.forced", "zh-tw.forced"] try: dest_folder = get_subtitle_destination_folder() or '' core.CUSTOM_PATHS = [dest_folder] if dest_folder else [] @@ -172,6 +196,18 @@ def store_subtitles_movie(original_path, reversed_path): elif str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese_forced)): logging.debug("BAZARR external subtitles detected: " + "pb:forced") actual_subtitles.append([str("pb:forced"), path_mappings.path_replace_reverse_movie(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in simplified_chinese): + logging.debug("BAZARR external subtitles detected: " + "zh") + actual_subtitles.append([str("zh"), path_mappings.path_replace_reverse_movie(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in simplified_chinese_forced): + logging.debug("BAZARR external subtitles detected: " + "zh:forced") + actual_subtitles.append([str("zh:forced"), path_mappings.path_replace_reverse_movie(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in traditional_chinese): + logging.debug("BAZARR external subtitles detected: " + "zt") + actual_subtitles.append([str("zt"), path_mappings.path_replace_reverse_movie(subtitle_path)]) + elif any(ext in (str(os.path.splitext(subtitle)[0]).lower())[-12:] for ext in traditional_chinese_forced): + logging.debug("BAZARR external subtitles detected: " + "zt:forced") + actual_subtitles.append([str("zt:forced"), path_mappings.path_replace_reverse_movie(subtitle_path)]) elif not language: continue elif str(language.basename) != 'und': @@ -492,6 +528,13 @@ def guess_external_subtitles(dest_folder, subtitles): try: text = text.decode('utf-8') detected_language = guess_language(text) + #add simplified and traditional chinese detection + if detected_language == 'zh': + simplified_chinese = [".chs", ".sc", ".zhs", ".hans", ".gb", u"简", u"双语"] + if any(ext in str(subtitle_path) for ext in simplified_chinese): + detected_language == 'zh' + else: + detected_language == 'zt' except UnicodeDecodeError: detector = Detector() try: diff --git a/bazarr/utils.py b/bazarr/utils.py index ba8955eeb..09c2a5c60 100644 --- a/bazarr/utils.py +++ b/bazarr/utils.py @@ -326,6 +326,8 @@ def subtitles_apply_mods(language, subtitle_path, mods): if language == 'pob': lang_obj = Language('por', 'BR') + elif language == 'zht': + lang_obj = Language('zho', 'TW') else: lang_obj = Language(language) diff --git a/libs/subliminal_patch/converters/assrt.py b/libs/subliminal_patch/converters/assrt.py index 536d4b4ff..eb3ed29da 100644 --- a/libs/subliminal_patch/converters/assrt.py +++ b/libs/subliminal_patch/converters/assrt.py @@ -5,12 +5,12 @@ from subliminal.exceptions import ConfigurationError class AssrtConverter(LanguageReverseConverter): def __init__(self): - self.from_assrt = { u'简体': ('zho', None, 'Hans'), u'繁体': ('zho', None, 'Hant'), - u'簡體': ('zho', None, 'Hans'), u'繁體': ('zho', None, 'Hant'), + self.from_assrt = { u'简体': ('zho', 'CN', None), u'繁体': ('zho', 'TW', None), + u'簡體': ('zho', 'CN', None), u'繁體': ('zho', 'TW', None), u'英文': ('eng',), - u'chs': ('zho', None, 'Hans'), u'cht': ('zho', None, 'Hant'), - u'chn': ('zho', None, 'Hans'), u'twn': ('zho', None, 'Hant')} - self.to_assrt = { ('zho', None, 'Hans'): u'chs', ('zho', None, 'Hant'): u'cht', + u'chs': ('zho', 'CN', None), u'cht': ('zho', 'TW', None), + u'chn': ('zho', 'CN', None), u'twn': ('zho', 'TW', None)} + self.to_assrt = { ('zho', 'CN', None): u'chs', ('zho', 'TW', None): u'cht', ('eng', None, None) : u'eng', ('zho', None, None): u'chs'} self.codes = set(self.from_assrt.keys()) diff --git a/libs/subliminal_patch/core.py b/libs/subliminal_patch/core.py index de3829188..76e11402a 100644 --- a/libs/subliminal_patch/core.py +++ b/libs/subliminal_patch/core.py @@ -636,6 +636,13 @@ def _search_external_subtitles(path, languages=None, only_one=False, scandir_gen hi_tag = ["hi", "cc", "sdh"] hi = any(i for i in hi_tag if i in adv_tag) + #add simplified/traditional chinese detection + simplified_chinese = ["chs", "sc", "zhs", "hans", "gb", u"简", u"双语"] + traditional_chinese = ["cht", "tc", "zht", "hant", "big5", u"繁", u"雙語"] + FULL_LANGUAGE_LIST.extend(simplified_chinese) + FULL_LANGUAGE_LIST.extend(traditional_chinese) + p_root = p_root.replace('zh-TW', 'zht') + # remove possible language code for matching p_root_bare = ENDSWITH_LANGUAGECODE_RE.sub( lambda m: "" if str(m.group(1)).lower() in FULL_LANGUAGE_LIST else m.group(0), p_root) @@ -655,14 +662,24 @@ def _search_external_subtitles(path, languages=None, only_one=False, scandir_gen try: language_code = p_root.rsplit(".", 1)[1].replace('_', '-') try: - language = Language.fromietf(language_code) + language = Language.fromietf(language_code) language.forced = forced language.hi = hi except (ValueError, LanguageReverseError): - logger.error('Cannot parse language code %r', language_code) - language_code = None + #add simplified/traditional chinese detection + if any(ext in str(language_code) for ext in simplified_chinese): + language = Language.fromietf('zh') + language.forced = forced + language.hi = hi + elif any(ext in str(language_code) for ext in traditional_chinese): + language = Language.fromietf('zh') + language.forced = forced + language.hi = hi + else: + logger.error('Cannot parse language code %r', language_code) + language_code = None except IndexError: - language_code = None + language_code = None if not language and not language_code and only_one: language = Language.rebuild(list(languages)[0], forced=forced, hi=hi) diff --git a/tests/test_assrt.py b/tests/test_assrt.py index 89f51a16e..903bb5ebb 100644 --- a/tests/test_assrt.py +++ b/tests/test_assrt.py @@ -75,15 +75,15 @@ def test_get_matches_movie_name(movies): @pytest.mark.converter def test_converter_convert_alpha3(): - assert language_converters['assrt'].convert('zho', None, 'Hans') == 'chs' - assert language_converters['assrt'].convert('zho', None, 'Hant') == 'cht' + assert language_converters['assrt'].convert('zho', None, 'Hans') == 'chi' + assert language_converters['assrt'].convert('zho', None, 'Hant') == 'zht' assert language_converters['assrt'].convert('eng') == 'eng' @pytest.mark.converter def test_converter_reverse(): - assert language_converters['assrt'].reverse('chs') == ('zho', None, 'Hans') - assert language_converters['assrt'].reverse('cht') == ('zho', None, 'Hant') + assert language_converters['assrt'].reverse('chi') == ('zho', None, 'Hans') + assert language_converters['assrt'].reverse('zht') == ('zho', None, 'Hant') assert language_converters['assrt'].reverse(u'簡體') == ('zho', None, 'Hans') assert language_converters['assrt'].reverse(u'繁體') == ('zho', None, 'Hant') assert language_converters['assrt'].reverse(u'简体') == ('zho', None, 'Hans')