From 6e521143e1de1f720dd9cf908aa7c907696187f0 Mon Sep 17 00:00:00 2001
From: vitiko98
Date: Fri, 2 Sep 2022 23:35:56 -0400
Subject: [PATCH] Embedded Subtitles provider: improve ASS subtitles filtering

---
 .../providers/embeddedsubtitles.py | 43 ++++++++++++++++++-
 .../test_embeddedsubtitles.py      | 15 +++++++
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/libs/subliminal_patch/providers/embeddedsubtitles.py b/libs/subliminal_patch/providers/embeddedsubtitles.py
index e253dd310..38089070d 100644
--- a/libs/subliminal_patch/providers/embeddedsubtitles.py
+++ b/libs/subliminal_patch/providers/embeddedsubtitles.py
@@ -3,6 +3,7 @@
 import functools
 import logging
 import os
+import re
 import shutil
 import tempfile
 
@@ -182,8 +183,19 @@ class EmbeddedSubtitlesProvider(Provider):
             "series" if isinstance(video, Episode) else "movie",
         )
 
-    def download_subtitle(self, subtitle):
+    def download_subtitle(self, subtitle: EmbeddedSubtitle):
         path = self._get_subtitle_path(subtitle)
+
+        modifiers = _type_modifiers.get(subtitle.stream.codec_name)
+        logger.debug(
+            "Found modifiers for %s type: %s", subtitle.stream.codec_name, modifiers
+        )
+
+        if modifiers is not None:
+            for mod in modifiers:
+                logger.debug("Running %s modifier for %s", mod, path)
+                mod(path, path)
+
         with open(path, "rb") as sub:
             content = sub.read()
         subtitle.content = fix_line_ending(content)
@@ -303,3 +315,32 @@ def _discard_possible_incomplete_subtitles(streams):
 def _get_pretty_release_name(stream, container):
     bname = os.path.basename(container.path)
     return f"{os.path.splitext(bname)[0]}.{stream.suffix}"
+
+
+# TODO: improve this
+_SIGNS_LINE_RE = re.compile(r",([\w|_]{,15}(sign|fx|karaoke))", flags=re.IGNORECASE)
+
+
+def _clean_ass_subtitles(path, output_path):
+    """Drop sign, FX and karaoke lines from an ASS subtitle file. Experimental.
+
+    Every line of `path` matching `_SIGNS_LINE_RE` is discarded and the
+    remaining lines are written to `output_path`, which may be the same file
+    as `path` because the input is fully read before the output is opened.
+    """
+    # Do not depend on the locale's default encoding. Assumes the extracted file
+    # is UTF-8 (TODO confirm); surrogateescape round-trips any undecodable bytes
+    # unchanged instead of raising.
+    with open(path, "r", encoding="utf-8", errors="surrogateescape") as f:
+        lines = f.readlines()
+
+    clean_lines = [line for line in lines if _SIGNS_LINE_RE.search(line) is None]
+    logger.debug("Cleaned lines: %d", len(lines) - len(clean_lines))
+
+    with open(output_path, "w", encoding="utf-8", errors="surrogateescape") as f:
+        f.writelines(clean_lines)
+        logger.debug("Lines written to output path: %s", output_path)
+
+
+# Tuples (not sets) so that modifiers always run in a deterministic order.
+_type_modifiers = {"ass": (_clean_ass_subtitles,)}
diff --git a/tests/subliminal_patch/test_embeddedsubtitles.py b/tests/subliminal_patch/test_embeddedsubtitles.py
index d858581f8..ab0dc1c41 100644
--- a/tests/subliminal_patch/test_embeddedsubtitles.py
+++ b/tests/subliminal_patch/test_embeddedsubtitles.py
@@ -10,6 +10,7 @@ from subliminal_patch.core import Episode
 from subliminal_patch.core import Movie
 from subliminal_patch.providers.embeddedsubtitles import (
     _discard_possible_incomplete_subtitles,
+    _clean_ass_subtitles,
 )
 from subliminal_patch.providers.embeddedsubtitles import _get_pretty_release_name
 from subliminal_patch.providers.embeddedsubtitles import _MemoizedFFprobeVideoContainer
@@ -283,6 +284,20 @@ def test_get_pretty_release_name():
     assert _get_pretty_release_name(stream, container) == "foo.en.forced.srt"
 
 
+def test_clean_ass_subtitles(data, tmp_path):
+    path = os.path.join(data, "subs.ass")
+
+    with open(path, "r") as f:
+        og_lines_len = len(f.readlines())
+
+    output_path = os.path.join(tmp_path, "subs.ass")
+
+    _clean_ass_subtitles(path, output_path)
+
+    with open(output_path, "r") as f:
+        assert og_lines_len > len(f.readlines())
+
+
 def test_download_subtitle_multiple(video_multiple_languages):
     with EmbeddedSubtitlesProvider() as provider:
         languages = {Language.fromalpha2(code) for code in ("en", "it", "fr")} | {