bazarr/libs/subliminal_patch/providers/utils.py

from collections import namedtuple
from difflib import SequenceMatcher
import functools
import io
import logging
import os
import re
import tempfile
from typing import Iterable, Union
import zipfile

from guessit import guessit
import pysubs2
import rarfile
from subliminal.subtitle import fix_line_ending
from subliminal_patch.exceptions import MustGetBlacklisted
from subliminal_patch.core import Episode
from subliminal_patch.subtitle import guess_matches

from ._agent_list import FIRST_THOUSAND_OR_SO_USER_AGENTS

# Module-level logger shared by every helper in this file.
logger = logging.getLogger(__name__)


# Candidate produced while picking a subtitle out of a pack:
#   file     -- the subtitle file name inside the archive
#   priority -- integer rank; candidates are sorted so the highest one wins
#   context  -- short human-readable reason for the match (only used in logs)
_MatchingSub = namedtuple("_MatchingSub", ("file", "priority", "context"))


def blacklist_on(*exc_types):
    """Build a decorator that turns selected exceptions into MustGetBlacklisted.

    The decorated callable must have the signature ``method(self, subtitle)``.
    If it raises any of ``exc_types``, the error is logged with its traceback
    and ``MustGetBlacklisted(subtitle.id, subtitle.media_type)`` is raised in
    its place, chained to the original exception. Any other exception
    propagates untouched, and the method's return value is passed through.
    """

    def decorator(method):
        # Keep the wrapped method's __name__/__doc__ so logs and introspection
        # still point at the real provider method.
        @functools.wraps(method)
        def wrapper(self, subtitle):
            try:
                return method(self, subtitle)
            except exc_types as error:
                logger.error("Sending blacklist exception", exc_info=True)
                raise MustGetBlacklisted(subtitle.id, subtitle.media_type) from error

        return wrapper

    return decorator


def _get_matching_sub(
    sub_names, forced=False, episode=None, episode_title=None, **kwargs
):
    """Pick the best subtitle file name from ``sub_names``.

    :param sub_names: iterable of subtitle file names (e.g. archive members).
    :param forced: when False, names whose stem ends with "forced" are skipped.
    :param episode: wanted episode number, or None when unknown / a movie.
    :param episode_title: wanted episode title, or None.
    :param kwargs: accepted for call-site compatibility and ignored.
    :return: the chosen file name, or None if nothing matched.

    When neither ``episode`` nor ``episode_title`` is given the request is
    treated as a movie and the first non-skipped name is returned. Otherwise
    every name is scored by title similarity and/or episode number, and the
    candidate with the highest priority wins (ties keep the input order).
    """
    guess_options = {"single_value": True}
    if episode is not None:
        guess_options["type"] = "episode"  # type: ignore

    matching_subs = []

    for sub_name in sub_names:
        if not forced and os.path.splitext(sub_name.lower())[0].endswith("forced"):
            logger.debug("Ignoring forced subtitle: %s", sub_name)
            continue

        # If it's a movie then get the first subtitle
        if episode is None and episode_title is None:
            logger.debug("Movie subtitle found: %s", sub_name)
            matching_subs.append(_MatchingSub(sub_name, 2, "Movie subtitle"))
            break

        guess = guessit(sub_name, options=guess_options)

        matched_episode_num = guess.get("episode")
        if matched_episode_num is None:
            logger.debug("No episode number found in file: %s", sub_name)

        if episode_title is not None:
            from_name = _analize_sub_name(sub_name, episode_title)
            if from_name is not None:
                matching_subs.append(from_name)

        # Require an actual guessed number: without this guard a title-only
        # lookup (episode=None) would "match" every file that has no episode
        # number at all, because None == None.
        if matched_episode_num is not None and episode == matched_episode_num:
            logger.debug("Episode matched from number: %s", sub_name)
            matching_subs.append(_MatchingSub(sub_name, 2, "Episode number matched"))

    if not matching_subs:
        logger.debug("Nothing matched")
        return None

    # Stable sort: highest priority first, input order preserved among equals.
    matching_subs.sort(key=lambda x: x.priority, reverse=True)
    logger.debug("Matches: %s", matching_subs)
    return matching_subs[0].file


def _analize_sub_name(sub_name: str, title_):
    """Score ``sub_name`` against the episode title ``title_``.

    The file name (without extension) is cut on "." and "-" and each stripped
    piece is compared with ``title_`` using SequenceMatcher. The first piece
    whose ratio is above 0.85 decides the result: a priority-3 candidate for a
    near-perfect ratio on a title longer than four characters, otherwise a
    priority-1 candidate. Returns None when no piece is similar enough.
    """
    stem, _extension = os.path.splitext(sub_name)

    for chunk in re.split(r"[.-]", stem):
        candidate = chunk.strip()
        score = SequenceMatcher(None, candidate, title_).ratio()
        if score <= 0.85:
            continue

        logger.debug(
            "Episode title matched: '%s' -> '%s' [%s]", candidate, sub_name, score
        )

        # Short titles reach very high ratios too easily, so they never get
        # the top priority.
        near_perfect = len(title_) > 4 and score >= 0.98
        if near_perfect:
            return _MatchingSub(sub_name, 3, "Perfect title ratio")
        return _MatchingSub(sub_name, 1, "Normal title ratio")

    logger.debug("No episode title matched from file: %s", sub_name)
    return None


def get_subtitle_from_archive(
    archive, forced=False, episode=None, get_first_subtitle=False, **kwargs
):
    """Get subtitle content from a RarFile/ZipFile object.

    :param archive: object exposing ``namelist()`` and ``read(name)``.
    :param forced: forwarded to the file picker; when False, names whose stem
        ends with "forced" are skipped.
    :param episode: wanted episode number, forwarded to the file picker.
    :param get_first_subtitle: when True, return the first subtitle found
        without trying to pick the best one.
    :param kwargs: forwarded to ``_get_matching_sub`` (e.g. ``episode_title``).
    :return: the member's content passed through ``fix_line_ending``, or None
        if nothing is found.
    """
    subtitle_exts = (".srt", ".sub", ".ssa", ".ass")
    # Compare extensions case-insensitively so members such as "Movie.SRT"
    # are not silently dropped.
    subs_in_archive = [
        name for name in archive.namelist() if name.lower().endswith(subtitle_exts)
    ]

    if not subs_in_archive:
        logger.info("No subtitles found in archive")
        return None

    logger.debug("Subtitles in archive: %s", subs_in_archive)

    if len(subs_in_archive) == 1 or get_first_subtitle:
        first_sub = subs_in_archive[0]
        logger.debug("Getting first subtitle in archive: %s", first_sub)
        return fix_line_ending(archive.read(first_sub))

    matching_sub = _get_matching_sub(subs_in_archive, forced, episode, **kwargs)

    if matching_sub is not None:
        logger.info("Using %s from archive", matching_sub)
        return fix_line_ending(archive.read(matching_sub))

    logger.debug("No subtitle found in archive")
    return None


def is_episode(content):
    "Tell whether guessit, forced to episode mode, finds an episode in content."
    guessed = guessit(content, {"type": "episode"})
    return "episode" in guessed


# Encodings tried, in this order, by _zip_from_subtitle_file when loading raw
# subtitle bytes; the first one that loads without raising is used.
_ENCS = ("utf-8", "ascii", "iso-8859-1", "iso-8859-2", "iso-8859-5", "cp1252")


def _zip_from_subtitle_file(content):
    """Wrap raw subtitle bytes in an in-memory ZipFile.

    ``content`` is written to a temporary ".srt" file and loaded with pysubs2,
    trying each encoding in ``_ENCS`` until one loads without raising.

    :param content: raw bytes of a (presumed) subtitle file.
    :return: a ZipFile (opened in "x" mode over a BytesIO) holding the
        temporary file as its single member, or None if pysubs2 could not
        load the content with any of the encodings.
    """
    # delete=False lets us close the file before reopening it by name: closing
    # flushes the buffered bytes to disk, and Windows refuses to reopen a
    # NamedTemporaryFile that is still open. Cleanup is done in ``finally``.
    tmp_f = tempfile.NamedTemporaryFile(prefix="spsub", suffix=".srt", delete=False)
    try:
        with tmp_f:
            tmp_f.write(content)

        sub = None
        for enc in _ENCS:
            try:
                logger.debug("Trying %s encoding", enc)
                sub = pysubs2.load(tmp_f.name, encoding=enc)
            except Exception as error:
                # Best effort: any failure just means "try the next encoding".
                logger.debug("%s: %s", type(error).__name__, error)
            else:
                break

        if sub is None:
            logger.debug("Couldn't load subtitle file")
            return None

        logger.debug("Identified subtitle file: %s", sub)
        zip_obj = zipfile.ZipFile(io.BytesIO(), mode="x")
        zip_obj.write(tmp_f.name, os.path.basename(tmp_f.name))
        return zip_obj
    finally:
        try:
            os.unlink(tmp_f.name)
        except OSError as error:
            logger.debug("Couldn't remove temporary file %s: %s", tmp_f.name, error)


def get_archive_from_bytes(content: bytes):
    """Build a RarFile/ZipFile object out of raw bytes.

    Rar is probed first, then zip. If the bytes are neither, they are handed
    to ``_zip_from_subtitle_file``, which returns a ZipFile when the content
    loads as a subtitle and None otherwise.
    """
    stream = io.BytesIO(content)

    if rarfile.is_rarfile(stream):
        logger.debug("Identified rar archive")
        return rarfile.RarFile(stream)

    if zipfile.is_zipfile(stream):
        logger.debug("Identified zip archive")
        return zipfile.ZipFile(stream)

    # Not an archive: maybe the provider sent a bare subtitle file.
    logger.debug("No compression format found. Trying with subtitle-like files")
    return _zip_from_subtitle_file(content)


def update_matches(
    matches,
    video,
    release_info: Union[str, Iterable[str]],
    split="\n",
    **guessit_options,
):
    """Update matches set from release info string or Iterable.

    Every release string is cut on ``split``; each stripped fragment is run
    through guessit and the resulting ``guess_matches`` are merged into
    ``matches`` in place.

    :param matches: set of match names; updated in place and also returned.
    :param video: the video to match against. The guessit "type" option is
        forced to "episode" for Episode instances and "movie" otherwise.
    :param release_info: a single release string or an iterable of them.
    :param split: delimiter used to cut each release string; set None to
        avoid splitting and use each string as a whole.
    :param guessit_options: extra options handed to guessit.
    :return: the updated ``matches`` set.
    """

    guessit_options["type"] = "episode" if isinstance(video, Episode) else "movie"

    logger.debug("Guessit options to update matches: %s", guessit_options)

    if isinstance(release_info, str):
        release_info = [release_info]

    for release in release_info:
        # str.split(None) would split on whitespace, so None has to be
        # handled explicitly to really mean "don't split".
        fragments = [release] if split is None else release.split(split)
        for release_split in fragments:
            logger.debug("Updating matches from release info: %s", release_split)
            matches |= guess_matches(
                video, guessit(release_split.strip(), guessit_options)
            )
            logger.debug("New matches: %s", matches)

    return matches
Update provider utils 2022-10-26 20:53:41 +00:00			`from collections import namedtuple`
			`from difflib import SequenceMatcher`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`import io`
			`import logging`
			`import os`
Update provider utils 2022-10-26 20:53:41 +00:00			`import re`
Improve providers utils 2023-02-16 00:49:56 +00:00			`import tempfile`
			`from typing import Iterable, Union`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`import zipfile`
update deps 2018-10-31 16:08:29 +00:00
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`from guessit import guessit`
Improve providers utils 2023-02-16 00:49:56 +00:00			`import pysubs2`
Update provider utils 2022-10-26 20:53:41 +00:00			`import rarfile`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`from subliminal.subtitle import fix_line_ending`
EmbeddedSubtitles provider: add blacklist support 2023-10-10 07:37:45 +00:00			`from subliminal_patch.exceptions import MustGetBlacklisted`
no log: add update_matches() providers util 2022-04-19 00:12:07 +00:00			`from subliminal_patch.core import Episode`
			`from subliminal_patch.subtitle import guess_matches`

no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`from ._agent_list import FIRST_THOUSAND_OR_SO_USER_AGENTS`

			`logger = logging.getLogger(__name__)`


Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`_MatchingSub = namedtuple("_MatchingSub", ("file", "priority", "context"))`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00
Update provider utils 2022-10-26 20:53:41 +00:00
EmbeddedSubtitles provider: add blacklist support 2023-10-10 07:37:45 +00:00			`def blacklist_on(*exc_types):`
			`"Raise MustGetBlacklisted if any of the exc_types are raised."`

			`def decorator(method):`
			`def wrapper(self, subtitle):`
			`try:`
			`return method(self, subtitle)`
			`except exc_types:`
			`logger.error("Sending blacklist exception", exc_info=True)`
			`raise MustGetBlacklisted(subtitle.id, subtitle.media_type)`

			`return wrapper`

			`return decorator`


no log: add kwargs to private func 2022-10-26 20:55:00 +00:00			`def _get_matching_sub(`
			`sub_names, forced=False, episode=None, episode_title=None, **kwargs`
			`):`
Improve subtitles pack extraction 2022-05-04 03:38:55 +00:00			`guess_options = {"single_value": True}`
			`if episode is not None:`
			`guess_options["type"] = "episode" # type: ignore`

Update provider utils 2022-10-26 20:53:41 +00:00			`matching_subs = []`

Improve subtitles pack extraction 2022-05-04 03:38:55 +00:00			`for sub_name in sub_names:`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`if not forced and os.path.splitext(sub_name.lower())[0].endswith("forced"):`
			`logger.debug("Ignoring forced subtitle: %s", sub_name)`
			`continue`

			`# If it's a movie then get the first subtitle`
Update provider utils 2022-10-26 20:53:41 +00:00			`if episode is None and episode_title is None:`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`logger.debug("Movie subtitle found: %s", sub_name)`
Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`matching_subs.append(_MatchingSub(sub_name, 2, "Movie subtitle"))`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`break`

Improve subtitles pack extraction 2022-05-04 03:38:55 +00:00			`guess = guessit(sub_name, options=guess_options)`

Update provider utils 2022-10-26 20:53:41 +00:00			`matched_episode_num = guess.get("episode")`
			`if matched_episode_num:`
			`logger.debug("No episode number found in file: %s", sub_name)`

			`if episode_title is not None:`
Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`from_name = _analize_sub_name(sub_name, episode_title)`
			`if from_name is not None:`
			`matching_subs.append(from_name)`
Update provider utils 2022-10-26 20:53:41 +00:00
			`if episode == matched_episode_num:`
			`logger.debug("Episode matched from number: %s", sub_name)`
Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`matching_subs.append(_MatchingSub(sub_name, 2, "Episode number matched"))`
Update provider utils 2022-10-26 20:53:41 +00:00
			`if matching_subs:`
			`matching_subs.sort(key=lambda x: x.priority, reverse=True)`
			`logger.debug("Matches: %s", matching_subs)`
			`return matching_subs[0].file`
			`else:`
			`logger.debug("Nothing matched")`
			`return None`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00

Update provider utils 2022-10-26 20:53:41 +00:00			`def _analize_sub_name(sub_name: str, title_):`
			`titles = re.split(r"[.-]", os.path.splitext(sub_name)[0])`
			`for title in titles:`
Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`title = title.strip()`
Update provider utils 2022-10-26 20:53:41 +00:00			`ratio = SequenceMatcher(None, title, title_).ratio()`
			`if ratio > 0.85:`
			`logger.debug(`
			`"Episode title matched: '%s' -> '%s' [%s]", title, sub_name, ratio`
			`)`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00
Improve file picker for season packs 2022-10-27 03:53:29 +00:00			`# Avoid false positives with short titles`
			`if len(title_) > 4 and ratio >= 0.98:`
			`return _MatchingSub(sub_name, 3, "Perfect title ratio")`

			`return _MatchingSub(sub_name, 1, "Normal title ratio")`

			`logger.debug("No episode title matched from file: %s", sub_name)`
			`return None`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00

			`def get_subtitle_from_archive(`
Update provider utils 2022-10-26 20:53:41 +00:00			`archive, forced=False, episode=None, get_first_subtitle=False, **kwargs`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`):`
			`"Get subtitle from Rarfile/Zipfile object. Return None if nothing is found."`
			`subs_in_archive = [`
			`name`
			`for name in archive.namelist()`
			`if name.endswith((".srt", ".sub", ".ssa", ".ass"))`
			`]`

			`if not subs_in_archive:`
			`logger.info("No subtitles found in archive")`
			`return None`

			`logger.debug("Subtitles in archive: %s", subs_in_archive)`

			`if len(subs_in_archive) == 1 or get_first_subtitle:`
			`logger.debug("Getting first subtitle in archive: %s", subs_in_archive)`
			`return fix_line_ending(archive.read(subs_in_archive[0]))`

Update provider utils 2022-10-26 20:53:41 +00:00			`matching_sub = _get_matching_sub(subs_in_archive, forced, episode, **kwargs)`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00
			`if matching_sub is not None:`
			`logger.info("Using %s from archive", matching_sub)`
			`return fix_line_ending(archive.read(matching_sub))`

			`logger.debug("No subtitle found in archive")`
			`return None`


no log: add providers utility 2022-05-22 06:49:54 +00:00			`def is_episode(content):`
			`return "episode" in guessit(content, {"type": "episode"})`


SuperSubtitles provider: fix hungarian subtitles downloads 2023-05-04 23:15:35 +00:00			`_ENCS = ("utf-8", "ascii", "iso-8859-1", "iso-8859-2", "iso-8859-5", "cp1252")`


			`def _zip_from_subtitle_file(content):`
			`with tempfile.NamedTemporaryFile(prefix="spsub", suffix=".srt") as tmp_f:`
			`tmp_f.write(content)`
			`sub = None`
			`for enc in _ENCS:`
			`try:`
			`logger.debug("Trying %s encoding", enc)`
			`sub = pysubs2.load(tmp_f.name, encoding=enc)`
			`except Exception as error:`
			`logger.debug("%s: %s", type(error).__name__, error)`
			`continue`
			`else:`
			`break`

			`if sub is not None:`
			`logger.debug("Identified subtitle file: %s", sub)`
			`zip_obj = zipfile.ZipFile(io.BytesIO(), mode="x")`
			`zip_obj.write(tmp_f.name, os.path.basename(tmp_f.name))`
			`return zip_obj`

			`logger.debug("Couldn't load subtitle file")`
			`return None`


no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`def get_archive_from_bytes(content: bytes):`
Improve providers utils 2023-02-16 00:49:56 +00:00			`"""Get RarFile/ZipFile object from bytes. A ZipFile instance will be returned`
			`if a subtitle-like stream is found. Return None if something else is found."""`
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`archive_stream = io.BytesIO(content)`
Improve providers utils 2023-02-16 00:49:56 +00:00
no log: add subliminal_patch provider utils This will be used to modulate and simplify some tasks and reduce redundant code 2022-04-18 20:38:23 +00:00			`if rarfile.is_rarfile(archive_stream):`
			`logger.debug("Identified rar archive")`
			`return rarfile.RarFile(archive_stream)`
			`elif zipfile.is_zipfile(archive_stream):`
			`logger.debug("Identified zip archive")`
			`return zipfile.ZipFile(archive_stream)`

Improve providers utils 2023-02-16 00:49:56 +00:00			`logger.debug("No compression format found. Trying with subtitle-like files")`
SuperSubtitles provider: fix hungarian subtitles downloads 2023-05-04 23:15:35 +00:00			`return _zip_from_subtitle_file(content)`
no log: add update_matches() providers util 2022-04-19 00:12:07 +00:00

Improve providers utils 2023-02-16 00:49:56 +00:00			`def update_matches(`
			`matches,`
			`video,`
			`release_info: Union[str, Iterable[str]],`
			`split="\n",`
EmbeddedSubtitles provider: add blacklist support 2023-10-10 07:37:45 +00:00			`**guessit_options,`
Improve providers utils 2023-02-16 00:49:56 +00:00			`):`
			`"""Update matches set from release info string or Iterable.`

EmbeddedSubtitles provider: add blacklist support 2023-10-10 07:37:45 +00:00			`Use the split parameter to iterate over the set delimiter; set None to avoid split.`
			`"""`
Improve providers utils 2023-02-16 00:49:56 +00:00
no log: add update_matches() providers util 2022-04-19 00:12:07 +00:00			`guessit_options["type"] = "episode" if isinstance(video, Episode) else "movie"`
Improve providers utils 2023-02-16 00:49:56 +00:00
no log: add update_matches() providers util 2022-04-19 00:12:07 +00:00			`logger.debug("Guessit options to update matches: %s", guessit_options)`

Improve providers utils 2023-02-16 00:49:56 +00:00			`if isinstance(release_info, str):`
			`release_info = release_info.split(split)`

			`for release in release_info:`
			`for release_split in release.split(split):`
			`logger.debug("Updating matches from release info: %s", release)`
			`matches \|= guess_matches(`
			`video, guessit(release_split.strip(), guessit_options)`
			`)`
			`logger.debug("New matches: %s", matches)`
no log: add update_matches() providers util 2022-04-19 00:12:07 +00:00
			`return matches`