bazarr/libs/subliminal_patch/providers/subf2m.py

# -*- coding: utf-8 -*-

from difflib import SequenceMatcher
import functools
import logging
import re
import time
import urllib.parse

from bs4 import BeautifulSoup as bso
from guessit import guessit
from requests import Session
from subliminal_patch.core import Episode
from subliminal_patch.core import Movie
from subliminal_patch.exceptions import APIThrottled
from subliminal_patch.providers import Provider
from subliminal_patch.providers.utils import get_archive_from_bytes
from subliminal_patch.providers.utils import get_subtitle_from_archive
from subliminal_patch.providers.utils import update_matches
from subliminal_patch.subtitle import Subtitle
from subzero.language import Language

logger = logging.getLogger(__name__)


class Subf2mSubtitle(Subtitle):
    """Subtitle entry scraped from a subf2m listing page.

    ``page_link`` doubles as the subtitle identifier. ``episode_number`` is
    ``None`` for movie subtitles and set for episode subtitles; it decides
    which matches are assumed from the start.
    """

    provider_name = "subf2m"
    hash_verifiable = False

    def __init__(self, language, page_link, release_info, episode_number=None):
        """Store the scraped metadata and seed the initial set of matches."""
        super().__init__(language, page_link=page_link)

        self.release_info = release_info
        self.episode_number = episode_number
        # Filled in later by the provider for episode subtitles
        self.episode_title = None

        if episode_number is None:
            # Movie subtitle
            self._matches = {"title", "year"}
        else:
            # Episode subtitle
            self._matches = {"title", "series", "year", "season", "episode"}

    def get_matches(self, video):
        """Return the matches for ``video``, extended from the release info."""
        matches = self._matches
        update_matches(matches, video, self.release_info)

        return matches

    @property
    def id(self):
        """Identifier of the subtitle: the URL of its page."""
        return self.page_link


# Root of the subf2m site; search, listing and download URLs are built on top of it
_BASE_URL = "https://subf2m.co"

# TODO: add more seasons and languages

# Ordinal season names, indexed by ``season_number - 1``. They are lower-cased
# and compared against the season part of the titles returned by the site
# search, so the spelling has to be the regular English one.
_SEASONS = (
    "First",
    "Second",
    "Third",
    "Fourth",
    "Fifth",
    "Sixth",
    "Seventh",
    "Eighth",
    "Ninth",
    "Tenth",
    "Eleventh",
    "Twelfth",
    "Thirteenth",
    "Fourteenth",
    "Fifteenth",
    "Sixteenth",
    "Seventeenth",
    "Eighteenth",
    "Nineteenth",
    "Twentieth",
)

# Language names as they appear in subf2m URL paths, mapped to the three-letter
# codes that are handed to ``Language.fromalpha3b``.
_LANGUAGE_MAP = {
    "english": "eng",
    "farsi_persian": "per",
    "arabic": "ara",
    "spanish": "spa",
    "portuguese": "por",
    "italian": "ita",
    "dutch": "dut",
    "hebrew": "heb",
    "indonesian": "ind",
    "danish": "dan",
    "norwegian": "nor",
    "bengali": "ben",
    "bulgarian": "bul",
    "croatian": "hrv",
    "swedish": "swe",
    "vietnamese": "vie",
    "czech": "cze",
    "finnish": "fin",
    "french": "fre",
    "german": "ger",
    "greek": "gre",
    "hungarian": "hun",
    "icelandic": "ice",
    "japanese": "jpn",
    "macedonian": "mac",
    "malay": "may",
    "polish": "pol",
    "romanian": "rum",
    "russian": "rus",
    "serbian": "srp",
    "thai": "tha",
    "turkish": "tur",
}


class Subf2mProvider(Provider):
    """Subtitle provider that scrapes the subf2m site.

    A title search locates the movie page (or the page of a TV show season);
    the per-language listing of that page is then parsed into
    ``Subf2mSubtitle`` objects, and the chosen one is downloaded as an archive.
    """

    provider_name = "subf2m"

    # "<title>" optionally followed by " (<year>)"
    _movie_title_regex = re.compile(r"^(.+?)( \((\d{4})\))?$")
    # "<title> - <ordinal> season", "<title> (<...> series)", optionally with year
    _tv_show_title_regex = re.compile(
        r"^(.+?) [-\(]\s?(.*?) (season|series)\)?( \((\d{4})\))?$"
    )

    # Site language path -> Language. Built in a single expression so that no
    # loop variables are left behind as class attributes.
    _supported_languages = {
        "brazillian-portuguese": Language("por", "BR"),
        **{key: Language.fromalpha3b(val) for key, val in _LANGUAGE_MAP.items()},
    }

    # Language -> site language path
    _supported_languages_reversed = {
        val: key for key, val in _supported_languages.items()
    }

    languages = set(_supported_languages.values())

    video_types = (Episode, Movie)
    subtitle_class = Subf2mSubtitle

    def initialize(self):
        """Open the HTTP session used for every request of this provider."""
        self._session = Session()
        self._session.headers.update({"user-agent": "Bazarr"})

    def terminate(self):
        """Close the HTTP session."""
        self._session.close()

    def _safe_get_text(self, url, retry=3, default_return=""):
        """Fetch ``url`` and return its body as text without empty lines.

        A 503 answer is retried, waiting 3 seconds between attempts, for at
        most ``retry`` attempts. ``default_return`` is returned when the
        resource is forbidden (403), when every attempt ended with a 503 or
        when ``retry`` is not positive, so callers never get to parse an error
        page. Any other error status is raised by ``raise_for_status``.
        """
        for n in range(retry):
            req = self._session.get(url, stream=True)

            try:
                if req.status_code == 403:
                    logger.debug("Access to this resource is forbidden: %s", url)
                    return default_return

                if req.status_code != 503:
                    req.raise_for_status()
                    lines = req.iter_lines(decode_unicode=True)
                    return "\n".join(line for line in lines if line)
            finally:
                # The request is streamed: close it explicitly so that a
                # response whose body is never read doesn't stay open.
                req.close()

            # Sometimes subf2m will return a 503 code. This error usually disappears
            # retrying the query
            if n + 1 < retry:
                logger.debug("503 returned. Trying again [%d] in 3 seconds", n + 1)
                time.sleep(3)
            else:
                logger.debug("503 returned. No attempts left for %s", url)

        return default_return

    def _gen_results(self, query):
        """Yield the title anchors of the site search results for ``query``."""
        query = urllib.parse.quote(query)

        url = f"{_BASE_URL}/subtitles/searchbytitle?query={query}&l="

        text = self._safe_get_text(url)
        soup = bso(text, "html.parser")

        yield from soup.select("li div[class='title'] a")

    def _search_movie(self, title, year):
        """Return the path of the movie page best matching ``title`` and ``year``.

        Only results whose title carries exactly ``year`` are considered; among
        them the one most similar to ``title`` wins. Returns ``None`` when
        nothing qualifies.
        """
        title = title.lower()
        year = str(year)

        results = []
        for result in self._gen_results(title):
            match = self._movie_title_regex.match(result.text.lower())
            if not match:
                continue

            match_title = match.group(1)
            # group(3) is None for results without a year, which never equals
            # the requested one
            if match.group(3) != year:
                continue

            results.append(
                {
                    "href": result.get("href"),
                    "similarity": SequenceMatcher(None, title, match_title).ratio(),
                }
            )

        if not results:
            return None

        # max() keeps the first of equally similar results
        best = max(results, key=lambda x: x["similarity"])
        logger.debug("Movie found: %s", best)
        return best["href"]

    def _search_tv_show_season(self, title, season, year=None):
        """Return the path of the page for ``season`` of the show ``title``.

        "Complete series" results are accepted as well, since they usually
        contain season packs. A result mentioning ``year`` gets a small
        similarity bonus. Returns ``None`` when the season number has no
        ordinal name in ``_SEASONS`` or when nothing matches.
        """
        # Explicit range check: season 0 would otherwise index _SEASONS[-1]
        # and silently search for the last season of the table.
        if not isinstance(season, int) or not 1 <= season <= len(_SEASONS):
            logger.debug("Season number not supported: %s", season)
            return None

        season_str = _SEASONS[season - 1].lower()
        # Result titles are lower-cased before matching, so the wanted title
        # has to be lower-cased too for a meaningful similarity ratio.
        lower_title = title.lower()

        results = []
        for result in self._gen_results(title):
            text = result.text.lower()

            match = self._tv_show_title_regex.match(text)
            if not match:
                logger.debug("Series title not matched: %s", text)
                continue

            logger.debug("Series title matched: %s", text)

            match_title = match.group(1)
            match_season = match.group(2)

            # Match "complete series" titles as they usually contain season packs
            if season_str != match_season and "complete" not in match_season:
                continue

            plus = 0.1 if year and str(year) in text else 0
            similarity = SequenceMatcher(None, lower_title, match_title).ratio()
            results.append(
                {"href": result.get("href"), "similarity": similarity + plus}
            )

        if not results:
            return None

        # max() keeps the first of equally similar results
        best = max(results, key=lambda x: x["similarity"])
        logger.debug("TV Show season found: %s", best)
        return best["href"]

    def _find_movie_subtitles(self, path, language):
        """Return every subtitle listed for ``language`` on the page ``path``."""
        soup = self._get_subtitle_page_soup(path, language)
        subtitles = []

        for item in soup.select("li.item"):
            subtitle = _get_subtitle_from_item(item, language)
            if subtitle is None:
                continue

            logger.debug("Found subtitle: %s", subtitle)
            subtitles.append(subtitle)

        return subtitles

    def _find_episode_subtitles(
        self, path, season, episode, language, episode_title=None
    ):
        """Return the subtitles on the season page ``path`` that fit an episode.

        An item is kept when its text is guessed as the wanted season and
        episode, or as a pack: the wanted season (or a "complete series") with
        no episode number at all.
        """
        soup = self._get_subtitle_page_soup(path, language)

        subtitles = []

        for item in soup.select("li.item"):
            clean_text = " ".join(item.text.split())

            if not clean_text:
                continue

            # It will return list values
            guess = _memoized_episode_guess(clean_text)

            # The guess is a cached object shared between calls: it is only
            # read here. Writing the requested season into it would pin a
            # "complete series" entry to the first season ever searched.
            if "season" in guess:
                guessed_seasons = guess["season"]
            elif "complete series" in clean_text.lower():
                logger.debug("Complete series pack found: %s", clean_text)
                guessed_seasons = [season]
            else:
                logger.debug("Nothing guessed from release: %s", clean_text)
                continue

            if season not in guessed_seasons:
                continue

            if "episode" not in guess:
                logger.debug("Season pack found: %s", clean_text)
            elif episode in guess["episode"]:
                logger.debug("Episode match found: %s - %s", guess, clean_text)
            else:
                # Right season, but a different episode
                continue

            subtitle = _get_subtitle_from_item(item, language, episode)

            if subtitle is None:
                continue

            subtitle.episode_title = episode_title

            logger.debug("Found subtitle: %s", subtitle)
            subtitles.append(subtitle)

        return subtitles

    def _get_subtitle_page_soup(self, path, language):
        """Return the parsed listing page of ``path`` for ``language``."""
        language_path = self._supported_languages_reversed[language]

        text = self._safe_get_text(f"{_BASE_URL}{path}/{language_path}")

        return bso(text, "html.parser")

    def list_subtitles(self, video, languages):
        """Return the subtitles found for ``video`` in each of ``languages``.

        The movie or season page is searched once; its listing is then
        fetched once per language. An empty list is returned when the search
        finds no page.
        """
        is_episode = isinstance(video, Episode)

        if is_episode:
            result = self._search_tv_show_season(video.series, video.season, video.year)
        else:
            result = self._search_movie(video.title, video.year)

        if result is None:
            logger.debug("No results")
            return []

        subtitles = []

        for language in languages:
            if is_episode:
                subtitles.extend(
                    self._find_episode_subtitles(
                        result, video.season, video.episode, language, video.title
                    )
                )
            else:
                subtitles.extend(self._find_movie_subtitles(result, language))

        return subtitles

    def download_subtitle(self, subtitle):
        """Download the archive of ``subtitle`` and store the extracted content.

        Raises ``APIThrottled`` when the subtitle page has no download link or
        when the downloaded payload is not a valid archive.
        """
        # TODO: add MustGetBlacklisted support

        text = self._safe_get_text(subtitle.page_link)
        soup = bso(text, "html.parser")
        try:
            # select_one returns None when the button is missing, hence the
            # TypeError in the except clause
            download_url = _BASE_URL + str(
                soup.select_one("a[id='downloadButton']")["href"]  # type: ignore
            )
        except (AttributeError, KeyError, TypeError) as error:
            raise APIThrottled(
                f"Couldn't get download url from {subtitle.page_link}"
            ) from error

        downloaded = self._session.get(download_url, allow_redirects=True)

        archive = get_archive_from_bytes(downloaded.content)

        if archive is None:
            raise APIThrottled(f"Invalid archive: {subtitle.page_link}")

        subtitle.content = get_subtitle_from_archive(
            archive,
            episode=subtitle.episode_number,
            episode_title=subtitle.episode_title,
        )


@functools.lru_cache(2048)
def _memoized_episode_guess(content):
    """Guess season/episode information from ``content``, caching the result.

    The returned guess is the cached object itself and is shared by every
    caller asking for the same ``content``: treat it as read-only.
    """
    options = {
        "type": "episode",
        # Use include to save time from unnecessary checks.
        # Add codec keys to avoid matching x264, 5.1, etc as episode info
        "includes": ["season", "episode", "video_codec", "audio_codec"],
        "enforce_list": True,
    }

    return guessit(content, options)


def _get_subtitle_from_item(item, language, episode_number=None):
    """Build a ``Subf2mSubtitle`` from one listing ``item``.

    The release info is made of the entries of the item's ``scrolllist``
    followed by the uploader comment, one per line; either part may be
    missing. Returns ``None`` when the item has no usable download link.
    """
    release_list = item.find("ul", {"class": "scrolllist"})

    # find() returns None when the list is missing: carry on with whatever the
    # comment provides instead of failing on the iteration.
    if release_list is None:
        release_info = []
    else:
        release_info = [rel.text.strip() for rel in release_list]

    try:
        text = item.find("div", {"class": "comment-col"}).find("p").text
        release_info.append(text.replace("\n", " ").strip())
    except AttributeError:
        # No comment for this item; the release names are enough
        pass

    release_info = "\n".join(entry for entry in release_info if entry)

    try:
        path = item.find("a", {"class": "download icon-download"})["href"]  # type: ignore
    except (AttributeError, KeyError, TypeError):
        # TypeError: find() returned None, which can't be subscripted
        logger.debug("Couldn't get path: %s", item)
        return None

    return Subf2mSubtitle(language, _BASE_URL + path, release_info, episode_number)