bazarr/libs/subliminal_patch/providers/subf2m.py

407 lines
12 KiB
Python

# -*- coding: utf-8 -*-
from difflib import SequenceMatcher
import functools
import logging
import re
import time
import urllib.parse
from bs4 import BeautifulSoup as bso
from guessit import guessit
from requests import Session
from subliminal_patch.core import Episode
from subliminal_patch.core import Movie
from subliminal_patch.exceptions import APIThrottled
from subliminal_patch.providers import Provider
from subliminal_patch.providers.utils import get_archive_from_bytes
from subliminal_patch.providers.utils import get_subtitle_from_archive
from subliminal_patch.providers.utils import update_matches
from subliminal_patch.subtitle import Subtitle
from subzero.language import Language
logger = logging.getLogger(__name__)
class Subf2mSubtitle(Subtitle):
provider_name = "subf2m"
hash_verifiable = False
def __init__(self, language, page_link, release_info, episode_number=None):
super().__init__(language, page_link=page_link)
self.release_info = release_info
self.episode_number = episode_number
self.episode_title = None
self._matches = set(
("title", "year")
if episode_number is None
else ("title", "series", "year", "season", "episode")
)
def get_matches(self, video):
update_matches(self._matches, video, self.release_info)
return self._matches
@property
def id(self):
return self.page_link
_BASE_URL = "https://subf2m.co"
# TODO: add more seasons and languages
_SEASONS = (
"First",
"Second",
"Third",
"Fourth",
"Fifth",
"Sixth",
"Seventh",
"Eighth",
"Ninth",
"Tenth",
"Eleventh",
"Twelfth",
"Thirdteenth",
"Fourthteenth",
"Fifteenth",
"Sixteenth",
"Seventeenth",
"Eightheenth",
"Nineteenth",
"Tweentieth",
)
_LANGUAGE_MAP = {
"english": "eng",
"farsi_persian": "per",
"arabic": "ara",
"spanish": "spa",
"portuguese": "por",
"italian": "ita",
"dutch": "dut",
"hebrew": "heb",
"indonesian": "ind",
"danish": "dan",
"norwegian": "nor",
"bengali": "ben",
"bulgarian": "bul",
"croatian": "hrv",
"swedish": "swe",
"vietnamese": "vie",
"czech": "cze",
"finnish": "fin",
"french": "fre",
"german": "ger",
"greek": "gre",
"hungarian": "hun",
"icelandic": "ice",
"japanese": "jpn",
"macedonian": "mac",
"malay": "may",
"polish": "pol",
"romanian": "rum",
"russian": "rus",
"serbian": "srp",
"thai": "tha",
"turkish": "tur",
}
class Subf2mProvider(Provider):
provider_name = "subf2m"
_movie_title_regex = re.compile(r"^(.+?)( \((\d{4})\))?$")
_tv_show_title_regex = re.compile(
r"^(.+?) [-\(]\s?(.*?) (season|series)\)?( \((\d{4})\))?$"
)
_supported_languages = {}
_supported_languages["brazillian-portuguese"] = Language("por", "BR")
for key, val in _LANGUAGE_MAP.items():
_supported_languages[key] = Language.fromalpha3b(val)
_supported_languages_reversed = {
val: key for key, val in _supported_languages.items()
}
languages = set(_supported_languages.values())
video_types = (Episode, Movie)
subtitle_class = Subf2mSubtitle
def initialize(self):
self._session = Session()
self._session.headers.update({"user-agent": "Bazarr"})
def terminate(self):
self._session.close()
def _safe_get_text(self, url, retry=3, default_return=""):
req = None
for n in range(retry):
req = self._session.get(url, stream=True)
if req.status_code == 403:
logger.debug("Access to this resource is forbidden: %s", url)
break
# Sometimes subf2m will return a 503 code. This error usually disappears
# retrying the query
if req.status_code == 503:
logger.debug("503 returned. Trying again [%d] in 3 seconds", n + 1)
time.sleep(3)
continue
else:
req.raise_for_status()
break
if req is not None:
return "\n".join(
line for line in req.iter_lines(decode_unicode=True) if line
)
return default_return
def _gen_results(self, query):
query = urllib.parse.quote(query)
url = f"{_BASE_URL}/subtitles/searchbytitle?query={query}&l="
text = self._safe_get_text(url)
soup = bso(text, "html.parser")
for title in soup.select("li div[class='title'] a"):
yield title
def _search_movie(self, title, year):
title = title.lower()
year = str(year)
found_movie = None
results = []
for result in self._gen_results(title):
text = result.text.lower()
match = self._movie_title_regex.match(text)
if not match:
continue
match_title = match.group(1)
match_year = match.group(3)
if year == match_year:
results.append(
{
"href": result.get("href"),
"similarity": SequenceMatcher(None, title, match_title).ratio(),
}
)
if results:
results.sort(key=lambda x: x["similarity"], reverse=True)
found_movie = results[0]["href"]
logger.debug("Movie found: %s", results[0])
return found_movie
def _search_tv_show_season(self, title, season, year=None):
try:
season_str = _SEASONS[season - 1].lower()
except IndexError:
logger.debug("Season number not supported: %s", season)
return None
found_tv_show_season = None
results = []
for result in self._gen_results(title):
text = result.text.lower()
match = self._tv_show_title_regex.match(text)
if not match:
logger.debug("Series title not matched: %s", text)
continue
else:
logger.debug("Series title matched: %s", text)
match_title = match.group(1)
match_season = match.group(2)
# Match "complete series" titles as they usually contain season packs
if season_str == match_season or "complete" in match_season:
plus = 0.1 if year and str(year) in text else 0
results.append(
{
"href": result.get("href"),
"similarity": SequenceMatcher(None, title, match_title).ratio()
+ plus,
}
)
if results:
results.sort(key=lambda x: x["similarity"], reverse=True)
found_tv_show_season = results[0]["href"]
logger.debug("TV Show season found: %s", results[0])
return found_tv_show_season
def _find_movie_subtitles(self, path, language):
soup = self._get_subtitle_page_soup(path, language)
subtitles = []
for item in soup.select("li.item"):
subtitle = _get_subtitle_from_item(item, language)
if subtitle is None:
continue
logger.debug("Found subtitle: %s", subtitle)
subtitles.append(subtitle)
return subtitles
def _find_episode_subtitles(
self, path, season, episode, language, episode_title=None
):
soup = self._get_subtitle_page_soup(path, language)
subtitles = []
for item in soup.select("li.item"):
valid_item = None
clean_text = " ".join(item.text.split())
if not clean_text:
continue
# It will return list values
guess = _memoized_episode_guess(clean_text)
if "season" not in guess:
if "complete series" in clean_text.lower():
logger.debug("Complete series pack found: %s", clean_text)
guess["season"] = [season]
else:
logger.debug("Nothing guessed from release: %s", clean_text)
continue
if season in guess["season"] and episode in guess.get("episode", []):
logger.debug("Episode match found: %s - %s", guess, clean_text)
valid_item = item
elif season in guess["season"] and not "episode" in guess:
logger.debug("Season pack found: %s", clean_text)
valid_item = item
if valid_item is None:
continue
subtitle = _get_subtitle_from_item(item, language, episode)
if subtitle is None:
continue
subtitle.episode_title = episode_title
logger.debug("Found subtitle: %s", subtitle)
subtitles.append(subtitle)
return subtitles
def _get_subtitle_page_soup(self, path, language):
language_path = self._supported_languages_reversed[language]
text = self._safe_get_text(f"{_BASE_URL}{path}/{language_path}")
return bso(text, "html.parser")
def list_subtitles(self, video, languages):
is_episode = isinstance(video, Episode)
if is_episode:
result = self._search_tv_show_season(video.series, video.season, video.year)
else:
result = self._search_movie(video.title, video.year)
if result is None:
logger.debug("No results")
return []
subtitles = []
for language in languages:
if is_episode:
subtitles.extend(
self._find_episode_subtitles(
result, video.season, video.episode, language, video.title
)
)
else:
subtitles.extend(self._find_movie_subtitles(result, language))
return subtitles
def download_subtitle(self, subtitle):
# TODO: add MustGetBlacklisted support
text = self._safe_get_text(subtitle.page_link)
soup = bso(text, "html.parser")
try:
download_url = _BASE_URL + str(
soup.select_one("a[id='downloadButton']")["href"] # type: ignore
)
except (AttributeError, KeyError, TypeError):
raise APIThrottled(f"Couldn't get download url from {subtitle.page_link}")
downloaded = self._session.get(download_url, allow_redirects=True)
archive = get_archive_from_bytes(downloaded.content)
if archive is None:
raise APIThrottled(f"Invalid archive: {subtitle.page_link}")
subtitle.content = get_subtitle_from_archive(
archive,
episode=subtitle.episode_number,
episode_title=subtitle.episode_title,
)
@functools.lru_cache(2048)
def _memoized_episode_guess(content):
# Use include to save time from unnecessary checks
return guessit(
content,
{
"type": "episode",
# Add codec keys to avoid matching x264, 5.1, etc as episode info
"includes": ["season", "episode", "video_codec", "audio_codec"],
"enforce_list": True,
},
)
def _get_subtitle_from_item(item, language, episode_number=None):
release_info = [
rel.text.strip() for rel in item.find("ul", {"class": "scrolllist"})
]
try:
text = item.find("div", {"class": "comment-col"}).find("p").text
release_info.append(text.replace("\n", " ").strip())
except AttributeError:
pass
release_info = "\n".join([item for item in release_info if item])
try:
path = item.find("a", {"class": "download icon-download"})["href"] # type: ignore
except (AttributeError, KeyError):
logger.debug("Couldn't get path: %s", item)
return None
return Subf2mSubtitle(language, _BASE_URL + path, release_info, episode_number)