# coding=utf-8
import io
import logging
import re
import time

from babelfish import language_converters
from subzero.language import Language
from requests import Session
from requests.exceptions import JSONDecodeError
import urllib.parse
from random import randint

from subliminal.subtitle import fix_line_ending
from subliminal_patch.providers import Provider
from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
from subliminal.providers import ParserBeautifulSoup
from bs4.element import Tag, NavigableString
from subliminal.score import get_equivalent_release_groups
from subliminal_patch.subtitle import Subtitle, guess_matches
from subliminal_patch.exceptions import APIThrottled
from subliminal.utils import sanitize, sanitize_release_group
from subliminal.video import Episode, Movie
from zipfile import ZipFile, is_zipfile
from rarfile import RarFile, is_rarfile
# NOTE: this import intentionally comes after the subliminal.utils one above,
# so the name `sanitize` used in this module is the subliminal_patch version.
from subliminal_patch.utils import sanitize, fix_inconsistent_naming
from guessit import guessit

from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST

logger = logging.getLogger(__name__)

language_converters.register('supersubtitles = subliminal_patch.converters.supersubtitles:SuperSubtitlesConverter')


def fix_tv_naming(title):
    """Fix TV show titles with inconsistent naming using dictionary, but do not sanitize them.

    :param str title: original title.
    :return: new title.
    :rtype: str

    """
    return fix_inconsistent_naming(title, {"DC's Legends of Tomorrow": "Legends of Tomorrow",
                                           "Star Trek: The Next Generation": "Star Trek TNG",
                                           "Loki (aka. Marvel\'s Loki)": "Loki",
                                           "Marvel's": "",
                                           }, True)


def fix_movie_naming(title):
    """Fix movie titles with inconsistent naming; the replacement dictionary is currently empty.

    :param str title: original title.
    :return: new title.
    :rtype: str

    """
    return fix_inconsistent_naming(title, {
    }, True)


class SuperSubtitlesSubtitle(Subtitle):
    """SuperSubtitles Subtitle."""
    provider_name = 'supersubtitles'

    def __str__(self):
        """Return a human readable, one-line description of the subtitle."""
        subtit = "Subtitle id: " + str(self.subtitle_id) \
                 + " Series: " + str(self.series) \
                 + " Season: " + str(self.season) \
                 + " Episode: " + str(self.episode) \
                 + " Version: " + str(self.version) \
                 + " Releases: " + str(self.releases) \
                 + " DownloadLink: " + str(self.page_link) \
                 + " Matches: " + str(self.matches)
        if self.year:
            subtit = subtit + " Year: " + str(self.year)
        # __str__ must return str on Python 3; returning bytes raises TypeError.
        return subtit

    def __init__(self, language, page_link, subtitle_id, series, season, episode, version,
                 releases, year, imdb_id, uploader, asked_for_episode=None, asked_for_release_group=None):
        """Store the subtitle metadata and build ``release_info`` from it.

        :param language: language of the subtitle.
        :param page_link: download URL of the subtitle.
        :param subtitle_id: identifier of the subtitle on the site.
        :param series: series (or movie) title as reported by the site.
        :param season: season number, or None for movies.
        :param episode: episode number, or None for movies.
        :param version: release description string.
        :param releases: non-empty list of release names the subtitle is compatible with.
        :param year: year of the video (converted to int when truthy).
        :param imdb_id: IMDb id found on the subtitle's detail page, if any.
        :param uploader: name of the uploader.
        :param asked_for_episode: episode number that was searched for.
        :param asked_for_release_group: release group of the video that was searched for.
        """
        super(SuperSubtitlesSubtitle, self).__init__(language, page_link=page_link)
        self.subtitle_id = subtitle_id
        self.series = series
        self.season = season
        self.episode = episode
        self.version = version
        self.releases = releases
        self.year = year
        self.uploader = uploader
        if year:
            self.year = int(year)

        # First release is decorated with the title; the remaining ones are appended as-is.
        self.release_info = u" ,".join([u"%s (%s)" % (self.__get_name(), releases[0])] +
                                       (releases[1:] if len(releases) > 1 else []))
        self.page_link = page_link
        self.asked_for_release_group = asked_for_release_group
        self.asked_for_episode = asked_for_episode
        self.imdb_id = imdb_id
        self.is_pack = True
        self.matches = None

    def numeric_id(self):
        """Return the site's identifier of this subtitle."""
        return self.subtitle_id

    def __get_name(self):
        """Return the title, with ``(year)`` and ``SxxEyy`` suffixes when those values are set."""
        ep_addon = (" S%02dE%02d" % (self.season, self.episode)) if self.episode else ""
        return u"%s%s%s" % (self.series, " (%s)" % self.year if self.year else "", ep_addon)

    def __repr__(self):
        return '<%s %r [%s]>' % (
            self.__class__.__name__, u"%s [%s]" % (self.__get_name(), self.version), self.language)

    @property
    def id(self):
        """Subtitle identifier as a string."""
        return str(self.subtitle_id)

    def get_matches(self, video):
        """Compute the set of properties on which this subtitle matches ``video``.

        The result is also stored in ``self.matches``.

        :param video: the Episode or Movie to compare with.
        :return: set of match names.
        """
        type_ = "movie" if isinstance(video, Movie) else "episode"
        matches = guess_matches(video, guessit(self.release_info, {"type": type_}))

        # episode
        if isinstance(video, Episode):
            # series
            if video.series and sanitize(self.series) == sanitize(video.series):
                matches.add('series')
            # imdb_id
            if video.series_imdb_id and self.imdb_id and str(self.imdb_id) == str(video.series_imdb_id):
                matches.add('series_imdb_id')
                matches.add('series')
                matches.add('year')
            # year
            if 'year' not in matches and 'series' in matches and video.original_series and self.year is None:
                matches.add('year')
        # movie
        elif isinstance(video, Movie):
            # title
            if video.title and (sanitize(self.series) in (
                    sanitize(name) for name in [video.title] + video.alternative_titles)):
                matches.add('title')
            # imdb_id
            if video.imdb_id and self.imdb_id == video.imdb_id:
                matches.add('imdb_id')
                matches.add('title')
                matches.add('year')
            # year
            if video.year and self.year == video.year:
                matches.add('year')

        # release_group
        if video.release_group and self.releases:
            video_release_groups = get_equivalent_release_groups(sanitize_release_group(video.release_group))
            for release in self.releases:
                release_lower = release.lower()

                if any(r in sanitize_release_group(release) for r in video_release_groups):
                    matches.add('release_group')

                    if video.resolution and video.resolution in release_lower:
                        matches.add('resolution')

                    # Compare case-insensitively on both sides: the release text is
                    # lower-cased, so a mixed-case source could never match otherwise.
                    if video.source and video.source.lower() in release_lower:
                        matches.add('source')

                # We don't have to continue in case it is a perfect match
                if all(m in matches for m in ['release_group', 'resolution', 'source']):
                    break

        self.matches = matches
        return matches


class SuperSubtitlesProvider(Provider, ProviderSubtitleArchiveMixin):
    """SuperSubtitles Provider."""
    languages = {Language('hun', 'HU')} | {Language(lang) for lang in [
        'hun', 'eng'
    ]}
    video_types = (Episode, Movie)
    # https://www.feliratok.info/?search=&soriSorszam=&nyelv=&sorozatnev=The+Flash+%282014%29&sid=3212&complexsearch=true&knyelv=0&evad=4&epizod1=1&cimke=0&minoseg=0&rlsr=0&tab=all
    server_url = 'https://www.feliratok.info/'
    hearing_impaired_verifiable = False
    multi_result_throttle = 2  # seconds

    def __init__(self):
        self.session = None

    def initialize(self):
        """Create the HTTP session with a randomly chosen User-Agent."""
        self.session = Session()
        self.session.headers = {
            'User-Agent': AGENT_LIST[randint(0, len(AGENT_LIST) - 1)],
            'Referer': 'https://www.feliratok.info/index.php'
        }

    def terminate(self):
        """Close the HTTP session if one was opened."""
        if self.session is not None:
            self.session.close()

    @staticmethod
    def get_language(text):
        """Map the site's Hungarian language label to a Language, or None if unknown."""
        if text == 'Magyar':
            return Language.fromsupersubtitles('hu')
        if text == 'Angol':
            return Language.fromsupersubtitles('en')
        return None

    def find_imdb_id(self, sub_id):
        """Fetch the detail page of a subtitle and extract the IMDb id linked from it.

        :param sub_id: identifier of the subtitle.
        :return: the text between ``www.imdb.com/title/`` and the closing ``/"`` of the
            first anchor mentioning imdb.com, an empty string if that anchor does not
            match the pattern, or None when no such anchor exists.
        """
        # TODO: add memoization to this method logic

        url = self.server_url + "index.php?tipus=adatlap&azon=a_" + str(sub_id)
        # url = https://www.feliratok.info/index.php?tipus=adatlap&azon=a_1518600916
        logger.info('Get IMDB id from URL %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        links = soup.find_all("a")

        for value in links:
            if "imdb.com" in str(value):
                # <a alt="iMDB" href="http://www.imdb.com/title/tt2357547/" target="_blank">
                imdb_id = re.search(r'(?<=www\.imdb\.com/title/).*(?=/")', str(value))
                imdb_id = imdb_id.group() if imdb_id else ''
                logger.debug("IMDB ID found: %s", imdb_id)
                return imdb_id

        return None

    def find_id(self, series, year, original_title):
        """
        We need to find the id of the series at the following url:
        https://www.feliratok.info/index.php?term=SERIESNAME&nyelv=0&action=autoname
        Where SERIESNAME is a searchable string.
        The result will be something like this:
        [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)",
        "ID":"3789"},{"name":"No Tomorrow (2016)","ID":"4179"}]

        :param series: search term, inserted into the URL as-is (callers URL-encode it).
        :param year: year of the video; when falsy the year is not compared.
        :param original_title: title the results are compared against.
        :return: the ``ID`` of the first matching result, or None.
        """

        # Search for exact name
        url = self.server_url + "index.php?term=" + series + "&nyelv=0&action=autoname"
        # url = self.server_url + "index.php?term=" + "fla"+ "&nyelv=0&action=autoname"
        logger.info('Get series id from URL %s', url)
        r = self.session.get(url, timeout=10)

        # r is something like this:
        # [{"name":"DC\u2019s Legends of Tomorrow (2016)","ID":"3725"},{"name":"Miles from Tomorrowland (2015)",
        # "ID":"3789"},{"name":"No Tomorrow (2016)","ID":"4179"}]
        try:
            results = r.json()
        except JSONDecodeError:
            # provider returned improper JSON
            logger.debug("Invalid series search response from %s", url)
            return None

        if not isinstance(results, list):
            logger.debug("Invalid results: %s", results)
            return None

        # Loop invariant: the title we compare every candidate against
        sanitized_original_title = sanitize(original_title.replace('& ', ''))

        # check all of the results:
        for result in results:
            if not isinstance(result, dict):
                continue

            # "name":"Miles from Tomorrowland (2015)","ID":"3789"
            result_name = result.get('name')
            result_id = result.get('ID')
            if not result_name or result_id is None:
                continue

            result_year = re.search(r"(?<=\()\d\d\d\d(?=\))", result_name)
            result_year = result_year.group() if result_year else ''

            result_title = re.search(r".*(?=\(\d\d\d\d\))", result_name)
            result_title = result_title.group() if result_title else ''

            for title in (result_title, fix_tv_naming(result_title)):

                title = title.strip().replace("�", "").replace("& ", "").replace(" ", ".")
                if not title:
                    continue

                # Build a fake release name so guessit can split title and year for us
                guessable = title.strip() + ".s01e01." + result_year
                guess = guessit(guessable, {'type': "episode"})

                guess_title = guess.get('title')
                if not guess_title or sanitized_original_title != sanitize(guess_title):
                    continue

                if not year:
                    # Return the founded id
                    return result_id

                guess_year = guess.get('year')
                if guess_year and year == guess_year:
                    # Return the founded id
                    return result_id

        return None

    def query(self, series, languages, video=None):
        """Search subtitles for ``video`` using ``series`` as the title to look up.

        :param series: title to search for (series name for episodes, movie title for movies).
        :param languages: languages to search for.
        :param video: the Episode or Movie being searched; required in practice.
        :return: list of subtitles, or None when the series id can not be found.
        """
        year = video.year
        subtitle = None
        if isinstance(video, Episode):
            # Honour the title we were asked to search for (list_subtitles also passes the
            # alternative titles); fall back to the video's own series name.
            series = series or video.series
            season = video.season
            episode = video.episode

            # Get ID of series with original name. The term is URL-encoded so characters
            # such as '&' can not break the query string.
            series_id = self.find_id(urllib.parse.quote_plus(series), year, series)
            if not series_id:
                # If not founded try without ' char
                modified_series = urllib.parse.quote_plus(series.replace('\'', ''))
                series_id = self.find_id(modified_series, year, series)
                if not series_id and modified_series:
                    # If still not founded try with the longest word is series title
                    modified_series = max(modified_series.split('+'), key=len)
                    series_id = self.find_id(modified_series, year, series)

            if not series_id:
                return None

            subtitle = self.retrieve_series_subtitles(series_id, season, episode, video, languages)

        if isinstance(video, Movie):
            title = urllib.parse.quote_plus(series)

            # https://www.feliratok.info/index.php?search=The+Hitman%27s+BodyGuard&soriSorszam=&nyelv=&tab=film
            url = self.server_url + "index.php?search=" + title + "&soriSorszam=&nyelv=&tab=film"
            subtitle = self.process_subs(languages, video, url)

        return subtitle

    def retrieve_series_subtitles(self, series_id, season, episode, video, languages):
        """
        Retrieve subtitles for a given episode

        :param series_id: the ID of the series returned by @find_id.
        :param season: the season number
        :param episode: the episode number
        :param video: video details
        :param languages: languages to search for
        :return: list of subtitles for the given episode
        """
        if isinstance(video, Movie):
            return None

        subtitles = []

        logger.info('Getting the list of subtitles for %s', video)

        # First, try using every param that we got
        episode_subs, _ = self.get_subtitle_list(series_id, season, episode, video)

        if episode_subs:
            sub_list = episode_subs
        else:
            # Sometimes the site is a bit buggy when you are searching for an episode sub that is only
            # present in a season pack, so we have to make a separate call for that without supplying
            # the episode number
            _, sub_list = self.get_subtitle_list(series_id, season, None, video)

        series_imdb_id = None
        imdb_looked_up = False

        # Convert the list of subtitles for the proper format
        for sub in sub_list.values():
            # Since it is not possible to narrow down the languages in the request, we need to filter
            # out the inappropriate elements
            if sub['language'] not in languages:
                continue

            # server_url already ends with a slash
            link = self.server_url + 'index.php?action=letolt&felirat=' + str(sub['id'])

            # For episodes we open the series page so all subtitles imdb_id must be the same.
            # Look it up only once, even when nothing is found, to avoid one request per subtitle.
            if not imdb_looked_up:
                series_imdb_id = self.find_imdb_id(sub['id'])
                imdb_looked_up = True

            # Let's create a SuperSubtitlesSubtitle instance from the data that we got and add it to the list
            subtitles.append(SuperSubtitlesSubtitle(sub['language'], link, sub['id'], sub['name'], sub['season'],
                                                    sub['episode'], ', '.join(sub['releases']), sub['releases'],
                                                    video.year, series_imdb_id, sub['uploader'], video.episode,
                                                    asked_for_release_group=video.release_group))

        return subtitles

    def get_subtitle_list(self, series_id, season, episode, video):
        """
        We can retrieve the list of subtitles for a given show via the following url:
        https://www.feliratok.info/index.php?action=xbmc&sid=SERIES_ID&ev=SEASON&rtol=EPISODE
        SERIES_ID is the ID of the show returned by the @find_id method. It is a mandatory parameter.
        SEASON is the season number. Optional parameter.
        EPISODE is the episode number. Optional parameter (using this param can cause problems).

        NOTE: you gonna get back multiple records for the same subtitle, in case it is compatible with
        multiple releases

        In order to work, the result should be a JSON like this:
        {
            "10": {
                "language":"Angol",
                "nev":"The Flash (Season 5) (1080p)",
                "baselink":"http://www.feliratok.info/index.php",
                "fnev":"The.Flash.S05.HDTV.WEB.720p.1080p.ENG.zip",
                "felirat":"1560706755",
                "evad":"5",
                "ep":"-1",
                "feltolto":"J1GG4",
                "pontos_talalat":"111",
                "evadpakk":"1"
            }, ...
        }

        :return: tuple ``(episode_subtitles, season_packs)``; both are dicts keyed by subtitle id.
        """

        # Construct the url
        url = self.server_url + "index.php?action=xbmc&sid=" + str(series_id) + "&ev=" + str(season)

        # Use the 'rtol' param in case we have a valid episode number
        if episode:
            url += "&rtol=" + str(episode)

        try:
            results = self.session.get(url, timeout=10).json()
        except JSONDecodeError:
            # provider returned improper JSON
            results = None

        subtitle_list = {}
        season_pack_list = {}

        # Check the results. Anything but a non-empty mapping (e.g. a list or None) is ignored.
        if not results or not isinstance(results, dict):
            logger.debug("Invalid results: %s", results)
            return subtitle_list, season_pack_list

        for result in results.values():
            # Gonna get back multiple records for the same subtitle, in case it is compatible with
            # multiple releases, so we have to group them manually
            sub_id = int(result['felirat'])

            # 'Nev' is something like this:
            # Marvel's The Falcon and the Winter Soldier - 1x05 (WEB.2160p-KOGi)
            # or
            # Loki (Season 1) (DSNP.WEB-DL.720p-TOMMY)
            search_name = re.search(r'^(.*)\s(?:-\s\d+x\d+|(\(Season\s\d+\)))?\s\((.*)\)$', result['nev'])
            name = search_name.group(1) if search_name else ''
            release = search_name.group(3) if search_name else ''

            # In case of 0 it is an episode sub, in other cases, it is a season pack.
            # The value arrives as a string, so it must be converted before testing it.
            is_season_pack = bool(int(result['evadpakk']))
            target = season_pack_list if is_season_pack else subtitle_list

            # Check that this sub_id is not already in the list
            if sub_id in target:
                target[sub_id]['releases'].append(release)
                continue

            if is_season_pack:
                # A season pack has no episode of its own; use the one we are looking for
                episode_number = int(video.episode)
            else:
                try:
                    episode_number = int(result['ep'])
                except (TypeError, ValueError):
                    episode_number = int(video.episode)

            target[sub_id] = {
                'id': sub_id,
                'name': name,
                'language': self.get_language(result['language']),
                'season': int(result['evad']),
                'episode': episode_number,
                'uploader': result['feltolto'],
                'releases': [release],
                'fname': result['fnev']
            }

        return subtitle_list, season_pack_list

    def process_subs(self, languages, video, url):
        """Parse the movie search result page at ``url`` into subtitles.

        :param languages: languages to keep.
        :param video: the Movie being searched (None is returned for an Episode).
        :param url: search result page to fetch.
        :return: list of subtitles in the wanted languages.
        """
        if isinstance(video, Episode):
            return None

        subtitles = []

        logger.info('URL for subtitles %s', url)
        r = self.session.get(url, timeout=10).content

        soup = ParserBeautifulSoup(r, ['lxml'])
        tables = soup.find_all("table")
        if not tables:
            logger.debug('No result table found at %s', url)
            return subtitles

        rows = tables[0].find_all("tr")

        for index, row in enumerate(rows):
            # The first two rows are skipped; result rows carry the "vilagit" marker
            if index <= 1 or "vilagit" not in str(row):
                continue

            asked_for_episode = None
            sub_season = None
            sub_episode = None

            # NOTE(review): the HTML fragments in the patterns below were reconstructed from
            # the surrounding lookups (div class "eredeti", <small>, "&amp;") - confirm
            # against the live page markup.
            sub_english = str(row.find_all("div", {"class": "eredeti"}))
            sub_english_name = re.search(r'(?<=<div class="eredeti">).*?(?=</div>)', sub_english)
            sub_english_name = sub_english_name.group() if sub_english_name else ''
            sub_english_name = sub_english_name.split(' (')[0]
            sub_english_name = sub_english_name.replace('&amp;', '&')

            # The version is the content of the last parenthesised group
            sub_version = 'n/a'
            version_parts = sub_english.split('(')
            if len(version_parts) > 1:
                sub_version = version_parts[-1].split(')')[0]

            # <small>Angol</small>
            lang = row.find("small")
            sub_language = re.search(r"(?<=<small>).*(?=</small>)", str(lang))
            sub_language = sub_language.group() if sub_language else ''
            sub_language = self.get_language(sub_language)

            # Skip unwanted languages before doing any further (network) work
            if sub_language not in languages:
                continue

            # The last anchor of the row is the download link, e.g.
            # <a href="/index.php?action=letolt&amp;fnev=...&amp;felirat=1519162191">
            anchors = row.find_all("a")
            if not anchors:
                continue
            link = str(anchors[-1]).replace("amp;", "")
            sub_downloadlink = re.search(r'(?<=href="/).*(?=">)', link)
            sub_downloadlink = sub_downloadlink.group() if sub_downloadlink else ''
            sub_downloadlink = self.server_url + sub_downloadlink

            sub_id = re.search(r"(?<=felirat=).*(?=\">)", link)
            sub_id = sub_id.group() if sub_id else ''
            if not sub_id:
                logger.debug('Could not find a subtitle id in %s', link)
                continue

            sub_year = video.year
            sub_releases = [s.strip() for s in sub_version.split(',')]

            uploader = ''
            try:
                uploader_cell = row.contents[7].contents
            except (IndexError, AttributeError):
                uploader_cell = []
            for item in uploader_cell:
                # Strip the surrounding line breaks and tabs of the cell content
                if isinstance(item, Tag):
                    uploader = item.text.strip('\r\n\t')
                elif isinstance(item, NavigableString):
                    uploader = item.strip('\r\n\t')

            sub_imdb_id = self.find_imdb_id(sub_id)

            subtitles.append(SuperSubtitlesSubtitle(sub_language, sub_downloadlink, sub_id,
                                                    sub_english_name.strip(), sub_season, sub_episode,
                                                    sub_version, sub_releases, sub_year, sub_imdb_id,
                                                    uploader, asked_for_episode,
                                                    asked_for_release_group=video.release_group))

        return subtitles

    def list_subtitles(self, video, languages):
        """List subtitles for ``video``, querying its title and every alternative title.

        :param video: the Episode or Movie to search subtitles for.
        :param languages: languages to search for.
        :return: list of subtitles, each subtitle id at most once.
        """
        titles = []
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
        elif isinstance(video, Movie):
            titles = [video.title] + video.alternative_titles

        subtitles = []
        seen_ids = set()

        for index, title in enumerate(titles):
            # Throttle between consecutive queries only
            if index:
                time.sleep(self.multi_result_throttle)

            subs = self.query(title, languages, video=video)
            if not subs:
                continue

            for item in subs:
                if isinstance(video, Episode):
                    fixed_title = fix_tv_naming(item.series)
                else:
                    fixed_title = fix_movie_naming(item.series)

                # Check for the original and the fixed titles too
                if not any(x in (fixed_title.strip(), item.series) for x in titles):
                    continue

                # Different titles can lead to the same subtitle; keep it only once
                if item.id in seen_ids:
                    continue
                seen_ids.add(item.id)
                subtitles.append(item)

        return subtitles

    def download_subtitle(self, subtitle):
        """Download ``subtitle`` and store its content, unpacking rar/zip archives.

        :param subtitle: the subtitle to download; its ``content`` attribute is set.
        """
        # download as a zip
        logger.info('Downloading subtitle %r', subtitle.subtitle_id)
        r = self.session.get(subtitle.page_link, timeout=10)
        r.raise_for_status()

        archive_stream = io.BytesIO(r.content)
        archive = None

        if is_rarfile(archive_stream):
            archive = RarFile(archive_stream)
        elif is_zipfile(archive_stream):
            archive = ZipFile(archive_stream)
        else:
            # Not an archive: the response is the subtitle itself
            subtitle.content = fix_line_ending(r.content)

        if archive is not None:
            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)