# Source path: bazarr/libs/subliminal_patch/providers/titulky.py
# (file viewer metadata: 472 lines, 19 KiB, Python)

# -*- coding: utf-8 -*-
import enum
import io
import logging
import re
import zipfile
from random import randint
from urllib.parse import urljoin, urlparse, parse_qs, quote
import rarfile
from guessit import guessit
from requests import Session
from requests.adapters import HTTPAdapter
from requests.exceptions import HTTPError
from subliminal.cache import region as cache
from subliminal.exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError
from subliminal.providers import ParserBeautifulSoup
from subliminal.subtitle import fix_line_ending
from subliminal.video import Episode, Movie
from subliminal_patch.providers import Provider
from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
from subliminal_patch.subtitle import Subtitle, guess_matches
from dogpile.cache.api import NO_VALUE
from subzero.language import Language
from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
@enum.unique
class SubtitlesType(enum.Enum):
    """Kind of media a subtitle search is performed for."""

    # Values are assigned automatically, in declaration order.
    EPISODE = enum.auto()
    MOVIE = enum.auto()
class TitulkySubtitle(Subtitle):
    """A single subtitle entry listed by the 'titulky' provider.

    Besides the language and page link handed to the base ``Subtitle``, an
    instance carries the provider's subtitle id, the IMDB id it was found
    under, season/episode numbers, release name, uploader, approval flag and
    the download link.
    """

    provider_name = 'titulky'

    hash_verifiable = False
    hearing_impaired_verifiable = False

    def __init__(self, sub_id, imdb_id, language, season, episode,
                 release_info, uploader, approved, page_link, download_link,
                 asked_for_episode=None):
        """Store all metadata describing one subtitle entry."""
        super().__init__(language, page_link=page_link)

        self.sub_id = sub_id
        self.imdb_id = imdb_id
        self.season = season
        self.episode = episode
        # The release name is kept both as a one-element list and as a string.
        self.releases = [release_info]
        self.release_info = release_info
        self.language = language
        self.approved = approved
        self.page_link = page_link
        self.uploader = uploader
        self.download_link = download_link
        self.asked_for_episode = asked_for_episode
        # Filled in by get_matches().
        self.matches = None

    @property
    def id(self):
        """Return the provider's subtitle id."""
        return self.sub_id

    def get_matches(self, video):
        """Compute, store and return the set of match names for ``video``.

        IMDB id and season/episode comparisons are done here; everything else
        comes from ``guess_matches`` applied to the guessit result of the
        release name.
        """
        is_movie = isinstance(video, Movie)
        kind = 'movie' if is_movie else 'episode'
        found = set()

        if is_movie:
            # Same IMDB id: title and year are counted as matched as well.
            if video.imdb_id and video.imdb_id == self.imdb_id:
                found.update(('imdb_id', 'title', 'year'))
        else:
            # Same series IMDB id: series and year are counted as matched too.
            if video.series_imdb_id and video.series_imdb_id == self.imdb_id:
                found.update(('series_imdb_id', 'series', 'year'))

            if self.season and self.season == video.season:
                found.add('season')

            if self.episode and self.episode == video.episode:
                found.add('episode')

        guessed = guessit(self.release_info, {"type": kind})
        found |= guess_matches(video, guessed)

        self.matches = found
        return found
class TitulkyProvider(Provider, ProviderSubtitleArchiveMixin):
    """Subtitle provider backed by https://premium.titulky.com.

    Offers subtitles in the 'ces' and 'slk' languages for episodes and
    movies. The provider logs in with a username/password, caches the session
    cookies and the chosen User-Agent, and scrapes the per-season browse page
    to list subtitles.
    """

    languages = {Language(l) for l in ['ces', 'slk']}
    video_types = (Episode, Movie)
    hash_verifiable = False
    hearing_impaired_verifiable = False

    server_url = 'https://premium.titulky.com'
    login_url = server_url
    logout_url = f"{server_url}?action=logout"
    download_url = f"{server_url}/download.php?id="

    timeout = 30
    max_threads = 5

    subtitle_class = TitulkySubtitle

    def __init__(self, username=None, password=None, approved_only=None):
        """Validate and store the provider configuration.

        :param username: account login name (required).
        :param password: account password (required).
        :param approved_only: when True, unapproved subtitles are skipped.
        :raises ConfigurationError: if a credential is missing or
            ``approved_only`` is not a boolean.
        """
        if not all([username, password]):
            raise ConfigurationError("Username and password must be specified!")

        if not isinstance(approved_only, bool):
            raise ConfigurationError(f"Approved_only {approved_only} must be a boolean!")

        self.username = username
        self.password = password
        self.approved_only = approved_only

        # Created in initialize().
        self.session = None

    def initialize(self):
        """Create the HTTP session, set its headers and log in."""
        self.session = Session()

        # Reuse the cached User-Agent if there is one, otherwise pick a random
        # one from the agent list and cache it.
        cached_user_agent = cache.get('titulky_user_agent')
        if cached_user_agent == NO_VALUE:
            new_user_agent = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
            cache.set('titulky_user_agent', new_user_agent)
            self.session.headers['User-Agent'] = new_user_agent
        else:
            self.session.headers['User-Agent'] = cached_user_agent

        self.session.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        self.session.headers['Accept-Language'] = 'sk,cz,en;q=0.5'
        self.session.headers['Accept-Encoding'] = 'gzip, deflate'
        self.session.headers['DNT'] = '1'
        self.session.headers['Connection'] = 'keep-alive'
        self.session.headers['Upgrade-Insecure-Requests'] = '1'
        self.session.headers['Cache-Control'] = 'max-age=0'

        self.login()

    def terminate(self):
        """Close the HTTP session, if one was created."""
        if self.session is not None:
            self.session.close()

    @staticmethod
    def _location_query(res):
        """Return the parsed query string of the response's Location header.

        An absent header yields an empty dict instead of raising KeyError.
        """
        return parse_qs(urlparse(res.headers.get('Location', '')).query)

    def login(self, bypass_cache=False):
        """Log in, or reuse cached cookies unless ``bypass_cache`` is set.

        :param bypass_cache: when True, cached cookies are ignored and a fresh
            login request is sent.
        :return: True on success.
        :raises AuthenticationError: if the login is rejected, or the account
            is reported as limited (a V.I.P. account is required).
        """
        # Reuse all cookies if found in cache and skip login.
        cached_cookiejar = cache.get('titulky_cookiejar')
        if not bypass_cache and cached_cookiejar != NO_VALUE:
            logger.info("Titulky.com: Reusing cached cookies.")
            self.session.cookies.update(cached_cookiejar)
            return True

        logger.debug("Titulky.com: Logging in...")

        data = {'LoginName': self.username, 'LoginPassword': self.password}
        res = self.session.post(self.server_url,
                                data,
                                allow_redirects=False,
                                timeout=self.timeout,
                                headers={'Referer': self.server_url})

        location_qs = self._location_query(res)
        msg_type = location_qs.get('msg_type', [''])[0]

        # A redirect that does not point to an error message page means we
        # are logged in. Anything else (including a reply without a Location
        # header) is a failed login.
        if res.status_code != 302 or msg_type != 'i':
            raise AuthenticationError("Login failed")

        message = location_qs.get('msg', [''])[0]
        if 'omezené' in message.lower():
            raise AuthenticationError("V.I.P. account is required for this provider to work!")

        logger.info("Titulky.com: Successfully logged in, caching cookies for future connections...")
        cache.set('titulky_cookiejar', self.session.cookies.copy())
        return True

    def logout(self):
        """Log out and drop the cached cookies and User-Agent.

        :return: True on success.
        :raises AuthenticationError: if the server does not confirm the logout.
        """
        logger.info("Titulky.com: Logging out")

        res = self.session.get(self.logout_url,
                               allow_redirects=False,
                               timeout=self.timeout,
                               headers={'Referer': self.server_url})

        location_qs = self._location_query(res)

        # The cache is cleared regardless of the outcome of the request.
        logger.info("Titulky.com: Clearing cache...")
        cache.delete('titulky_cookiejar')
        cache.delete('titulky_user_agent')

        # A redirect that does not point to an error message page means we
        # are logged out.
        if res.is_redirect and location_qs.get('msg_type', [''])[0] == 'i':
            return True

        raise AuthenticationError("Logout failed.")

    def get_request(self, url, ref=server_url, allow_redirects=False, _recursion=0):
        """GET ``url`` through the session, handling redirects ourselves.

        Acts as a proxy for ``session.get`` that copes with expired cached
        cookies: when the server redirects to a "please log in" error page,
        it logs in again and repeats the original request. Redirects are
        followed manually so that redirects to an error page are NOT followed
        and can be inspected instead.

        :param url: URL to fetch.
        :param ref: value for the Referer header (URL-quoted), or a falsy
            value to send none.
        :param allow_redirects: follow non-error redirects when True.
        :param _recursion: internal depth counter, do not pass.
        :return: the final response object.
        :raises AuthenticationError: after 10 nested redirects/relogins.
        """
        # Stop instead of recursing forever and spamming the server.
        if _recursion >= 10:
            raise AuthenticationError("Got into a redirect loop! Oops.")

        logger.debug(f"Titulky.com: Fetching url: {url}")

        res = self.session.get(
            url,
            timeout=self.timeout,
            allow_redirects=False,
            headers={'Referer': quote(ref) if ref else None})  # URL encode ref if it has value

        if not res.is_redirect:
            return res

        # The caller does not want redirects followed: hand back the redirect
        # response itself.
        if allow_redirects is False:
            return res

        location_qs = self._location_query(res)
        msg_type = location_qs.get('msg_type', [None])[0]

        # Follow the Location header unless it points to an error page
        # (msg_type == 'e'). The Origin header is optional, so fall back to
        # the server URL as the base when it is missing.
        if allow_redirects is True and msg_type != 'e':
            base_url = res.headers.get('Origin') or self.server_url
            return self.get_request(urljoin(base_url, res.headers['Location']),
                                    ref=url,
                                    allow_redirects=True,
                                    _recursion=(_recursion + 1))

        # Check if we got redirected because login cookies expired.
        message = location_qs.get('msg', [''])[0]
        if "přihlašte" in message.lower():
            logger.info("Titulky.com: Login cookies expired.")
            self.login(bypass_cache=True)
            return self.get_request(url,
                                    ref=ref,
                                    allow_redirects=True,
                                    _recursion=(_recursion + 1))

        return res

    def fetch_page(self, url, ref=server_url, allow_redirects=False):
        """Fetch ``url`` and return the response text.

        :raises HTTPError: if the final status code is not 200.
        :raises ProviderError: if the response body is empty.
        """
        res = self.get_request(url, ref=ref, allow_redirects=allow_redirects)

        if res.status_code != 200:
            raise HTTPError(f"Fetch failed with status code {res.status_code}")
        if not res.text:
            raise ProviderError("No response returned from the provider")

        return res.text

    def build_url(self, params):
        """Build a server URL with ``params`` as its query string.

        Keys and values are inserted as-is (no percent-encoding); spaces in
        the resulting URL are replaced with '+'. With no params the bare
        ``<server_url>/`` is returned.
        """
        query_string = '&'.join(f'{key}={value}' for key, value in params.items())
        if query_string:
            result = f"{self.server_url}/?{query_string}"
        else:
            result = f"{self.server_url}/"

        return result.replace(' ', '+')

    def _parse_rows(self, rows, languages):
        """Parse browse-page rows into ``{episode_number: [subtitle_info, ...]}``.

        Rows holding an ``<h5>`` give the episode number for the subtitle rows
        that follow them; rows with class 'pbl1' (approved) or 'pbl0' (not
        approved) hold a link to the subtitle details. Rows in an unknown or
        unrequested language, unapproved rows (when ``approved_only`` is on)
        and rows without a usable link/id are skipped.

        :raises ProviderError: if an episode number cannot be parsed or a
            subtitle row appears before any episode number.
        """
        episodes_dict = {}
        last_ep_num = None

        for row in rows:
            # This element holds the episode number of following row(s) of
            # subtitles, e.g.: 1., 2., 3., 4.
            number_container = row.find('h5')

            row_classes = row.get('class') or []
            is_subtitle_row = 'pbl1' in row_classes or 'pbl0' in row_classes
            # Link to the sub details
            anchor = row.find('a') if is_subtitle_row else None

            if number_container:
                # Remove period at the end and parse the string into a number;
                # an empty label counts as episode 0.
                number_str = number_container.text.strip().rstrip('.')
                try:
                    last_ep_num = int(number_str) if number_str else 0
                except ValueError as error:
                    raise ProviderError("Could not parse episode number!") from error
                continue

            if not anchor:
                continue

            if last_ep_num is None:
                raise ProviderError("Previous episode number missing, can't parse.")

            release_info = anchor.get_text(strip=True)
            if release_info == '???':
                release_info = ''

            href = anchor.get('href')
            if not href:
                logger.debug("Titulky.com: Subtitle row without a link, skipping...")
                continue

            # The first character of the href is dropped before it is appended
            # to the server URL.
            details_link = f"{self.server_url}{href[1:]}"

            # Without an id there is nothing to download, so skip the row
            # rather than build a link ending in "id=None".
            id_match = re.search(r'id=(\d+)', details_link)
            if id_match is None:
                logger.debug("Titulky.com: Could not parse subtitle id, skipping...")
                continue
            sub_id = id_match.group(1)
            download_link = f"{self.download_url}{sub_id}"

            # Approved subtitles have a pbl1 class for their row, others have
            # a pbl0 class
            approved = 'pbl1' in row_classes

            # NOTE(review): the uploader is read from the 6th child node of
            # the row; fall back to '' if the row has fewer children.
            row_children = row.contents
            uploader = row_children[5].get_text(strip=True) if len(row_children) > 5 else ''

            # Parse language to filter out subtitles that are not in the
            # desired language. Exactly one of the two flags must be present.
            czech_flag = row.select('img[src*=\'flag-CZ\']')
            slovak_flag = row.select('img[src*=\'flag-SK\']')

            if czech_flag and not slovak_flag:
                sub_language = Language('ces')
            elif slovak_flag and not czech_flag:
                sub_language = Language('slk')
            else:
                logger.debug("Titulky.com: Unknown language while parsing subtitles!")
                continue

            # If the subtitles language is not requested
            if sub_language not in languages:
                logger.debug("Titulky.com: Language not in desired languages, skipping...")
                continue

            # Skip unapproved subtitles if turned on in settings
            if self.approved_only and not approved:
                logger.debug("Titulky.com: Approved only, skipping...")
                continue

            episodes_dict.setdefault(last_ep_num, []).append({
                'id': sub_id,
                'release_info': release_info,
                'approved': approved,
                'language': sub_language,
                'uploader': uploader,
                'details_link': details_link,
                'download_link': download_link
            })

        return episodes_dict

    def query(self, languages, media_type, imdb_id, season=0, episode=0):
        """List subtitles for one episode (or movie) by IMDB id.

        There are multiple ways to find subtitles on Titulky.com; this one
        uses the page that lists all available subtitles for all episodes in
        a season. The server treats movies as a TV series with a "0" season
        and "0" episode.

        BROWSE subtitles by IMDB ID:
          - Subtitles are categorised by seasons and episodes
          - URL: https://premium.titulky.com/?action=serial&step=<SEASON>&id=<IMDB ID>
          - the URL seems to redirect to a page with the site's own internal
            ID, so redirects are allowed here

        :param languages: requested languages; other subtitles are skipped.
        :param media_type: a ``SubtitlesType`` member.
        :param imdb_id: IMDB id, with or without the "tt" prefix.
        :param season: season number (0 for movies).
        :param episode: episode number (0 for movies).
        :return: list of ``subtitle_class`` instances, possibly empty.
        """
        params = {
            'action': 'serial',
            # If browsing subtitles for a movie, then set the step parameter to 0
            'step': season,
            # Remove the "tt" prefix
            'id': imdb_id[2:] if imdb_id.startswith('tt') else imdb_id
        }
        browse_url = self.build_url(params)
        html_src = self.fetch_page(browse_url, allow_redirects=True)

        browse_page_soup = ParserBeautifulSoup(html_src, ['lxml', 'html.parser'])
        try:
            # Container element containing subtitle div rows, None if the
            # series was not found or similar
            container = browse_page_soup.find('form', class_='cloudForm')
            if not container:
                logger.info("Titulky.com: Could not find container element. No subtitles found.")
                return []

            # All rows: subtitle rows, episode number rows, useless rows...
            all_rows = container.find_all('div', class_='row')
            episodes_dict = self._parse_rows(all_rows, languages)
        finally:
            # Clean up the parsed tree on every exit path.
            browse_page_soup.decompose()
            browse_page_soup = None

        # Rows parsed into episodes_dict, now let's read what we got.
        sub_infos = episodes_dict.get(episode)
        if not sub_infos:
            # well, we got nothing, that happens!
            logger.info("Titulky.com: No subtitles found")
            return []

        is_episode = media_type is SubtitlesType.EPISODE

        # After parsing, create new instances of Subtitle class
        return [
            self.subtitle_class(
                sub_info['id'],
                imdb_id,
                sub_info['language'],
                season if is_episode else None,
                episode if is_episode else None,
                sub_info['release_info'],
                sub_info['uploader'],
                sub_info['approved'],
                sub_info['details_link'],
                sub_info['download_link'],
                asked_for_episode=is_episode
            )
            for sub_info in sub_infos
        ]

    def list_subtitles(self, video, languages):
        """Return subtitles for ``video`` in ``languages``.

        Videos without an IMDB id (series IMDB id for episodes) are skipped
        and yield an empty list.
        """
        subtitles = []

        if isinstance(video, Episode):
            if video.series_imdb_id:
                logger.info("Titulky.com: Searching subtitles for a TV series episode")
                subtitles = self.query(languages, SubtitlesType.EPISODE,
                                       imdb_id=video.series_imdb_id,
                                       season=video.season,
                                       episode=video.episode)
            else:
                logger.info(f"Titulky.com: Skipping {video}! No IMDB ID found.")
        elif isinstance(video, Movie):
            if video.imdb_id:
                logger.info("Titulky.com: Searching subtitles for a movie")
                subtitles = self.query(languages, SubtitlesType.MOVIE, imdb_id=video.imdb_id)
            else:
                logger.info(f"Titulky.com: Skipping {video}! No IMDB ID found.")

        return subtitles

    def download_subtitle(self, subtitle):
        """Download ``subtitle`` and store its content on the instance.

        The response is treated as a rar or zip archive when it is one;
        otherwise its body is used directly (with line endings fixed).

        :raises HTTPError: if the download request returns an error status.
        :raises DownloadLimitExceeded: if a single-entry archive yields no
            subtitle content.
        :raises ProviderError: if a non-archive response has no content.
        """
        res = self.get_request(subtitle.download_link, ref=subtitle.page_link)

        try:
            res.raise_for_status()
        except HTTPError as error:
            raise HTTPError(
                f"An error occured during the download request to {subtitle.download_link}"
            ) from error

        archive_stream = io.BytesIO(res.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug("Titulky.com: Identified rar archive")
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug("Titulky.com: Identified zip archive")
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
        else:
            subtitle_content = fix_line_ending(res.content)

        if not subtitle_content:
            if archive and len(archive.infolist()) > 1:
                logger.info("Titulky.com: Couldn't find a proper subtitle file in the downloaded archive.")
            elif archive and len(archive.infolist()) == 1:
                raise DownloadLimitExceeded("Subtitles download limit has been exceeded")
            else:
                raise ProviderError("No subtitles provided from titulky")

        subtitle.content = subtitle_content