# bazarr/libs/subliminal_patch/providers/opensubtitlescom.py
#
# 607 lines
# 24 KiB
# Python

# -*- coding: utf-8 -*-
import logging
import os
import time
import datetime
import json
from requests import Session, ConnectionError, Timeout, ReadTimeout, RequestException
from requests.exceptions import JSONDecodeError
from subzero.language import Language
from babelfish import language_converters
from subliminal import Episode, Movie
from subliminal.score import get_equivalent_release_groups
from subliminal.utils import sanitize_release_group, sanitize
from subliminal_patch.exceptions import TooManyRequests, APIThrottled
from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError, ConfigurationError, ServiceUnavailable, \
ProviderError
from .mixins import ProviderRetryMixin
from subliminal_patch.subtitle import Subtitle
from subliminal.subtitle import fix_line_ending, SUBTITLE_EXTENSIONS
from subliminal_patch.providers import Provider
from subliminal_patch.subtitle import guess_matches
from subliminal_patch.utils import fix_inconsistent_naming
from subliminal.cache import region
from dogpile.cache.api import NO_VALUE
from guessit import guessit
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cache lifetimes expressed in seconds (floats, as returned by total_seconds()).
SHOW_EXPIRATION_TIME = datetime.timedelta(days=7).total_seconds()
TOKEN_EXPIRATION_TIME = datetime.timedelta(minutes=12 * 60).total_seconds()

# Value passed as ``amount`` to ``self.retry(...)`` by the provider below.
retry_amount = 3
def fix_tv_naming(title):
    """Fix TV show titles with inconsistent naming using dictionary, but do not sanitize them.

    :param str title: original title.
    :return: new title.
    :rtype: str
    """
    # Known spellings that differ between the local library and the provider.
    replacements = {"Superman & Lois": "Superman and Lois"}
    # Third positional argument is forwarded unchanged; presumably it disables
    # sanitizing (see the summary line above) -- confirm in fix_inconsistent_naming.
    return fix_inconsistent_naming(title, replacements, True)
def fix_movie_naming(title):
    """Fix movie titles with inconsistent naming using dictionary.

    The replacement dictionary is currently empty.

    :param str title: original title.
    :return: whatever ``fix_inconsistent_naming`` returns for this title.
    """
    replacements = {}
    return fix_inconsistent_naming(title, replacements, True)
# Language codes that are rewritten before being sent to the provider
# (and mapped back when reading its answers).
custom_languages = dict(
    pt='pt-PT',
    zh='zh-CN',
)
def to_opensubtitlescom(lang):
    """Map a language code to the provider's code.

    :param lang: language code to translate.
    :return: the entry of ``custom_languages`` for ``lang``, or ``lang`` itself when there is none.
    """
    return custom_languages.get(lang, lang)
def from_opensubtitlescom(lang):
    """Map a provider language code back to the local code.

    :param lang: language code received from the provider.
    :return: the key of ``custom_languages`` whose value is ``lang``, or ``lang`` itself when there is none.
    """
    # Inverted view of custom_languages: provider code -> local code.
    reverse_lookup = dict(zip(custom_languages.values(), custom_languages.keys()))
    return reverse_lookup.get(lang, lang)
class OpenSubtitlesComSubtitle(Subtitle):
    """Subtitle entry built from an OpenSubtitles.com search result."""
    provider_name = 'opensubtitlescom'
    hash_verifiable = True
    hearing_impaired_verifiable = True

    def __init__(self, language, forced, hearing_impaired, page_link, file_id, releases, uploader, title, year,
                 hash_matched, file_hash=None, season=None, episode=None, imdb_match=False):
        """Store the search-result fields on the subtitle.

        The base class is initialised with the language as received; the instance then keeps
        a language rebuilt with the ``hi`` and ``forced`` flags.
        """
        super().__init__(language, hearing_impaired, page_link)
        language = Language.rebuild(language, hi=hearing_impaired, forced=forced)

        # what the subtitle is for
        self.title = title
        self.year = year
        self.season = season
        self.episode = episode

        # release string, exposed under both names
        self.releases = self.release_info = releases

        # language flags
        self.language = language
        self.hearing_impaired = hearing_impaired
        self.forced = forced

        # download bookkeeping
        self.file_id = file_id
        self.page_link = page_link
        self.download_link = None
        self.uploader = uploader
        self.matches = None
        self.hash = file_hash
        self.encoding = 'utf-8'
        self.hash_matched = hash_matched
        self.imdb_match = imdb_match

    @property
    def id(self):
        """Identifier of the subtitle: the provider's file id."""
        return self.file_id

    def get_matches(self, video):
        """Compute the set of match names for ``video``, store it in ``self.matches`` and return it."""
        is_movie = isinstance(video, Movie)
        media_type = "movie" if is_movie else "episode"
        found = set()

        if is_movie:
            # movies: title always counts, imdb only when the search was done by imdb id
            found.add('title')
            if self.imdb_match:
                found.add('imdb_id')
        else:
            # episodes: series always counts, then season / episode / imdb
            found.add('series')
            if self.season == video.season:
                found.add('season')
            if self.episode == video.episode:
                found.add('episode')
            if self.imdb_match:
                found.add('series_imdb_id')

        # checks shared by movies and episodes
        if self.year == video.year:
            found.add('year')

        if video.release_group and self.releases:
            wanted_group = sanitize_release_group(video.release_group)
            for candidate in get_equivalent_release_groups(wanted_group):
                if candidate in sanitize_release_group(self.releases):
                    found.add('release_group')
                    break

        if self.hash_matched:
            found.add('hash')

        # everything else is derived from guessing the release string
        guessed = guessit(self.releases, {"type": media_type})
        found |= guess_matches(video, guessed)

        self.matches = found
        return found
class OpenSubtitlesComProvider(ProviderRetryMixin, Provider):
    """OpenSubtitlesCom Provider.

    Talks to the ``/api/v1/`` endpoints of ``server_hostname`` (``login``, ``features``,
    ``subtitles`` and ``download``). The token and server hostname returned by the login
    endpoint are cached in ``region`` under ``oscom_token`` and ``oscom_server``.
    """
    server_hostname = 'api.opensubtitles.com'

    # Base languages first, then forced variants, then hearing-impaired variants of
    # everything collected so far.
    languages = {Language.fromopensubtitles(lang) for lang in language_converters['szopensubtitles'].codes}
    languages.update(set(Language.rebuild(lang, forced=True) for lang in languages))
    languages.update(set(Language.rebuild(lang, hi=True) for lang in languages))

    video_types = (Episode, Movie)

    def __init__(self, username=None, password=None, use_hash=True, include_ai_translated=False, api_key=None):
        """Validate the configuration and prepare the HTTP session.

        :param username: account user name (required).
        :param password: account password (required).
        :param use_hash: send the video's ``opensubtitlescom`` hash with subtitle searches.
        :param include_ai_translated: keep AI translated subtitles in the results.
        :param api_key: value sent in the ``Api-Key`` header (required).
        :raise ConfigurationError: when username, password or api_key is missing.
        """
        if not all((username, password)):
            raise ConfigurationError('Username and password must be specified')

        if not api_key:
            raise ConfigurationError('Api_key must be specified')

        self.session = Session()
        self.session.headers = {'User-Agent': os.environ.get("SZ_USER_AGENT", "Sub-Zero/2"),
                                'Api-Key': api_key,
                                'Content-Type': 'application/json'}
        self.token = None
        self.username = username
        self.password = password
        self.video = None
        self.use_hash = use_hash
        self.include_ai_translated = include_ai_translated
        self._started = None

    def initialize(self):
        """Restore the cached token/server, or log in when either one is missing."""
        self._started = time.time()

        # Read each cache entry once: a second read could come back as NO_VALUE if the
        # entry expired in between.
        token = region.get("oscom_token", expiration_time=TOKEN_EXPIRATION_TIME)
        server = region.get("oscom_server", expiration_time=TOKEN_EXPIRATION_TIME)

        if token is NO_VALUE or server is NO_VALUE:
            logger.debug("No cached token or server, we'll try to login again.")
            # login() refreshes both the token and the server hostname.
            self.login()
        else:
            # NOTE(review): the session 'Authorization' header is only managed by login();
            # it is not restored here for a cached 'vip' server -- confirm that is intended.
            self.token = token
            self.server_hostname = server

    def terminate(self):
        """Close the HTTP session."""
        self.session.close()

    def ping(self):
        """Tell whether the provider was initialized less than TOKEN_EXPIRATION_TIME seconds ago."""
        return bool(self._started) and (time.time() - self._started) < TOKEN_EXPIRATION_TIME

    def server_url(self):
        """Return the base URL of the API on the current server."""
        return f'https://{self.server_hostname}/api/v1/'

    def login(self, is_retry=False):
        """Log in, then store and cache the returned token and server hostname.

        The session 'Authorization' header is set when the returned hostname starts with
        ``vip`` and removed otherwise. Nothing is stored when the answer is unusable.

        :param is_retry: forwarded to :meth:`checked` to prevent a second re-login on 401.
        :raise ProviderError: when no usable token / server can be read from the response.
        """
        r = self.checked(
            lambda: self.session.post(self.server_url() + 'login',
                                      json={"username": self.username, "password": self.password},
                                      allow_redirects=False,
                                      timeout=30),
            is_retry=is_retry)

        # checked() hands back None / False when no usable response could be obtained.
        if not r:
            raise ProviderError("Cannot get token from provider login response")

        try:
            payload = r.json()
            token = payload['token']
        except (ValueError, JSONDecodeError, KeyError, TypeError):
            log_request_response(r)
            raise ProviderError("Cannot get token from provider login response")

        try:
            server_hostname = payload['base_url']
        except (KeyError, TypeError):
            log_request_response(r)
            raise ProviderError("Cannot get server from provider login response")

        # Both values are concatenated / inspected as strings below.
        if not isinstance(token, str) or not token:
            log_request_response(r)
            raise ProviderError("Cannot get token from provider login response")
        if not isinstance(server_hostname, str) or not server_hostname:
            log_request_response(r)
            raise ProviderError("Cannot get server from provider login response")

        log_request_response(r, non_standard=False)

        self.token = token
        region.set("oscom_token", self.token)
        self.server_hostname = server_hostname
        region.set("oscom_server", self.server_hostname)

        if self.server_hostname.startswith('vip'):
            self.session.headers.update({'Authorization': 'Bearer ' + self.token})
        else:
            self.session.headers.pop('Authorization', None)

    @staticmethod
    def sanitize_external_ids(external_id):
        """Convert an external id such as ``'tt0123456'`` (or a numeric id) to an int.

        :param external_id: id as a string (optionally prefixed with ``t`` characters and
            zero padded) or as a number.
        :return: the numeric id; an all-zero id yields ``0``.
        :raise ValueError: when no number is left after removing the prefix.
        """
        if isinstance(external_id, str):
            # int() already ignores leading zeros, so only the letter prefix is removed.
            external_id = external_id.strip().lower().lstrip('t')
        return int(external_id)

    @staticmethod
    def _parse_year(value):
        """Return ``value`` as an int, or None when it is missing or not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    @region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
    def search_titles(self, title):
        """Look up the provider's feature id for ``title``.

        A result matches when its title equals the (naming-fixed) title, case-insensitively,
        and -- when ``self.video`` has a year -- its year equals that year. The first match wins.

        NOTE(review): the result is cached by ``region.cache_on_arguments`` while matching also
        depends on ``self.video``; confirm the cache key is specific enough.

        :param str title: title to search for.
        :return: the sanitized id of the first match, or None when nothing matches.
        :raise ProviderError: when the search request yields no usable response.
        """
        parameters = {'query': title.lower()}
        logger.debug(f'Searching using this title: {title}')

        results = self.retry(
            lambda: self.checked(
                lambda: self.session.get(self.server_url() + 'features', params=parameters, timeout=30),
                validate_json=True,
                json_key_name='data'
            ),
            amount=retry_amount
        )

        if not results:
            # Raise rather than return None so a transient failure is not cached as "no match".
            raise ProviderError('No usable response from provider while searching titles')

        # The naming fix depends on the kind of video only, so compute it once.
        if isinstance(self.video, Episode):
            wanted_title = fix_tv_naming(title).lower()
        else:
            wanted_title = fix_movie_naming(title).lower()
        wanted_year = getattr(self.video, 'year', None)

        title_id = None
        for result in results.json()['data']:
            attributes = result['attributes']
            if 'title' not in attributes:
                continue
            if wanted_title != (attributes['title'] or '').lower():
                continue
            # A missing or non-numeric year simply does not match a video that has a year.
            if not wanted_year or wanted_year == self._parse_year(attributes.get('year')):
                title_id = result['id']
                break

        if not title_id:
            logger.debug(f'No match found for {title}')
            return None

        logger.debug(f'Found this title ID: {title_id}')
        return self.sanitize_external_ids(title_id)

    def query(self, languages, video):
        """Search subtitles for ``video`` in ``languages``.

        The search uses the video's IMDB id when available, otherwise the id found by
        :meth:`search_titles`.

        :param languages: wanted languages (their ``basename`` and ``forced`` attributes are used).
        :param video: the Episode or Movie to search for.
        :return: list of :class:`OpenSubtitlesComSubtitle`; empty when the title is unknown or
            the provider gave no usable response.
        """
        self.video = video
        is_episode = isinstance(self.video, Episode)

        if self.use_hash:
            file_hash = self.video.hashes.get('opensubtitlescom')
            logger.debug(f'Searching using this hash: {file_hash}')
        else:
            file_hash = None

        title = self.video.series if is_episode else self.video.title

        imdb_id = None
        if is_episode and self.video.series_imdb_id:
            imdb_id = self.sanitize_external_ids(self.video.series_imdb_id)
        elif isinstance(self.video, Movie) and self.video.imdb_id:
            imdb_id = self.sanitize_external_ids(self.video.imdb_id)

        title_id = None
        if not imdb_id:
            title_id = self.search_titles(title)
            if not title_id:
                return []

        # the set removes duplicates, sorting keeps the request reproducible
        langs = ','.join(sorted({to_opensubtitlescom(lang.basename).lower() for lang in languages}))
        logger.debug(f'Searching for those languages: {langs}')

        ai_translated = 'include' if self.include_ai_translated else 'exclude'
        # Parameter pairs are kept in the same (alphabetical) order as before; pairs whose
        # value is None are passed through as they always were.
        if is_episode:
            params = (('ai_translated', ai_translated),
                      ('episode_number', self.video.episode),
                      ('imdb_id', imdb_id if not title_id else None),
                      ('languages', langs),
                      ('moviehash', file_hash),
                      ('parent_feature_id', title_id if title_id else None),
                      ('season_number', self.video.season))
        else:
            params = (('ai_translated', ai_translated),
                      ('id', title_id if title_id else None),
                      ('imdb_id', imdb_id if not title_id else None),
                      ('languages', langs),
                      ('moviehash', file_hash))

        # query the server
        res = self.retry(
            lambda: self.checked(
                lambda: self.session.get(self.server_url() + 'subtitles', params=params, timeout=30),
                validate_json=True,
                json_key_name='data'
            ),
            amount=retry_amount
        )

        if not res:
            # checked() returns None / False instead of a response on some failures.
            logger.debug('No usable response from provider while searching subtitles')
            return []

        data = res.json()['data']

        # filter out forced subtitles or not depending on the required languages
        forced_flags = [lang.forced for lang in languages]
        if all(forced_flags):  # only forced
            data = [x for x in data if x['attributes']['foreign_parts_only']]
        elif any(forced_flags):  # also forced
            pass
        else:  # not forced
            data = [x for x in data if not x['attributes']['foreign_parts_only']]

        logger.debug(f"Query returned {len(data)} subtitles")

        subtitles = []
        for item in data:
            attributes = item['attributes']

            # ignore AI translated subtitles
            if not self.include_ai_translated and attributes.get('ai_translated'):
                logger.debug("Skipping AI translated subtitles")
                continue

            # ignore machine translated subtitles
            if attributes.get('machine_translated'):
                logger.debug("Skipping machine translated subtitles")
                continue

            # entries without any file cannot be downloaded
            if not attributes['files']:
                continue

            feature_details = attributes['feature_details']

            subtitle = OpenSubtitlesComSubtitle(
                language=Language.fromietf(from_opensubtitlescom(attributes['language'])),
                forced=attributes['foreign_parts_only'],
                hearing_impaired=attributes['hearing_impaired'],
                page_link=attributes['url'],
                file_id=attributes['files'][0]['file_id'],
                releases=attributes['release'],
                uploader=attributes['uploader']['name'],
                title=feature_details['movie_name'],
                # a missing or non-numeric year becomes None instead of aborting the search
                year=self._parse_year(feature_details.get('year')),
                season=feature_details.get('season_number'),
                episode=feature_details.get('episode_number'),
                hash_matched=attributes.get('moviehash_match', False),
                imdb_match=bool(imdb_id)
            )
            subtitle.get_matches(self.video)
            subtitles.append(subtitle)

        return subtitles

    def list_subtitles(self, video, languages):
        """Return the subtitles found by :meth:`query` for ``video`` and ``languages``."""
        return self.query(languages, video)

    def download_subtitle(self, subtitle):
        """Request a download link for ``subtitle`` and store the downloaded content on it.

        ``subtitle.content`` is set to None when either request yields no usable response.
        """
        logger.info('Downloading subtitle %r', subtitle)

        if not self.token:
            # The bearer header below needs a token.
            self.login()

        def request_download_link():
            # Headers are built on every attempt so that a retry following a re-login
            # (see checked(), status 401) uses the fresh token.
            headers = {'Accept': 'application/json', 'Content-Type': 'application/json',
                       'Authorization': 'Bearer ' + self.token}
            return self.session.post(self.server_url() + 'download',
                                     json={'file_id': subtitle.file_id, 'sub_format': 'srt'},
                                     headers=headers,
                                     timeout=30)

        res = self.retry(
            lambda: self.checked(
                request_download_link,
                validate_json=True,
                json_key_name='link'
            ),
            amount=retry_amount
        )

        if not res:
            logger.debug(f'Could not get a download link for file ID {subtitle.file_id}')
            subtitle.content = None
            return

        subtitle.download_link = res.json()['link']

        r = self.retry(
            lambda: self.checked(
                lambda: self.session.get(subtitle.download_link, timeout=30),
                validate_content=True
            ),
            amount=retry_amount
        )

        if not r:
            logger.debug(f'Could not download subtitle from {subtitle.download_link}')
            subtitle.content = None
            return

        subtitle.content = fix_line_ending(r.content)

    @staticmethod
    def reset_token():
        """Remove the cached token and server hostname."""
        logger.debug('Authentication failed: clearing cache and attempting to login.')
        region.delete("oscom_token")
        region.delete("oscom_server")

    def checked(self, fn, raise_api_limit=False, validate_json=False, json_key_name=None, validate_content=False,
                is_retry=False):
        """Run :fn: and check the response status before returning it.

        :param fn: the function to make an API call to OpenSubtitles.com.
        :param raise_api_limit: if True, an APIThrottled raised by :fn: is not retried again.
        :param validate_json: test if response is valid json.
        :param json_key_name: test if returned json contain a specific key.
        :param validate_content: test if response have a content (used with download).
        :param is_retry: prevent additional retries with login endpoint.
        :return: the response; None when the call failed before a response was received or
            the server answered 500; False when ``validate_content`` is set and the response
            has no content.
        :raise ConfigurationError: status 400.
        :raise AuthenticationError: status 401 after a re-login was already attempted.
        :raise DownloadLimitExceeded: status 406.
        :raise TooManyRequests: status 429.
        :raise APIThrottled: status 502.
        :raise ProviderError: any other non-200 status, or invalid json.
        """
        try:
            response = fn()
        except APIThrottled:
            if not raise_api_limit:
                logger.info("API request limit hit, waiting and trying again once.")
                time.sleep(15)
                # Keep every validation option for the second attempt.
                return self.checked(fn, raise_api_limit=True, validate_json=validate_json,
                                    json_key_name=json_key_name, validate_content=validate_content,
                                    is_retry=is_retry)
            logger.debug('API request limit hit again, giving up on this call.')
            return None
        except (ConnectionError, Timeout, ReadTimeout) as error:
            # No response exists at this point; callers receive None, as they always did.
            logger.debug(f'Connection problem while calling provider, empty response: {error!r}')
            return None
        except Exception:
            logger.exception('Unhandled exception raised.')
            return None

        status_code = getattr(response, 'status_code', None)
        if status_code is None:
            # :fn: returned something that is not a response; hand it back untouched.
            return response

        if status_code == 400:
            try:
                message = response.json()['message']
            except (ValueError, JSONDecodeError, KeyError, TypeError):
                raise ProviderError('Invalid JSON returned by provider')
            log_request_response(response)
            raise ConfigurationError(message)
        elif status_code == 401:
            log_request_response(response)
            self.reset_token()
            if is_retry:
                raise AuthenticationError('Login failed')
            time.sleep(1)
            self.login(is_retry=True)
            # Return the outcome of the second attempt instead of discarding it.
            return self.checked(fn, raise_api_limit=raise_api_limit, validate_json=validate_json,
                                json_key_name=json_key_name, validate_content=validate_content, is_retry=True)
        elif status_code == 403:
            log_request_response(response)
            raise ProviderError("Bazarr API key seems to be in problem")
        elif status_code == 406:
            try:
                json_response = response.json()
                download_count = json_response['requests']
                remaining_download = json_response['remaining']
                quota_reset_time = json_response['reset_time']
            except (ValueError, JSONDecodeError, KeyError, TypeError):
                raise ProviderError('Invalid JSON returned by provider')
            log_request_response(response)
            raise DownloadLimitExceeded(f"Daily download limit reached. {download_count} subtitles have been "
                                        f"downloaded and {remaining_download} remaining subtitles can be "
                                        f"downloaded. Quota will be reset in {quota_reset_time}.")
        elif status_code == 410:
            log_request_response(response)
            raise ProviderError("Download as expired")
        elif status_code == 429:
            log_request_response(response)
            raise TooManyRequests()
        elif status_code == 500:
            logger.debug("Server side exception raised while downloading from opensubtitles.com website. They "
                         "should mitigate this soon.")
            return None
        elif status_code == 502:
            # this one should deal with Bad Gateway issue on their side.
            raise APIThrottled()
        elif 500 <= status_code <= 599:
            raise ProviderError(response.reason)

        if status_code != 200:
            log_request_response(response)
            raise ProviderError(f'Bad status code: {response.status_code}')

        if validate_json:
            try:
                json_test = response.json()
            except (ValueError, JSONDecodeError):
                raise ProviderError('Invalid JSON returned by provider')
            # Callers index the payload by key, so anything but a mapping is invalid too.
            if not isinstance(json_test, dict) or json_key_name not in json_test:
                raise ProviderError(f'Invalid JSON returned by provider: no {json_key_name} key in returned json.')

        if validate_content:
            if not hasattr(response, 'content'):
                logger.error('Download link returned no content attribute.')
                return False
            elif not response.content:
                logger.error(f'This download link returned empty content: {response.url}')
                return False

        return response
def log_request_response(response, non_standard=True):
    """Log a request/response pair at debug level with credentials redacted.

    The last 8 characters of the request 'Authorization' header and of the response
    ``token`` are replaced by ``x``; the request ``password`` is replaced by ``redacted``.
    Bodies that are not JSON never make this function raise: a non-JSON request body is
    replaced by a placeholder and a non-JSON response body is logged as plain text.

    :param response: the response to log; its ``request`` attribute is logged as well.
    :param non_standard: selects the introductory log line (non standard / standard response).
    """
    request = response.request

    # Build a redacted copy so the headers stored on the request are left untouched.
    redacted_request_headers = {}
    for name, value in request.headers.items():
        if isinstance(name, str) and name.lower() == 'authorization' and isinstance(value, str):
            value = value[:-8] + 8 * 'x'
        redacted_request_headers[name] = value

    redacted_request_body = None
    if request.body:
        try:
            redacted_request_body = json.loads(request.body)
        except (ValueError, TypeError):
            # The raw body is not logged: a password could not be redacted reliably.
            redacted_request_body = '<non-JSON request body omitted>'
        else:
            if isinstance(redacted_request_body, dict) and 'password' in redacted_request_body:
                redacted_request_body['password'] = 'redacted'

    try:
        redacted_response_body = json.loads(response.text)
    except (ValueError, TypeError):
        # e.g. an HTML error page: keep the text, it is what needs debugging.
        redacted_response_body = response.text
    else:
        if isinstance(redacted_response_body, dict) and isinstance(redacted_response_body.get('token'), str):
            redacted_response_body['token'] = redacted_response_body['token'][:-8] + 8 * 'x'

    if non_standard:
        logger.debug("opensubtitlescom returned a non standard response. Logging request/response for debugging "
                     "purpose.")
    else:
        logger.debug("opensubtitlescom returned a standard response. Logging request/response for debugging purpose.")
    logger.debug("Request URL: %s", request.url)
    logger.debug("Request Headers: %s", redacted_request_headers)
    logger.debug("Request Body: %s", json.dumps(redacted_request_body))
    logger.debug("Response Status Code: %s", response.status_code)
    logger.debug("Response Headers: %s", response.headers)
    logger.debug("Response Body: %s", json.dumps(redacted_response_body))