diff --git a/libs/subliminal_patch/providers/soustitreseu.py b/libs/subliminal_patch/providers/soustitreseu.py
new file mode 100644
index 000000000..454c1dfeb
--- /dev/null
+++ b/libs/subliminal_patch/providers/soustitreseu.py
@@ -0,0 +1,330 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import io
+import os
+import logging
+from urllib.parse import unquote
+from random import randint
+
+from zipfile import ZipFile, is_zipfile
+from rarfile import RarFile, is_rarfile
+
+from guessit import guessit
+from requests import Session
+import chardet
+from bs4 import NavigableString, UnicodeDammit
+from subzero.language import Language
+
+from subliminal_patch.providers import Provider
+from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
+from subliminal_patch.subtitle import Subtitle
+from subliminal_patch.score import get_scores, framerate_equal
+from subliminal.providers import ParserBeautifulSoup
+from subliminal.subtitle import sanitize, guess_matches, SUBTITLE_EXTENSIONS
+from subliminal.video import Episode, Movie
+from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
+
+logger = logging.getLogger(__name__)
+
+# trailing dot-separated tokens removed from a subtitle file name before it is handed to guessit
+_RELEASE_SUFFIXES = ('tag', 'en', 'fr')
+
+
+def _strip_release_suffixes(name):
+    """Return the lower-cased release name of an archive member.
+
+    The file extension is dropped, then every trailing ``.tag``/``.en``/``.fr`` token is removed.
+    Whole tokens are compared (``str.rstrip`` would strip a *set of characters* instead), so a
+    release name that merely ends with some of those letters is left intact.
+
+    :param str name: file name of the subtitle inside the archive.
+    :return: the cleaned release name.
+    :rtype: str
+
+    """
+    release = os.path.splitext(name)[0].lower()
+    stripped = True
+    while stripped:
+        stripped = False
+        for suffix in _RELEASE_SUFFIXES:
+            if release.endswith('.' + suffix):
+                release = release[:-(len(suffix) + 1)]
+                stripped = True
+    return release
+
+
+class SoustitreseuSubtitle(Subtitle):
+    """Sous-Titres.eu Subtitle."""
+    provider_name = 'soustitreseu'
+
+    def __init__(self, language, video, name, data, content, is_perfect_match):
+        self.language = language
+        self.srt_filename = name
+        self.release_info = name
+        self.page_link = None
+        self.download_link = None
+        self.data = data  # guessit result for the file name, consumed by get_matches
+        self.video = video
+        self.matches = None
+        self.content = content
+        self.hearing_impaired = None
+        self.is_perfect_match = is_perfect_match
+
+    @property
+    def id(self):
+        return self.srt_filename
+
+    def get_matches(self, video):
+        """Compute the properties of `video` matched by this subtitle.
+
+        :param video: the video to match against.
+        :return: the names of the matched properties.
+        :rtype: set
+
+        """
+        # the guess is dropped after the first call (see below): reuse the cached result
+        if self.data is None and self.matches is not None:
+            return self.matches
+
+        matches = set()
+
+        if self.is_perfect_match:
+            if isinstance(video, Episode):
+                matches.add('series')
+            else:
+                matches.add('title')
+
+        # guess additional info from data
+        matches |= guess_matches(video, self.data)
+
+        self.matches = matches
+        self.data = None  # keeping the guess around makes the subtitle object unpicklable
+        return matches
+
+    def guess_encoding(self):
+        """Guess encoding using chardet, falling back on bs4.
+
+        Overrides the default subtitle method to not include language-specific encodings guessing:
+        chardet encoding detection seems to yield better results.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raise ValueError: if neither chardet nor bs4 detects an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            a = UnicodeDammit(self.content)
+
+            logger.info("bs4 detected encoding: %s", a.original_encoding)
+
+            if not a.original_encoding:
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % self)
+            encoding = a.original_encoding
+
+        self._guessed_encoding = encoding
+        return encoding
+
+
+class SoustitreseuProvider(Provider, ProviderSubtitleArchiveMixin):
+    """Sous-Titres.eu Provider."""
+    subtitle_class = SoustitreseuSubtitle
+    languages = {Language(code) for code in ['fra', 'eng']}
+    server_url = 'https://www.sous-titres.eu/'
+    search_url = server_url + 'search.html'
+
+    def __init__(self):
+        self.session = None
+        self.is_perfect_match = False
+
+    def initialize(self):
+        self.session = Session()
+        self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
+        self.session.headers['Referer'] = self.server_url
+
+    def terminate(self):
+        if self.session is not None:
+            self.session.close()
+
+    def _get_soup(self, url, params=None):
+        """Fetch `url` and return the parsed page; an HTTP error status raises."""
+        r = self.session.get(url, params=params, timeout=10)
+        r.raise_for_status()
+        return ParserBeautifulSoup(r.content.decode('utf-8', 'ignore'), ['html.parser'])
+
+    def _find_pages(self, title, selector):
+        """Search the site for `title` and return the href of each result link matched by `selector`.
+
+        A link is kept when its text contains `title`, ignoring case. `is_perfect_match` is updated
+        to tell whether at least one link was kept.
+
+        """
+        soup = self._get_soup(self.search_url, params={'q': title})
+        wanted = title.lower()
+        pages = [item.attrs['href'] for item in soup.select(selector) if wanted in item.text.lower()]
+        self.is_perfect_match = bool(pages)
+        return pages
+
+    def _download_subtitles(self, video, page_link, download_link):
+        """Download the archive at `download_link` and return the subtitles it holds for `video`."""
+        res = self.session.get(download_link, timeout=10)
+        res.raise_for_status()
+
+        archive = self._get_archive(res.content)
+        if not archive:
+            return []
+
+        # extract the subtitles, making sure the archive handle is released
+        try:
+            subtitles = self._get_subtitle_from_archive(archive, video)
+        finally:
+            archive.close()
+
+        for subtitle in subtitles:
+            subtitle.page_link = page_link
+            subtitle.download_link = download_link
+        return subtitles
+
+    @staticmethod
+    def _parse_episodenum(item):
+        """Read the season/episode label of an archive link.
+
+        :return: ``(season, episode)``, `episode` being ``None`` when the label only holds a season,
+            or ``None`` when the label is missing or cannot be parsed.
+
+        """
+        label = item.select_one('.episodenum')
+        if label is None:
+            return None
+        text = label.text.strip()
+
+        try:
+            season, episode = text.split('×')
+            return int(season), int(episode)
+        except ValueError:
+            pass
+
+        # no "season×episode" pair: the season number follows a one-character prefix
+        try:
+            return int(text[1:]), None
+        except ValueError:
+            logger.debug('Sous-Titres.eu: Cannot parse episode number %r', text)
+            return None
+
+    def query_series(self, video, title):
+        """Return the subtitles found for the episode `video` when searching the series `title`."""
+        subtitles = []
+
+        for series_page in self._find_pages(title, '.serie > h3 > a'):
+            page_link = self.server_url + series_page
+            soup = self._get_soup(page_link)
+
+            for item in soup.select('a.subList'):
+                numbering = self._parse_episodenum(item)
+                if numbering is None:
+                    continue
+                season, episode = numbering
+
+                if episode is None:
+                    # season-only label: rely on the episode guessed from the archive name, if any
+                    archive_name = unquote(os.path.splitext(item.attrs['href'].split('/')[-1])[0])
+                    episode = guessit(archive_name, {'type': 'episode'}).get('episode')
+
+                # an archive without episode number is kept as long as the season matches
+                if season != video.season or (episode is not None and episode != video.episode):
+                    continue
+
+                download_link = self.server_url + 'series/' + item.attrs['href']
+                subtitles += self._download_subtitles(video, page_link, download_link)
+
+        return subtitles
+
+    def query_movies(self, video, title):
+        """Return the subtitles found for the movie `video` when searching `title`."""
+        subtitles = []
+
+        for movies_page in self._find_pages(title, '.film > h3 > a'):
+            page_link = self.server_url + movies_page
+            soup = self._get_soup(page_link)
+
+            for item in soup.select('a.subList'):
+                download_link = self.server_url + 'films/' + item.attrs['href']
+                subtitles += self._download_subtitles(video, page_link, download_link)
+
+        return subtitles
+
+    def list_subtitles(self, video, languages):
+        if isinstance(video, Episode):
+            titles = [video.series] + video.alternative_series
+            query = self.query_series
+        else:
+            titles = [video.title] + video.alternative_titles
+            query = self.query_movies
+
+        subtitles = []
+        # query for subtitles
+        for title in titles:
+            subtitles += [s for s in query(video, title) if s.language in languages]
+
+        return subtitles
+
+    def download_subtitle(self, subtitle):
+        # nothing to fetch: the content is read from the archive when the subtitle is listed
+        return subtitle
+
+    def _get_archive(self, content):
+        """Open `content` as a rar or zip archive; return ``None`` for any other format."""
+        archive_stream = io.BytesIO(content)
+        if is_rarfile(archive_stream):
+            logger.debug('Sous-Titres.eu: Identified rar archive')
+            return RarFile(archive_stream)
+        if is_zipfile(archive_stream):
+            logger.debug('Sous-Titres.eu: Identified zip archive')
+            return ZipFile(archive_stream)
+
+        logger.error('Sous-Titres.eu: Unsupported compressed format')
+        return None
+
+    def _get_subtitle_from_archive(self, archive, video):
+        """Build a subtitle for each archive member that is a subtitle file matching `video`."""
+        subtitles = []
+
+        # some files have a non subtitle with .txt extension
+        subtitle_extensions = tuple(ext for ext in SUBTITLE_EXTENSIONS if ext != '.txt')
+
+        for name in archive.namelist():
+            # discard hidden files
+            if os.path.split(name)[-1].startswith('.'):
+                continue
+
+            # discard non-subtitle files
+            lowered = name.lower()
+            if not lowered.endswith(subtitle_extensions):
+                continue
+
+            # get subtitles language
+            if '.en.' in lowered:
+                language = Language.fromopensubtitles('eng')
+            else:
+                language = Language.fromopensubtitles('fre')
+
+            guess = guessit(_strip_release_suffixes(name))
+            if isinstance(video, Episode):
+                # .get(): a name without season/episode is a mismatch, not an error
+                if video.episode != guess.get('episode') or video.season != guess.get('season'):
+                    continue
+
+            content = archive.read(name)
+            subtitles.append(SoustitreseuSubtitle(language, video, name, guess, content, self.is_perfect_match))
+
+        return subtitles
diff --git a/views/settingsproviders.html b/views/settingsproviders.html
index ffb8d1c06..e45658bab 100644
--- a/views/settingsproviders.html
+++ b/views/settingsproviders.html
@@ -381,6 +381,18 @@
+
+
+ Sous-Titres.eu +
+
+ +
+
+
Subdivx