Added Sous-Titres.eu provider.

2024-12-25 17:17:12 +00:00 · 2020-07-21 00:28:34 -04:00 · 2020-07-21 00:28:34 -04:00 · d7873470fd
commit d7873470fd
parent d8500a57ba
2 changed files with 309 additions and 0 deletions
--- a/libs/subliminal_patch/providers/soustitreseu.py
+++ b/libs/subliminal_patch/providers/soustitreseu.py
@ -0,0 +1,297 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import io
+import os
+import logging
+from urllib.parse import unquote
+from random import randint
+
+from zipfile import ZipFile, is_zipfile
+from rarfile import RarFile, is_rarfile
+
+from guessit import guessit
+from requests import Session
+import chardet
+from bs4 import NavigableString, UnicodeDammit
+from subzero.language import Language
+
+from subliminal_patch.providers import Provider
+from subliminal_patch.providers.mixins import ProviderSubtitleArchiveMixin
+from subliminal_patch.subtitle import Subtitle
+from subliminal_patch.score import get_scores, framerate_equal
+from subliminal.providers import ParserBeautifulSoup
+from subliminal.subtitle import sanitize, guess_matches, SUBTITLE_EXTENSIONS
+from subliminal.video import Episode, Movie
+from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
+
+# Module-level logger used by both the subtitle and the provider classes below.
+logger = logging.getLogger(__name__)
+
+
+class SoustitreseuSubtitle(Subtitle):
+    """Sous-Titres.eu Subtitle.
+
+    Represents one subtitle file that has already been read out of a
+    downloaded archive: the raw bytes are handed in as ``content``.
+    """
+    provider_name = 'soustitreseu'
+
+    def __init__(self, language, video, name, data, content, is_perfect_match):
+        """Store the subtitle attributes.
+
+        The base class initializer is not invoked here; every attribute used
+        by this class is assigned explicitly below.
+
+        :param language: language of the subtitle.
+        :param video: the video the subtitle was searched for.
+        :param name: file name inside the archive; used as id and release info.
+        :param data: guess data for the file name, consumed by :meth:`get_matches`.
+        :param content: raw subtitle bytes.
+        :param is_perfect_match: whether the title search found the queried title.
+        """
+        self.language = language
+        self.srt_filename = name
+        self.release_info = name
+        self.page_link = None
+        self.download_link = None
+        self.data = data
+        self.video = video
+        self.matches = None
+        self.content = content
+        self.hearing_impaired = None
+        self.is_perfect_match = is_perfect_match
+        # guess_encoding() reads this cache; set it here because the base
+        # initializer (which would normally provide it) is not called.
+        self._guessed_encoding = None
+
+    @property
+    def id(self):
+        """Identifier of the subtitle: the file name inside the archive."""
+        return self.srt_filename
+
+    def get_matches(self, video):
+        """Compute the set of match names between this subtitle and *video*.
+
+        :param video: the video to match against.
+        :return: the match names.
+        :rtype: set
+        """
+        matches = set()
+
+        if self.is_perfect_match:
+            # the title search hit the queried title, so credit the name match
+            matches.add('series' if isinstance(video, Episode) else 'title')
+
+        # guess additional info from data
+        matches |= guess_matches(video, self.data)
+
+        self.matches = matches
+        # Drop the guess data once consumed: per the original author's note,
+        # keeping it makes the subtitle object unpicklable.
+        self.data = None
+        return matches
+
+    def guess_encoding(self):
+        """Guess encoding using chardet.
+
+        Overrides the default ``guess_encoding`` so that no language-specific
+        encoding guessing is attempted; chardet detection seems to yield
+        better results. Falls back on bs4's ``UnicodeDammit`` when chardet
+        finds nothing. The result is cached on the instance.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raise ValueError: if neither detector finds an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            dammit = UnicodeDammit(self.content)
+            encoding = dammit.original_encoding
+
+            logger.info("bs4 detected encoding: %s", encoding)
+
+            if not encoding:
+                # format the message explicitly: exceptions do not interpolate
+                # extra arguments the way logging calls do
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))
+
+        self._guessed_encoding = encoding
+        return encoding
+
+
+class SoustitreseuProvider(Provider, ProviderSubtitleArchiveMixin):
+    """Sous-Titres.eu Provider.
+
+    Searches the site by title, follows every matching series/movie page and
+    downloads the archives linked from it. Subtitle contents are read from the
+    archives while listing, so :meth:`download_subtitle` has nothing left to do.
+    """
+    subtitle_class = SoustitreseuSubtitle
+    languages = {Language(code) for code in ['fra', 'eng']}
+    server_url = 'https://www.sous-titres.eu/'
+    search_url = server_url + 'search.html'
+
+    def __init__(self):
+        self.session = None
+        # set by the last title search; passed on to every subtitle it yields
+        self.is_perfect_match = False
+
+    def initialize(self):
+        """Open the HTTP session with a random user agent and the site as referer."""
+        self.session = Session()
+        self.session.headers['User-Agent'] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]
+        self.session.headers['Referer'] = self.server_url
+
+    def terminate(self):
+        """Close the HTTP session."""
+        self.session.close()
+
+    def _fetch_soup(self, url, params=None):
+        """GET *url* and return the parsed HTML; raise on an HTTP error status."""
+        response = self.session.get(url, params=params, timeout=10)
+        response.raise_for_status()
+        return ParserBeautifulSoup(response.content.decode('utf-8', 'ignore'), ['html.parser'])
+
+    def _matching_links(self, soup, selector, title):
+        """Return the hrefs of search results whose text contains *title*.
+
+        The comparison is case-insensitive for both series and movies. Also
+        updates ``is_perfect_match``: True when at least one result matched.
+        """
+        wanted = title.lower()
+        links = [link.attrs['href'] for link in soup.select(selector)
+                 if wanted in link.text.lower()]
+        self.is_perfect_match = bool(links)
+        return links
+
+    @staticmethod
+    def _parse_episode_label(label):
+        """Parse the text of an ``.episodenum`` element.
+
+        :param str label: either ``<season>×<episode>`` or a one-character
+            prefix followed by the season number (presumably season packs,
+            e.g. ``S01`` - confirm against the site markup).
+        :return: ``(season, episode)``; ``episode`` is None for the second form.
+        :rtype: tuple
+        :raise ValueError: if the label matches neither form.
+        """
+        parts = label.split('×')
+        if len(parts) == 2:
+            try:
+                return int(parts[0]), int(parts[1])
+            except ValueError:
+                pass  # fall through to the season-only form
+        return int(label[1:]), None
+
+    def _archive_matches_episode(self, item, video):
+        """Tell whether the archive link *item* is for the season/episode of *video*.
+
+        The season (and episode, when present) come from the ``.episodenum``
+        label; when the label carries no episode, the episode guessed from the
+        archive file name is used, and an archive with no episode at all is
+        accepted on season alone.
+        """
+        label = item.select_one('.episodenum')
+        if label is None:
+            logger.debug('Sous-Titres.eu: No episode label for %s', item.attrs.get('href'))
+            return False
+
+        try:
+            season, episode = self._parse_episode_label(label.text)
+        except ValueError:
+            logger.debug('Sous-Titres.eu: Unable to parse episode label %r', label.text)
+            return False
+
+        if season != video.season:
+            return False
+
+        if episode is None:
+            # archive file name without its 4-character extension
+            archive_name = unquote(item.attrs['href'].split('/')[-1][:-4])
+            episode = guessit(archive_name, {'type': 'episode'}).get('episode')
+
+        return episode is None or episode == video.episode
+
+    def _subtitles_from_download(self, video, page_link, download_link):
+        """Download one archive and return its subtitles with both links set."""
+        response = self.session.get(download_link, timeout=10)
+        response.raise_for_status()
+
+        archive = self._get_archive(response.content)
+        if not archive:
+            return []
+
+        # extract the subtitles
+        found = self._get_subtitle_from_archive(archive, video)
+        for subtitle in found:
+            subtitle.page_link = page_link
+            subtitle.download_link = download_link
+        return found
+
+    def query_series(self, video, title):
+        """Return the subtitles found for the episode *video* when searching *title*.
+
+        :param video: the episode to find subtitles for.
+        :param str title: series name to search.
+        :return: subtitles in every language found.
+        :rtype: list
+        """
+        subtitles = []
+        search_page = self._fetch_soup(self.search_url, params={'q': title})
+
+        for series_page in self._matching_links(search_page, '.serie > h3 > a', title):
+            page_link = self.server_url + series_page
+            series_soup = self._fetch_soup(page_link)
+
+            for item in series_soup.select('a.subList'):
+                if not self._archive_matches_episode(item, video):
+                    continue
+                download_link = self.server_url + 'series/' + item.attrs['href']
+                subtitles.extend(self._subtitles_from_download(video, page_link, download_link))
+
+        return subtitles
+
+    def query_movies(self, video, title):
+        """Return the subtitles found for the movie *video* when searching *title*.
+
+        :param video: the movie to find subtitles for.
+        :param str title: movie title to search.
+        :return: subtitles in every language found.
+        :rtype: list
+        """
+        subtitles = []
+        search_page = self._fetch_soup(self.search_url, params={'q': title})
+
+        for movie_page in self._matching_links(search_page, '.film > h3 > a', title):
+            page_link = self.server_url + movie_page
+            movie_soup = self._fetch_soup(page_link)
+
+            for item in movie_soup.select('a.subList'):
+                download_link = self.server_url + 'films/' + item.attrs['href']
+                subtitles.extend(self._subtitles_from_download(video, page_link, download_link))
+
+        return subtitles
+
+    def list_subtitles(self, video, languages):
+        """List subtitles for *video* in *languages*, trying every known title.
+
+        :param video: the episode or movie to find subtitles for.
+        :param languages: the wanted languages.
+        :return: the matching subtitles.
+        :rtype: list
+        """
+        if isinstance(video, Episode):
+            query = self.query_series
+            titles = [video.series] + list(video.alternative_series or [])
+        else:
+            query = self.query_movies
+            titles = [video.title] + list(video.alternative_titles or [])
+
+        subtitles = []
+        # query for subtitles
+        for title in titles:
+            if not title:
+                # nothing to search for
+                continue
+            subtitles.extend(s for s in query(video, title) if s.language in languages)
+
+        return subtitles
+
+    def download_subtitle(self, subtitle):
+        """Return *subtitle* unchanged: its content was read while listing."""
+        return subtitle
+
+    def _get_archive(self, content):
+        """Open *content* as a rar or zip archive.
+
+        :param bytes content: the downloaded archive.
+        :return: the opened archive, or None for an unsupported format.
+        """
+        # open the archive
+        archive_stream = io.BytesIO(content)
+        if is_rarfile(archive_stream):
+            logger.debug('Sous-Titres.eu: Identified rar archive')
+            return RarFile(archive_stream)
+        if is_zipfile(archive_stream):
+            logger.debug('Sous-Titres.eu: Identified zip archive')
+            return ZipFile(archive_stream)
+
+        logger.error('Sous-Titres.eu: Unsupported compressed format')
+        return None
+
+    @staticmethod
+    def _release_from_filename(filename):
+        """Return the release name of a lower-cased subtitle *filename*.
+
+        Drops the extension, then any trailing ``.tag``, ``.en`` and ``.fr``
+        markers. Whole suffixes are removed (``str.rstrip`` would strip a set
+        of characters and eat into the release name itself).
+        """
+        release = os.path.splitext(filename)[0]
+        stripped = True
+        while stripped:
+            stripped = False
+            for marker in ('.tag', '.en', '.fr'):
+                if release.endswith(marker):
+                    release = release[:-len(marker)]
+                    stripped = True
+        return release
+
+    def _get_subtitle_from_archive(self, archive, video):
+        """Build a subtitle for every suitable subtitle file in *archive*.
+
+        Hidden files and non-subtitle files are skipped; for an episode, so are
+        files whose guessed season/episode differ from those of *video*.
+
+        :param archive: the opened archive.
+        :param video: the video the subtitles are searched for.
+        :return: the extracted subtitles.
+        :rtype: list
+        """
+        subtitles = []
+
+        # some files have a non subtitle with .txt extension
+        subtitle_extensions = tuple(ext for ext in SUBTITLE_EXTENSIONS if ext != '.txt')
+
+        for name in archive.namelist():
+            # discard hidden files
+            if os.path.split(name)[-1].startswith('.'):
+                continue
+
+            lowered = name.lower()
+
+            # discard non-subtitle files
+            if not lowered.endswith(subtitle_extensions):
+                continue
+
+            # get subtitles language: French unless the name carries '.en.'
+            if '.en.' in lowered:
+                language = Language.fromopensubtitles('eng')
+            else:
+                language = Language.fromopensubtitles('fre')
+
+            guess = guessit(self._release_from_filename(lowered))
+            if isinstance(video, Episode):
+                # .get(): the guess may hold no season/episode at all
+                if video.episode != guess.get('episode') or video.season != guess.get('season'):
+                    continue
+
+            content = archive.read(name)
+            subtitles.append(SoustitreseuSubtitle(language, video, name, guess, content, self.is_perfect_match))
+
+        return subtitles
--- a/views/settingsproviders.html
+++ b/views/settingsproviders.html
@ -381,6 +381,18 @@
                </div>
            </div>

+            <!-- Sous-Titres.eu provider checkbox; the input id matches the provider name 'soustitreseu' -->
+            <div class="row">
+                <div class="col-sm-3 text-right">
+                    <b>Sous-Titres.eu</b>
+                </div>
+                <div class="form-group col-sm-8">
+                    <label class="custom-control custom-checkbox">
+                        <input type="checkbox" class="custom-control-input provider" id="soustitreseu">
+                        <span class="custom-control-label">Mostly French Subtitles Provider.</span>
+                    </label>
+                </div>
+            </div>
+
            <div class="row">
                <div class="col-sm-3 text-right">
                    <b>Subdivx</b>