Merge pull request #586 from ngosang/feature/subdivx5

Improve Subdivx provider, handle more exceptions
2019-10-05 10:00:13 -04:00 · 2019-10-05 10:00:13 -04:00 · 4e5e7d6744
parent 9ca305740b 69f23c65a8
commit 4e5e7d6744
4 changed files with 92 additions and 78 deletions
--- a/bazarr/get_providers.py
+++ b/bazarr/get_providers.py
@ -8,10 +8,11 @@ import time

 from get_args import args
 from config import settings
-from subliminal_patch.exceptions import TooManyRequests, APIThrottled
+from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ParseResponseError
 from subliminal.exceptions import DownloadLimitExceeded, ServiceUnavailable

-VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled)
+VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled,
+                             ParseResponseError)
 VALID_COUNT_EXCEPTIONS = ('TooManyRequests', 'ServiceUnavailable', 'APIThrottled')

 PROVIDER_THROTTLE_MAP = {
@ -20,6 +21,7 @@ PROVIDER_THROTTLE_MAP = {
        DownloadLimitExceeded: (datetime.timedelta(hours=3), "3 hours"),
        ServiceUnavailable: (datetime.timedelta(minutes=20), "20 minutes"),
        APIThrottled: (datetime.timedelta(minutes=10), "10 minutes"),
+        ParseResponseError: (datetime.timedelta(hours=6), "6 hours"),
    },
    "opensubtitles": {
        TooManyRequests: (datetime.timedelta(hours=3), "3 hours"),
--- a/libs/subliminal_patch/core.py
+++ b/libs/subliminal_patch/core.py
@ -28,7 +28,7 @@ from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter,
 from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
 from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
    ThreadPoolExecutor, check_video
-from subliminal_patch.exceptions import TooManyRequests, APIThrottled
+from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ParseResponseError

 from subzero.language import Language
 from scandir import scandir, scandir_generic as _scandir_generic
@ -280,7 +280,7 @@ class SZProviderPool(ProviderPool):
                logger.debug("RAR Traceback: %s", traceback.format_exc())
                return False

-            except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
+            except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled, ParseResponseError) as e:
                self.throttle_callback(subtitle.provider_name, e)
                self.discarded_providers.add(subtitle.provider_name)
                return False
--- a/libs/subliminal_patch/exceptions.py
+++ b/libs/subliminal_patch/exceptions.py
@ -9,3 +9,8 @@ class TooManyRequests(ProviderError):

 class APIThrottled(ProviderError):
    pass
+
+
+class ParseResponseError(ProviderError):
+    """Exception raised by providers when they are not able to parse the response."""
+    pass
--- a/libs/subliminal_patch/providers/subdivx.py
+++ b/libs/subliminal_patch/providers/subdivx.py
@ -7,13 +7,14 @@ import zipfile

 import rarfile
 from subzero.language import Language
-from guessit import guessit
 from requests import Session

 from subliminal import __short_version__
+from subliminal.exceptions import ServiceUnavailable
 from subliminal.providers import ParserBeautifulSoup, Provider
 from subliminal.subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending,guess_matches
 from subliminal.video import Episode, Movie
+from subliminal_patch.exceptions import ParseResponseError

 logger = logging.getLogger(__name__)

@ -119,35 +120,17 @@ class SubdivxSubtitlesProvider(Provider):
        language = self.language_list[0]
        search_link = self.server_url + 'index.php'
        while True:
-            r = self.session.get(search_link, params=params, timeout=10)
-            r.raise_for_status()
+            response = self.session.get(search_link, params=params, timeout=10)
+            self._check_response(response)

-            if not r.content:
-                logger.debug('No data returned from provider')
-                return []
+            try:
+                page_subtitles = self._parse_subtitles_page(response, language)
+            except Exception as e:
+                raise ParseResponseError('Error parsing subtitles list: ' + str(e))

-            page_soup = ParserBeautifulSoup(r.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
-            title_soups = page_soup.find_all("div", {'id': 'menu_detalle_buscador'})
-            body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})
-            if len(title_soups) != len(body_soups):
-                logger.debug('Error in provider data')
-                return []
-            for subtitle in range(0, len(title_soups)):
-                title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]
+            subtitles += page_subtitles

-                # title
-                title = title_soup.find("a").text.replace("Subtitulo de ", "")
-                page_link = title_soup.find("a")["href"].replace('http://', 'https://')
-
-                # body
-                description = body_soup.find("div", {'id': 'buscador_detalle_sub'}).text
-
-                subtitle = self.subtitle_class(language, page_link, description, title)
-
-                logger.debug('Found subtitle %r', subtitle)
-                subtitles.append(subtitle)
-
-            if len(title_soups) >= 20:
+            if len(page_subtitles) >= 20:
                params['pg'] += 1  # search next page
                time.sleep(self.multi_result_throttle)
            else:
@ -175,58 +158,82 @@ class SubdivxSubtitlesProvider(Provider):

        return subtitles

-    def get_download_link(self, subtitle):
-        r = self.session.get(subtitle.page_link, timeout=10)
-        r.raise_for_status()
-
-        if r.content:
-            page_soup = ParserBeautifulSoup(r.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
-            links_soup = page_soup.find_all("a", {'class': 'detalle_link'})
-            for link_soup in links_soup:
-                if link_soup['href'].startswith('bajar'):
-                    return self.server_url + link_soup['href']
-
-        logger.debug('No data returned from provider')
-        return None
-
    def download_subtitle(self, subtitle):
        if isinstance(subtitle, SubdivxSubtitle):
            # download the subtitle
            logger.info('Downloading subtitle %r', subtitle)

            # get download link
-            download_link = self.get_download_link(subtitle)
-            r = self.session.get(download_link, headers={'Referer': subtitle.page_link}, timeout=30)
-            r.raise_for_status()
+            download_link = self._get_download_link(subtitle)

-            if not r.content:
-                logger.debug('Unable to download subtitle. No data returned from provider')
-                return
+            # download zip / rar file with the subtitle
+            response = self.session.get(download_link, headers={'Referer': subtitle.page_link}, timeout=30)
+            self._check_response(response)

-            archive = _get_archive(r.content)
+            # open the compressed archive
+            archive = self._get_archive(response.content)

-            subtitle_content = _get_subtitle_from_archive(archive)
-            if subtitle_content:
+            # extract the subtitle
+            subtitle_content = self._get_subtitle_from_archive(archive)
            subtitle.content = fix_line_ending(subtitle_content)
-            else:
-                logger.debug('Could not extract subtitle from %r', archive)

+    def _check_response(self, response):
+        if response.status_code != 200:
+            raise ServiceUnavailable('Bad status code: ' + str(response.status_code))

-def _get_archive(content):
+    def _parse_subtitles_page(self, response, language):
+        subtitles = []
+
+        page_soup = ParserBeautifulSoup(response.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
+        title_soups = page_soup.find_all("div", {'id': 'menu_detalle_buscador'})
+        body_soups = page_soup.find_all("div", {'id': 'buscador_detalle'})
+
+        for subtitle in range(0, len(title_soups)):
+            title_soup, body_soup = title_soups[subtitle], body_soups[subtitle]
+
+            # title
+            title = title_soup.find("a").text.replace("Subtitulo de ", "")
+            page_link = title_soup.find("a")["href"].replace('http://', 'https://')
+
+            # body
+            description = body_soup.find("div", {'id': 'buscador_detalle_sub'}).text
+
+            subtitle = self.subtitle_class(language, page_link, description, title)
+
+            logger.debug('Found subtitle %r', subtitle)
+            subtitles.append(subtitle)
+
+        return subtitles
+
+    def _get_download_link(self, subtitle):
+        response = self.session.get(subtitle.page_link, timeout=10)
+        self._check_response(response)
+        try:
+            page_soup = ParserBeautifulSoup(response.content.decode('iso-8859-1', 'ignore'), ['lxml', 'html.parser'])
+            links_soup = page_soup.find_all("a", {'class': 'detalle_link'})
+            for link_soup in links_soup:
+                if link_soup['href'].startswith('bajar'):
+                    return self.server_url + link_soup['href']
+        except Exception as e:
+            raise ParseResponseError('Error parsing download link: ' + str(e))
+
+        raise ParseResponseError('Download link not found')
+
+    def _get_archive(self, content):
        # open the archive
        archive_stream = io.BytesIO(content)
-    archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug('Identified rar archive')
            archive = rarfile.RarFile(archive_stream)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug('Identified zip archive')
            archive = zipfile.ZipFile(archive_stream)
+        else:
+            raise ParseResponseError('Unsupported compressed format')

        return archive

-
-def _get_subtitle_from_archive(archive):
+    def _get_subtitle_from_archive(self, archive):
        for name in archive.namelist():
            # discard hidden files
            if os.path.split(name)[-1].startswith('.'):
@ -238,4 +245,4 @@ def _get_subtitle_from_archive(archive):

            return archive.read(name)

-    return None
+        raise ParseResponseError('Can not find the subtitle in the compressed file')