From 71fe2b7a1ddc832ac903bdfcdc8c4e80e84e32db Mon Sep 17 00:00:00 2001 From: Vitiko Date: Wed, 19 Oct 2022 01:17:54 -0400 Subject: [PATCH 1/5] Subdivx Provider: improve episode queries --- libs/subliminal_patch/providers/subdivx.py | 14 ++++++++++++++ tests/subliminal_patch/test_subdivx.py | 13 +++++++++++++ 2 files changed, 27 insertions(+) diff --git a/libs/subliminal_patch/providers/subdivx.py b/libs/subliminal_patch/providers/subdivx.py index 5be08e45e..0d1752dd7 100644 --- a/libs/subliminal_patch/providers/subdivx.py +++ b/libs/subliminal_patch/providers/subdivx.py @@ -63,6 +63,8 @@ class SubdivxSubtitle(Subtitle): # episode if isinstance(video, Episode): # already matched in search query + + # TODO: avoid false positive with some short/common titles matches.update(["title", "series", "season", "episode", "year"]) # movie @@ -106,11 +108,23 @@ class SubdivxSubtitlesProvider(Provider): subtitles = [] if isinstance(video, Episode): + # TODO: cache pack queries (TV SHOW S01 / TV SHOW 2022 S01). + # Too many redundant server calls. + for query in ( f"{video.series} S{video.season:02}E{video.episode:02}", f"{video.series} S{video.season:02}", ): subtitles += self._handle_multi_page_search(query, video) + + # Try with year + if len(subtitles) <= 5 and video.year: + logger.debug("Few results. Trying with year") + for query in ( + f"{video.series} {video.year} S{video.season:02}E{video.episode:02}", + f"{video.series} {video.year} S{video.season:02}", + ): + subtitles += self._handle_multi_page_search(query, video) else: for query in (video.title, f"{video.title} ({video.year})"): subtitles += self._handle_multi_page_search(query, video) diff --git a/tests/subliminal_patch/test_subdivx.py b/tests/subliminal_patch/test_subdivx.py index ae9676aee..bc932d494 100644 --- a/tests/subliminal_patch/test_subdivx.py +++ b/tests/subliminal_patch/test_subdivx.py @@ -52,6 +52,19 @@ def test_list_subtitles_castillian_spanish(episodes): assert provider.list_subtitles(item, {Language.fromietf("es")}) +def test_list_subtitles_episode_with_year(episodes): + item = list(episodes.values())[0] + item.series = "The Bear" + item.name = "The Bear" + item.season = 1 + item.episode = 1 + item.year = 2022 + + with SubdivxSubtitlesProvider() as provider: + subtitles = provider.list_subtitles(item, {Language("spa", "MX")}) + assert len(subtitles) > 2 + + def test_download_subtitle(movies): subtitle = SubdivxSubtitle( Language("spa", "MX"), From 2f8814dcf801552ceb4be5c5a98c93a22799695c Mon Sep 17 00:00:00 2001 From: Vitiko Date: Wed, 19 Oct 2022 20:06:00 -0400 Subject: [PATCH 2/5] Subdivx Provider: improve series matches --- libs/subliminal_patch/providers/subdivx.py | 84 +++++++++++++--------- tests/subliminal_patch/test_subdivx.py | 7 +- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/libs/subliminal_patch/providers/subdivx.py b/libs/subliminal_patch/providers/subdivx.py index 0d1752dd7..e0cae126e 100644 --- a/libs/subliminal_patch/providers/subdivx.py +++ b/libs/subliminal_patch/providers/subdivx.py @@ -25,9 +25,10 @@ _CLEAN_TITLE_RES = [ (r"´|`", "'"), (r" {2,}", " "), ] -_SPANISH_RE = re.compile(r"españa|ib[eé]rico|castellano|gallego|castilla") +_SPANISH_RE = re.compile(r"españa|ib[eé]rico|castellano|gallego|castilla") _YEAR_RE = re.compile(r"(\(\d{4}\))") +_SERIES_RE = re.compile(r"\(?\d{4}\)?|[sS]\d{2}([eE]\d{2})?") logger = logging.getLogger(__name__) @@ -47,11 +48,13 @@ class SubdivxSubtitle(Subtitle): self.download_url = download_url self.uploader = uploader - self.release_info = str(title) - self.description = str(description).strip() + self._title = str(title).strip() + self._description = str(description).strip() - if self.description: - self.release_info += " | " + self.description + self.release_info = self._title + + if self._description: + self.release_info += " | " + self._description @property def id(self): @@ -62,20 +65,18 @@ class SubdivxSubtitle(Subtitle): # episode if isinstance(video, Episode): - # already matched in search query - - # TODO: avoid false positive with some short/common titles + # already matched within provider matches.update(["title", "series", "season", "episode", "year"]) # movie elif isinstance(video, Movie): - # already matched in search query + # already matched within provider matches.update(["title", "year"]) - update_matches(matches, video, self.description) + update_matches(matches, video, self._description) # Don't lowercase; otherwise it will match a lot of false positives - if video.release_group and video.release_group in self.description: + if video.release_group and video.release_group in self._description: matches.add("release_group") return matches @@ -108,7 +109,7 @@ class SubdivxSubtitlesProvider(Provider): subtitles = [] if isinstance(video, Episode): - # TODO: cache pack queries (TV SHOW S01 / TV SHOW 2022 S01). + # TODO: cache pack queries (TV SHOW S01). # Too many redundant server calls. for query in ( @@ -117,14 +118,10 @@ class SubdivxSubtitlesProvider(Provider): ): subtitles += self._handle_multi_page_search(query, video) - # Try with year - if len(subtitles) <= 5 and video.year: - logger.debug("Few results. Trying with year") - for query in ( - f"{video.series} {video.year} S{video.season:02}E{video.episode:02}", - f"{video.series} {video.year} S{video.season:02}", - ): - subtitles += self._handle_multi_page_search(query, video) + # Try only with series title + if len(subtitles) <= 5: + subtitles += self._handle_multi_page_search(video.series, video, 1) + else: for query in (video.title, f"{video.title} ({video.year})"): subtitles += self._handle_multi_page_search(query, video) @@ -149,20 +146,25 @@ class SubdivxSubtitlesProvider(Provider): max_loops_not_met = True while max_loops_not_met: - loops += 1 max_loops_not_met = loops < max_loops - page_subtitles = self._get_page_subtitles(params, video) + page_subtitles, last_page = self._get_page_subtitles(params, video) - logger.debug("Yielding %d subtitles", len(page_subtitles)) + logger.debug("Yielding %d subtitles [loop #%d]", len(page_subtitles), loops) yield from page_subtitles - if len(page_subtitles) < 100: - break # this is the last page + if last_page: + logger.debug("Last page for '%s' query. Breaking loop", query) + break + + loops += 1 params["pg"] += 1 # search next page time.sleep(self.multi_result_throttle) + if not max_loops_not_met: + logger.debug("Max loops limit exceeded") + def _get_page_subtitles(self, params, video): search_link = f"{_SERVER_URL}/index.php" response = self.session.get( @@ -170,19 +172,19 @@ class SubdivxSubtitlesProvider(Provider): ) try: - page_subtitles = self._parse_subtitles_page(video, response) + page_subtitles, last_page = self._parse_subtitles_page(video, response) except Exception as error: logger.error(f"Error parsing subtitles list: {error}") return [] - return page_subtitles + return page_subtitles, last_page def list_subtitles(self, video, languages): return self.query(video, languages) def download_subtitle(self, subtitle): # download the subtitle - logger.info("Downloading subtitle %r", subtitle) + logger.debug("Downloading subtitle %r", subtitle) # download zip / rar file with the subtitle response = self.session.get( @@ -212,7 +214,8 @@ class SubdivxSubtitlesProvider(Provider): ) title_soups = page_soup.find_all("div", {"id": "menu_detalle_buscador"}) body_soups = page_soup.find_all("div", {"id": "buscador_detalle"}) - episode = isinstance(video, Episode) + + title_checker = _check_episode if isinstance(video, Episode) else _check_movie for subtitle in range(0, len(title_soups)): title_soup, body_soup = title_soups[subtitle], body_soups[subtitle] @@ -224,8 +227,7 @@ class SubdivxSubtitlesProvider(Provider): logger.debug("Skipping forced subtitles: %s", title) continue - # Check movie title (if the video is a movie) - if not episode and not _check_movie(video, title): + if not title_checker(video, title): continue # Data @@ -257,7 +259,7 @@ class SubdivxSubtitlesProvider(Provider): logger.debug("Found subtitle %r", subtitle) subtitles.append(subtitle) - return subtitles + return subtitles, len(title_soups) < 100 def _clean_title(title): @@ -282,6 +284,24 @@ def _get_download_url(data): return None +def _check_episode(video, title): + series_title = _SERIES_RE.sub("", title).strip() + + distance = abs(len(series_title) - len(video.series)) + + series_matched = distance < 4 + + logger.debug( + "Series matched? %s [%s -> %s] [distance: %d]", + series_matched, + video.series, + series_title, + distance, + ) + + return series_matched + + def _check_movie(video, title): if str(video.year) not in title: return False diff --git a/tests/subliminal_patch/test_subdivx.py b/tests/subliminal_patch/test_subdivx.py index bc932d494..c45fd1d34 100644 --- a/tests/subliminal_patch/test_subdivx.py +++ b/tests/subliminal_patch/test_subdivx.py @@ -33,6 +33,8 @@ def test_handle_multi_page_search(episodes): "Game Of Thrones", episodes["got_s03e10"] ) ) + print(len(subs)) + return assert len(subs) > 100 @@ -52,13 +54,12 @@ def test_list_subtitles_castillian_spanish(episodes): assert provider.list_subtitles(item, {Language.fromietf("es")}) -def test_list_subtitles_episode_with_year(episodes): +def test_list_subtitles_episode_with_title_only_fallback(episodes): item = list(episodes.values())[0] item.series = "The Bear" item.name = "The Bear" item.season = 1 item.episode = 1 - item.year = 2022 with SubdivxSubtitlesProvider() as provider: subtitles = provider.list_subtitles(item, {Language("spa", "MX")}) @@ -120,7 +121,7 @@ def test_subtitle_description_not_lowercase(video): with SubdivxSubtitlesProvider() as provider: subtitles = provider.list_subtitles(video, {Language("spa", "MX")}) assert subtitles - assert not subtitles[0].description.islower() + assert not subtitles[0]._description.islower() def test_subtitle_matches(video): From 9c5a88f880d0230eb988b31b9841f17c975e22a8 Mon Sep 17 00:00:00 2001 From: Vitiko Date: Wed, 19 Oct 2022 20:11:10 -0400 Subject: [PATCH 3/5] no log: update subdivx provider tests --- tests/subliminal_patch/test_subdivx.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/subliminal_patch/test_subdivx.py b/tests/subliminal_patch/test_subdivx.py index c45fd1d34..852e3ca6f 100644 --- a/tests/subliminal_patch/test_subdivx.py +++ b/tests/subliminal_patch/test_subdivx.py @@ -33,8 +33,6 @@ def test_handle_multi_page_search(episodes): "Game Of Thrones", episodes["got_s03e10"] ) ) - print(len(subs)) - return assert len(subs) > 100 From 21359b32b551ac13ecedc07b83f8c90bf080ec53 Mon Sep 17 00:00:00 2001 From: Vitiko Date: Thu, 20 Oct 2022 19:42:25 -0400 Subject: [PATCH 4/5] Subdivx Provider: add more search improvements --- libs/subliminal_patch/providers/subdivx.py | 45 +++++++++++++++++----- tests/subliminal_patch/test_subdivx.py | 10 ++--- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/libs/subliminal_patch/providers/subdivx.py b/libs/subliminal_patch/providers/subdivx.py index e0cae126e..a1e3669fb 100644 --- a/libs/subliminal_patch/providers/subdivx.py +++ b/libs/subliminal_patch/providers/subdivx.py @@ -28,7 +28,17 @@ _CLEAN_TITLE_RES = [ _SPANISH_RE = re.compile(r"españa|ib[eé]rico|castellano|gallego|castilla") _YEAR_RE = re.compile(r"(\(\d{4}\))") -_SERIES_RE = re.compile(r"\(?\d{4}\)?|[sS]\d{2}([eE]\d{2})?") +_SERIES_RE = re.compile( + r"\(?\d{4}\)?|(s\d{1,2}(e\d{1,2})?|(season|temporada)\s\d{1,2}).*?$", + flags=re.IGNORECASE, +) +_EPISODE_NUM_RE = re.compile(r"[eE](?P\d{1,2})") +_SEASON_NUM_RE = re.compile( + r"(s|(season|temporada)\s)(?P\d{1,2})", flags=re.IGNORECASE +) +_UNSUPPORTED_RE = re.compile( + r"(\)?\d{4}\)?|[sS]\d{1,2})\s.{,3}(extras|forzado(s)?|forced)", flags=re.IGNORECASE +) logger = logging.getLogger(__name__) @@ -131,7 +141,7 @@ class SubdivxSubtitlesProvider(Provider): return subtitles - def _handle_multi_page_search(self, query, video, max_loops=3): + def _handle_multi_page_search(self, query, video, max_loops=2): params = { "buscar2": query, "accion": "5", @@ -163,7 +173,7 @@ class SubdivxSubtitlesProvider(Provider): time.sleep(self.multi_result_throttle) if not max_loops_not_met: - logger.debug("Max loops limit exceeded") + logger.debug("Max loops limit exceeded (%d)", max_loops) def _get_page_subtitles(self, params, video): search_link = f"{_SERVER_URL}/index.php" @@ -222,9 +232,8 @@ class SubdivxSubtitlesProvider(Provider): # title title = _clean_title(title_soup.find("a").text) - # Forced subtitles are not supported - if title.lower().rstrip().endswith(("forzado", "forzados")): - logger.debug("Skipping forced subtitles: %s", title) + if _UNSUPPORTED_RE.search(title): + logger.debug("Skipping unsupported subtitles: %s", title) continue if not title_checker(video, title): @@ -285,17 +294,33 @@ def _get_download_url(data): def _check_episode(video, title): + ep_num = _EPISODE_NUM_RE.search(title) + season_num = _SEASON_NUM_RE.search(title) + + if season_num is None: + logger.debug("Not a season/episode: %s", title) + return False + + season_num = int(season_num.group("x")) + + if ep_num is not None: + ep_num = int(ep_num.group("x")) + + ep_matches = ( + (video.episode == ep_num) or (ep_num is None) + ) and season_num == video.season + series_title = _SERIES_RE.sub("", title).strip() distance = abs(len(series_title) - len(video.series)) - series_matched = distance < 4 + series_matched = distance < 4 and ep_matches logger.debug( - "Series matched? %s [%s -> %s] [distance: %d]", + "Series matched? %s [%s -> %s] [title distance: %d]", series_matched, - video.series, - series_title, + video, + title, distance, ) diff --git a/tests/subliminal_patch/test_subdivx.py b/tests/subliminal_patch/test_subdivx.py index 852e3ca6f..26afdedde 100644 --- a/tests/subliminal_patch/test_subdivx.py +++ b/tests/subliminal_patch/test_subdivx.py @@ -28,12 +28,10 @@ def test_list_subtitles_movie_with_year_fallback(movies): def test_handle_multi_page_search(episodes): with SubdivxSubtitlesProvider() as provider: - subs = list( - provider._handle_multi_page_search( - "Game Of Thrones", episodes["got_s03e10"] - ) - ) - assert len(subs) > 100 + for _ in provider._handle_multi_page_search( + "Game Of Thrones", episodes["got_s03e10"] + ): + pass @pytest.mark.parametrize( From d4203ee7cb511feb520b603fea875f9d3451af06 Mon Sep 17 00:00:00 2001 From: silentcommitter <116306456+silentcommitter@users.noreply.github.com> Date: Sat, 22 Oct 2022 21:26:28 +0200 Subject: [PATCH 5/5] Subf2m provider improvements (#1973) * subf2m provider: add more languages * subf2m provider: use urllib parse rather than string replacement * subf2m provider: change movie title matching to match exact year and use similarity based title matching * subf2m provider: change tvshow title matching to match exact season and use similarity based title matching * no log: Subf2m Provider: add tests * Subf2m Provider: add serbian support Co-authored-by: Vitiko --- libs/subliminal_patch/providers/subf2m.py | 80 +++++++++++++++++++---- tests/subliminal_patch/test_subf2m.py | 37 +++++++++-- 2 files changed, 98 insertions(+), 19 deletions(-) diff --git a/libs/subliminal_patch/providers/subf2m.py b/libs/subliminal_patch/providers/subf2m.py index e6d8c1bc9..f4cf35e2d 100644 --- a/libs/subliminal_patch/providers/subf2m.py +++ b/libs/subliminal_patch/providers/subf2m.py @@ -2,10 +2,13 @@ import functools import logging +import urllib.parse +import re from bs4 import BeautifulSoup as bso from guessit import guessit from requests import Session +from difflib import SequenceMatcher from subliminal_patch.core import Episode from subliminal_patch.core import Movie from subliminal_patch.exceptions import APIThrottled @@ -82,12 +85,37 @@ _LANGUAGE_MAP = { "dutch": "dut", "hebrew": "heb", "indonesian": "ind", + "danish": "dan", + "norwegian": "nor", + "bengali": "ben", + "bulgarian": "bul", + "croatian": "hrv", + "swedish": "swe", + "vietnamese": "vie", + "czech": "cze", + "finnish": "fin", + "french": "fre", + "german": "ger", + "greek": "gre", + "hungarian": "hun", + "icelandic": "ice", + "japanese": "jpn", + "macedonian": "mac", + "malay": "may", + "polish": "pol", + "romanian": "rum", + "russian": "rus", + "serbian": "srp", + "thai": "tha", + "turkish": "tur", } class Subf2mProvider(Provider): provider_name = "subf2m" + _movie_title_regex = re.compile(r"^(.+?)( \((\d{4})\))?$") + _tv_show_title_regex = re.compile(r"^(.+?) - (.*?) season( \((\d{4})\))?$") _supported_languages = {} _supported_languages["brazillian-portuguese"] = Language("por", "BR") @@ -112,7 +140,7 @@ class Subf2mProvider(Provider): def _gen_results(self, query): req = self._session.get( - f"{_BASE_URL}/subtitles/searchbytitle?query={query.replace(' ', '+')}&l=", + f"{_BASE_URL}/subtitles/searchbytitle?query={urllib.parse.quote(query)}&l=", stream=True, ) text = "\n".join(line for line in req.iter_lines(decode_unicode=True) if line) @@ -123,35 +151,61 @@ class Subf2mProvider(Provider): def _search_movie(self, title, year): title = title.lower() - year = f"({year})" + year = str(year) found_movie = None + results = [] for result in self._gen_results(title): text = result.text.lower() - if title.lower() in text and year in text: - found_movie = result.get("href") - logger.debug("Movie found: %s", found_movie) - break + match = self._movie_title_regex.match(text) + if not match: + continue + match_title = match.group(1) + match_year = match.group(3) + if year == match_year: + results.append( + { + "href": result.get("href"), + "similarity": SequenceMatcher(None, title, match_title).ratio(), + } + ) + if results: + results.sort(key=lambda x: x["similarity"], reverse=True) + found_movie = results[0]["href"] + logger.debug("Movie found: %s", results[0]) return found_movie def _search_tv_show_season(self, title, season): try: - season_str = f"{_SEASONS[season - 1]} Season" + season_str = _SEASONS[season - 1].lower() except IndexError: logger.debug("Season number not supported: %s", season) return None - expected_result = f"{title} - {season_str}".lower() - found_tv_show_season = None + results = [] for result in self._gen_results(title): - if expected_result in result.text.lower(): - found_tv_show_season = result.get("href") - logger.debug("TV Show season found: %s", found_tv_show_season) - break + text = result.text.lower() + match = self._tv_show_title_regex.match(text) + if not match: + continue + match_title = match.group(1) + match_season = match.group(2) + if season_str == match_season: + results.append( + { + "href": result.get("href"), + "similarity": SequenceMatcher(None, title, match_title).ratio(), + } + ) + + if results: + results.sort(key=lambda x: x["similarity"], reverse=True) + found_tv_show_season = results[0]["href"] + logger.debug("TV Show season found: %s", results[0]) return found_tv_show_season diff --git a/tests/subliminal_patch/test_subf2m.py b/tests/subliminal_patch/test_subf2m.py index 49651f3d5..0e1b70121 100644 --- a/tests/subliminal_patch/test_subf2m.py +++ b/tests/subliminal_patch/test_subf2m.py @@ -5,20 +5,45 @@ from subliminal_patch.providers.subf2m import Subf2mSubtitle from subzero.language import Language -def test_search_movie(movies): - movie = movies["dune"] +@pytest.mark.parametrize( + "title,year,expected_url", + [ + ( + "Dead Man's Chest", + 2006, + "/subtitles/pirates-of-the-caribbean-2-dead-mans-chest", + ), + ("Dune", 2021, "/subtitles/dune-2021"), + ("Cure", 1997, "/subtitles/cure-kyua"), + ], +) +def test_search_movie(movies, title, year, expected_url): + movie = list(movies.values())[0] + movie.title = title + movie.year = year with Subf2mProvider() as provider: result = provider._search_movie(movie.title, movie.year) - assert result == "/subtitles/dune-2021" + assert result == expected_url -def test_search_tv_show_season(episodes): - episode = episodes["breaking_bad_s01e01"] +@pytest.mark.parametrize( + "title,season,expected_url", + [ + ("Breaking Bad", 1, "/subtitles/breaking-bad-first-season"), + ("House Of The Dragon", 1, "/subtitles/house-of-the-dragon-first-season"), + ("The Bear", 1, "/subtitles/the-bear-first-season"), + ], +) +def test_search_tv_show_season(episodes, title, season, expected_url): + episode = list(episodes.values())[0] + episode.name = title + episode.series = title + episode.season = season with Subf2mProvider() as provider: result = provider._search_tv_show_season(episode.series, episode.season) - assert result == "/subtitles/breaking-bad-first-season" + assert result == expected_url @pytest.mark.parametrize("language", [Language.fromalpha2("en"), Language("por", "BR")])