Improved subtitles scoring system

This commit is contained in:
morpheus65535 2021-06-10 16:00:42 -04:00 committed by GitHub
parent f9997ca969
commit e86d537ca2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 385 additions and 108 deletions

View File

@ -180,6 +180,34 @@ defaults = {
'use_subsync_movie_threshold': 'False',
'subsync_movie_threshold': '70',
'debug': 'False'
},
'series_scores': {
"hash": 359,
"series": 180,
"year": 90,
"season": 30,
"episode": 30,
"release_group": 15,
"source": 7,
"audio_codec": 3,
"resolution": 2,
"video_codec": 2,
"hearing_impaired": 1,
"streaming_service": 0,
"edition": 0,
},
'movie_scores': {
"hash": 119,
"title": 60,
"year": 30,
"release_group": 15,
"source": 7,
"audio_codec": 3,
"resolution": 2,
"video_codec": 2,
"hearing_impaired": 1,
"streaming_service": 0,
"edition": 0,
}
}

View File

@ -232,6 +232,27 @@ class TableShowsRootfolder(BaseModel):
primary_key = False
class TableCustomScoreProfiles(BaseModel):
    """Database model holding one row per custom score profile."""
    id = AutoField()
    # Profile name; score.py adds it to a subtitle's matches when the profile applies
    name = TextField(null=True)
    # Media kind; score.py filters profiles with `media == Score.media`
    # ("series" for SeriesScore, "movies" for MovieScore)
    media = TextField(null=True)
    # Score associated with the profile name; nullable, so it may be None
    score = IntegerField(null=True)

    class Meta:
        table_name = 'table_custom_score_profiles'
class TableCustomScoreProfileConditions(BaseModel):
    """Database model holding the conditions attached to a custom score profile."""
    # Owning profile (references TableCustomScoreProfiles.id)
    profile_id = ForeignKeyField(TableCustomScoreProfiles, to_field="id")
    type = TextField(null=True) # provider, uploader, regex, etc
    value = TextField(null=True) # opensubtitles, jane_doe, [a-z], etc
    # When set, score.py discards the whole profile if this condition is not met
    required = BooleanField(default=False)
    # Read by score.py as the condition's "negate" flag
    negate = BooleanField(default=False)

    class Meta:
        table_name = 'table_custom_score_profile_conditions'
def init_db():
# Create tables if they don't exist.
database.create_tables([System,
@ -246,7 +267,9 @@ def init_db():
TableSettingsLanguages,
TableSettingsNotifier,
TableShows,
TableShowsRootfolder])
TableShowsRootfolder,
TableCustomScoreProfiles,
TableCustomScoreProfileConditions])
# add the system table single row if it's not existing
# we must retry until the tables are created

View File

@ -42,6 +42,7 @@ from embedded_subs_reader import parse_video_metadata
from analytics import track_event
from locale import getpreferredencoding
from score import movie_score, series_score
def get_video(path, title, sceneName, providers=None, media_type="movie"):
@ -82,30 +83,6 @@ def get_video(path, title, sceneName, providers=None, media_type="movie"):
logging.exception("BAZARR Error trying to get video information for this file: " + original_path)
def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_score_series_perc=240 * 100 / 360.0,
               min_score_special_ep=180 * 100 / 360.0):
    """
    Work out the accepted score window for a video.

    Movies are scored out of 120 and series out of 360; the minimum is the
    requested percentage of that maximum. Special episodes of a series use
    their own percentage.

    :param video: `Video` instance
    :param media_type: movie/series
    :param min_score_movie_perc: Percentage of max score for min score of movies
    :param min_score_series_perc: Percentage of max score for min score of series
    :param min_score_special_ep: Percentage of max score for min score of series special episode
    :return: tuple(min_score, max_score, set(scores))
    """
    # The movie values are always computed first, exactly as a default
    movie_ceiling = 120.0
    movie_floor = movie_ceiling * min_score_movie_perc / 100.0
    movie_keys = set(subliminal_scores.movie_scores.keys())
    if media_type != "series":
        return movie_floor, movie_ceiling, movie_keys

    series_ceiling = 360.0
    series_floor = series_ceiling * min_score_series_perc / 100.0
    series_keys = set(subliminal_scores.episode_scores.keys())
    if video.is_special:
        # Specials get a dedicated (usually lower) threshold
        series_floor = series_ceiling * min_score_special_ep / 100.0
    return series_floor, series_ceiling, series_keys
def download_subtitle(path, language, audio_language, hi, forced, providers, providers_auth, sceneName, title,
media_type, forced_minimum_score=None, is_upgrade=False):
# fixme: supply all missing languages, not only one, to hit providers only once who support multiple languages in
@ -168,8 +145,8 @@ def download_subtitle(path, language, audio_language, hi, forced, providers, pro
video = get_video(force_unicode(path), title, sceneName, providers=providers,
media_type=media_type)
if video:
min_score, max_score, scores = get_scores(video, media_type, min_score_movie_perc=int(minimum_score_movie),
min_score_series_perc=int(minimum_score))
handler = series_score if media_type == "series" else movie_score
min_score, max_score, scores = _get_scores(media_type, minimum_score_movie, minimum_score)
if providers:
if forced_minimum_score:
@ -182,6 +159,7 @@ def download_subtitle(path, language, audio_language, hi, forced, providers, pro
throttle_time=None, # fixme
blacklist=get_blacklist(media_type=media_type),
throttle_callback=provider_throttle,
score_obj=handler,
pre_download_hook=None, # fixme
post_download_hook=None, # fixme
language_hook=None) # fixme
@ -368,8 +346,8 @@ def manual_search(path, profileId, providers, providers_auth, sceneName, title,
logging.info("BAZARR All providers are throttled")
return None
if video:
min_score, max_score, scores = get_scores(video, media_type, min_score_movie_perc=int(minimum_score_movie),
min_score_series_perc=int(minimum_score))
handler = series_score if media_type == "series" else movie_score
min_score, max_score, scores = _get_scores(media_type, minimum_score_movie, minimum_score)
try:
if providers:
@ -431,12 +409,12 @@ def manual_search(path, profileId, providers, providers_auth, sceneName, title,
if not initial_hi_match:
initial_hi = None
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=initial_hi)
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=initial_hi, score_obj=handler)
if 'hash' not in matches:
not_matched = scores - matches
else:
not_matched = set()
s.score = score
s.score = score_without_hash
if s.hearing_impaired == initial_hi:
matches.add('hearing_impaired')
@ -505,7 +483,7 @@ def manual_download_subtitle(path, language, audio_language, hi, forced, subtitl
video = get_video(force_unicode(path), title, sceneName, providers={provider},
media_type=media_type)
if video:
min_score, max_score, scores = get_scores(video, media_type)
min_score, max_score, scores = _get_scores(media_type)
try:
if provider:
download_subtitles([subtitle],
@ -1717,3 +1695,11 @@ def _get_lang_obj(alpha3):
return Language(alpha3)
return sub.subzero_language()
def _get_scores(media_type, min_movie=None, min_ep=None):
    """Return ``(min_score, max_score, score_keys)`` from the matching score handler.

    :param media_type: "series" selects ``series_score``; anything else selects ``movie_score``
    :param min_movie: minimum score percentage for movies; a falsy value falls back
        to ``60 * 100 / handler.max_score``
    :param min_ep: minimum score percentage for episodes; a falsy value falls back
        to ``240 * 100 / handler.max_score``
    :return: whatever ``handler.get_scores`` returns for the truncated percentage
    """
    is_series = media_type == "series"
    handler = movie_score if not is_series else series_score

    # Both fallbacks are resolved up front, whichever media type is requested
    movie_percent = min_movie if min_movie else 60 * 100 / handler.max_score
    episode_percent = min_ep if min_ep else 240 * 100 / handler.max_score

    chosen_percent = episode_percent if is_series else movie_percent
    return handler.get_scores(int(chosen_percent))

221
bazarr/score.py Normal file
View File

@ -0,0 +1,221 @@
# -*- coding: utf-8 -*-
import logging
import re
from config import get_settings
from database import TableCustomScoreProfileConditions as conditions_table
from database import TableCustomScoreProfiles as profiles_table
logger = logging.getLogger(__name__)
class CustomScoreProfile:
    """A named score that applies to a subtitle when the profile's conditions match.

    Conditions are rows of ``conditions_table`` tied to this profile's ``id``.
    They are fetched lazily on the first :meth:`check` call and cached on the
    instance afterwards.
    """

    table = profiles_table
    conditions_table = conditions_table

    def __init__(self, id=None, name=None, score=0, media=None):
        """Create a profile, typically from a row dict of the profiles table.

        :param id: profile row id, used to look up the profile's conditions
        :param name: profile name; falls back to "N/A" when empty
        :param score: score associated with the profile
        :param media: media kind the profile was stored for
        """
        self.id = id
        self.name = name or "N/A"
        self.score = score
        self.media = media
        self._conditions = []
        self._conditions_loaded = False

    def load_conditions(self):
        """Load this profile's conditions from the database as a list of dicts."""
        try:
            self._conditions = list(
                self.conditions_table.select()
                .where(self.conditions_table.profile_id == self.id)
                .dicts()
            )
        except self.conditions_table.DoesNotExist:
            logger.debug("Conditions not found for %s", self)
            self._conditions = []
        self._conditions_loaded = True

    def check(self, subtitle):
        """Return True when the profile's conditions are met by ``subtitle``.

        A profile without conditions never matches.
        """
        # Avoid calling the database on every score check
        if not self._conditions_loaded:
            self.load_conditions()

        # Always return False if no conditions are set
        if not self._conditions:
            logger.debug("No conditions found in %s profile", self)
            return False

        logger.debug("Checking conditions for %s profile", self)
        met = self._check_conditions(subtitle)
        logger.debug("Profile conditions met? %s", met)
        return met

    def _check_conditions(self, subtitle):
        """Evaluate every condition against ``subtitle``.

        A condition is *met* when its test succeeds and it is not negated, or
        when its test fails and it is negated. The profile matches when at
        least one condition is met and no required condition is unmet.

        :return: True if the profile applies to the subtitle, False otherwise
        """
        # getattr with a default so a subtitle object lacking one of these
        # attributes skips the related conditions instead of raising
        checkers = {
            "provider": getattr(subtitle, "provider_name", None),
            "uploader": getattr(subtitle, "uploader", None),
            "language": getattr(subtitle, "language", None),
            "regex": getattr(subtitle, "release_info", None),
        }

        met_any = False
        for condition in self._conditions:
            # Condition dict example:
            # {type: provider, value: subdivx, required: False, negate: False}
            key = condition.get("type")
            sub_value = checkers.get(key)
            if sub_value is None:
                # Unknown condition type or attribute not available: nothing to test
                continue

            cond_value = condition.get("value", "")
            negate = bool(condition.get("negate", False))
            logger.debug("Checking %s: %s (condition: %s)", key, sub_value, condition)

            if key == "regex":
                try:
                    found = re.search(str(cond_value), str(sub_value)) is not None
                except re.error:
                    # A broken user-supplied pattern must not abort the whole scoring
                    logger.warning("Invalid regex %r in %s, ignoring condition", cond_value, self)
                    continue
            else:
                # Compare string forms: the stored value is text while the
                # subtitle attribute may be an object (presumably the case for
                # `language` -- confirm against the subtitle classes)
                found = str(cond_value) == str(sub_value)

            # `negate` inverts the outcome of the test itself
            if found != negate:
                logger.debug("%s condition met: %s -> %s", key, cond_value, sub_value)
                met_any = True
            # Return False if any required condition is not met
            elif condition.get("required"):
                logger.debug("%s required condition not met, discarding profile", key)
                return False

        return met_any

    def __repr__(self):
        return f"<ScoreProfile {self.name} (score: {self.score})>"
class Score:
    """Base score table: per-match weights plus optional custom profiles.

    ``data`` holds the weight of every match key (``defaults`` overridden by the
    keyword arguments given at construction). Custom profiles loaded from
    ``profiles_table`` add one extra key each, named after the profile.
    """

    media = None
    defaults = {}
    profiles_table = profiles_table

    def __init__(self, load_profiles=False, **kwargs):
        """Build the score table.

        :param load_profiles: load the custom profiles right away instead of lazily
        :param kwargs: weights overriding the class ``defaults``
        """
        self.data = self.defaults.copy()
        self.data.update(**kwargs)
        self._profiles = []
        self._profiles_loaded = False

        if load_profiles:
            self.load_profiles()

    def check_custom_profiles(self, subtitle, matches):
        """Add to ``matches`` (in place) the name of every profile met by ``subtitle``."""
        if not self._profiles_loaded:
            self.load_profiles()

        for profile in self._profiles:
            if profile.check(subtitle):
                matches.add(profile.name)

    def load_profiles(self):
        """Load the profiles associated with the class. This method must be called
        after every custom profile creation or update."""
        try:
            self._profiles = [
                CustomScoreProfile(**item)
                for item in self.profiles_table.select()
                .where(self.profiles_table.media == self.media)
                .dicts()
            ]
            logger.debug("Loaded profiles: %s", self._profiles)
        except self.profiles_table.DoesNotExist:
            logger.debug("No score profiles found")
            self._profiles = []
        self._profiles_loaded = True

    def reset(self):
        """Restore the class default weights (keys absent from ``defaults`` are kept)."""
        self.data.update(self.defaults)

    def update(self, **kwargs):
        """Override weights with the given keyword arguments."""
        self.data.update(kwargs)

    @classmethod
    def from_config(cls, **kwargs):
        """Alternate constructor that also loads the custom profiles immediately."""
        return cls(True, **kwargs)

    def get_scores(self, min_percent, special=None):
        """Return ``(min_score, max_score, score_keys)``.

        :param min_percent: minimum score as a percentage of ``max_score``
        :param special: when truthy, used instead of ``min_percent``
        """
        # Evaluate the property once so both tuple members share the same value
        max_score = self.max_score
        return (
            max_score * (special or min_percent) / 100,
            max_score,
            set(self.scores),
        )

    @property
    def custom_profile_scores(self):
        """Map each loaded profile name to its score (a missing score counts as 0)."""
        # The score column is nullable; None would break the numeric sums downstream
        return {item.name: item.score or 0 for item in self._profiles}

    @property
    def scores(self):
        """All weights: custom profile scores, overridden by ``data`` on a name clash."""
        return {**self.custom_profile_scores, **self.data}

    @property
    def max_score(self):
        """Sum of the positive weights, with the "hash" weight subtracted.

        ``scores`` already contains the custom profile scores, so they must not
        be added a second time.
        """
        return sum(val for val in self.scores.values() if val > 0) - self.data["hash"]

    def __str__(self):
        return f"<{self.media} Score class>"
class SeriesScore(Score):
    """Score table for series; its weights come from the "series_scores" settings section."""

    media = "series"
    defaults = dict(
        hash=359,
        series=180,
        year=90,
        season=30,
        episode=30,
        release_group=15,
        source=7,
        audio_codec=3,
        resolution=2,
        video_codec=2,
        hearing_impaired=1,
        streaming_service=0,
        edition=0,
    )

    @classmethod
    def from_config(cls, **kwargs):
        """Build the handler from full settings, loading custom profiles immediately."""
        section = kwargs["series_scores"]
        return cls(True, **section)

    def update(self, **kwargs):
        """Refresh the weights from the "series_scores" entry of the given settings."""
        section = kwargs["series_scores"]
        self.data.update(section)
class MovieScore(Score):
    """Score table for movies; its weights come from the "movie_scores" settings section."""

    media = "movies"
    defaults = dict(
        hash=119,
        title=60,
        year=30,
        release_group=15,
        source=7,
        audio_codec=3,
        resolution=2,
        video_codec=2,
        hearing_impaired=1,
        streaming_service=0,
        edition=0,
    )

    @classmethod
    def from_config(cls, **kwargs):
        """Build the handler from full settings, loading custom profiles immediately."""
        section = kwargs["movie_scores"]
        return cls(True, **section)

    def update(self, **kwargs):
        """Refresh the weights from the "movie_scores" entry of the given settings."""
        section = kwargs["movie_scores"]
        self.data.update(section)
# Module-level score handlers, created once at import time from the current
# settings. from_config passes load_profiles=True, so each handler also loads
# its custom profiles from the database here.
series_score = SeriesScore.from_config(**get_settings())
movie_score = MovieScore.from_config(**get_settings())

View File

@ -7,7 +7,6 @@ import re
import zlib
from babelfish import Language, language_converters
from guessit import guessit
from six.moves.xmlrpc_client import ServerProxy
from . import Provider, TimeoutSafeTransport

View File

@ -179,7 +179,9 @@ class Episode(Video):
return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
year=guess.get('year'), source=guess.get('source'), original_series='year' not in guess,
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'),
streaming_service=guess.get("streaming_service"),
edition=guess.get("edition", guess.get("alternative_title")))
@classmethod
def fromname(cls, name):
@ -227,7 +229,8 @@ class Movie(Video):
return cls(name, guess['title'], source=guess.get('source'), release_group=guess.get('release_group'),
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles,
streaming_service=guess.get("streaming_service"), edition=guess.get("edition"))
@classmethod
def fromname(cls, name):

View File

@ -319,7 +319,7 @@ class SZProviderPool(ProviderPool):
return True
def download_best_subtitles(self, subtitles, video, languages, min_score=0, hearing_impaired=False, only_one=False,
compute_score=None):
compute_score=None, score_obj=None):
"""Download the best matching subtitles.
patch:
@ -365,7 +365,8 @@ class SZProviderPool(ProviderPool):
orig_matches = matches.copy()
logger.debug('%r: Found matches %r', s, matches)
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=use_hearing_impaired)
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=use_hearing_impaired,
score_obj=score_obj)
unsorted_subtitles.append(
(s, score, score_without_hash, matches, orig_matches))
@ -774,7 +775,7 @@ def download_subtitles(subtitles, pool_class=ProviderPool, **kwargs):
def download_best_subtitles(videos, languages, min_score=0, hearing_impaired=False, only_one=False, compute_score=None,
pool_class=ProviderPool, throttle_time=0, **kwargs):
pool_class=ProviderPool, throttle_time=0, score_obj=None, **kwargs):
"""List and download the best matching subtitles.
The `videos` must pass the `languages` and `undefined` (`only_one`) checks of :func:`check_video`.
@ -818,7 +819,7 @@ def download_best_subtitles(videos, languages, min_score=0, hearing_impaired=Fal
subtitles = pool.download_best_subtitles(pool.list_subtitles(video, languages - video.subtitle_languages),
video, languages, min_score=min_score,
hearing_impaired=hearing_impaired, only_one=only_one,
compute_score=compute_score)
compute_score=compute_score, score_obj=score_obj)
logger.info('Downloaded %d subtitle(s)', len(subtitles))
downloaded_subtitles[video].extend(subtitles)

View File

@ -20,6 +20,7 @@ from .mixins import ProviderRetryMixin
from subliminal.subtitle import fix_line_ending
from subliminal_patch.http import SubZeroRequestsTransport
from subliminal_patch.utils import sanitize, fix_inconsistent_naming
from subliminal_patch.subtitle import guess_matches
from subliminal.cache import region
from subliminal_patch.score import framerate_equal
from subliminal_patch.subtitle import guess_matches

View File

@ -99,7 +99,7 @@ class SuchaProvider(Provider):
result.raise_for_status()
results = result.json()
if isinstance(result, dict):
if isinstance(results, dict):
logger.debug("No subtitles found")
return []

View File

@ -9,9 +9,7 @@ from subliminal.score import get_scores
logger = logging.getLogger(__name__)
FPS_EQUALITY = (
(23.976, 23.98, 24.0),
)
FPS_EQUALITY = ((23.976, 23.98, 24.0),)
def framerate_equal(source, check):
@ -30,7 +28,7 @@ def framerate_equal(source, check):
return False
def compute_score(matches, subtitle, video, hearing_impaired=None):
def compute_score(matches, subtitle, video, hearing_impaired=None, score_obj=None):
"""Compute the score of the `subtitle` against the `video` with `hearing_impaired` preference.
patch:
@ -51,9 +49,8 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
"""
logger.info('%r: Computing score for video %r with %r', subtitle, video, dict(hearing_impaired=hearing_impaired))
# get the scores dict
scores = get_scores(video)
# logger.debug('Using scores %r', scores)
scores = score_obj.scores or get_scores(video)
score_obj.check_custom_profiles(subtitle, matches)
is_episode = isinstance(video, Episode)
is_movie = isinstance(video, Movie)
@ -64,23 +61,22 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
orig_matches = matches.copy()
# on hash match, discard everything else
if subtitle.hash_verifiable:
if 'hash' in matches:
# hash is error-prone, try to fix that
hash_valid_if = episode_hash_valid_if if is_episode else movie_hash_valid_if
if subtitle.hash_verifiable and 'hash' in matches:
# hash is error-prone, try to fix that
hash_valid_if = episode_hash_valid_if if is_episode else movie_hash_valid_if
# don't validate hashes of specials, as season and episode tend to be wrong
if is_movie or not video.is_special:
if hash_valid_if <= set(matches):
# series, season and episode matched, hash is valid
logger.debug('%r: Using valid hash, as %s are correct (%r) and (%r)', subtitle, hash_valid_if, matches,
video)
matches &= {'hash'}
else:
# no match, invalidate hash
logger.debug('%r: Ignoring hash as other matches are wrong (missing: %r) and (%r)', subtitle,
hash_valid_if - matches, video)
matches -= {"hash"}
# don't validate hashes of specials, as season and episode tend to be wrong
if is_movie or not video.is_special:
if hash_valid_if <= set(matches):
# series, season and episode matched, hash is valid
logger.debug('%r: Using valid hash, as %s are correct (%r) and (%r)', subtitle, hash_valid_if, matches,
video)
matches &= {'hash'}
else:
# no match, invalidate hash
logger.debug('%r: Ignoring hash as other matches are wrong (missing: %r) and (%r)', subtitle,
hash_valid_if - matches, video)
matches -= {"hash"}
elif 'hash' in matches:
logger.debug('%r: Hash not verifiable for this provider. Keeping it', subtitle)
matches &= {'hash'}
@ -88,32 +84,10 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
# handle equivalent matches
eq_matches = set()
if is_episode:
if 'title' in matches:
logger.debug('Adding title match equivalent')
eq_matches.add('episode')
if 'series_imdb_id' in matches:
logger.debug('Adding series_imdb_id match equivalent')
eq_matches |= {'series', 'year'}
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
eq_matches |= {'series', 'year', 'season', 'episode'}
if 'tvdb_id' in matches:
logger.debug('Adding tvdb_id match equivalents')
eq_matches |= {'series', 'year', 'season', 'episode', 'title'}
if 'series_tvdb_id' in matches:
logger.debug('Adding series_tvdb_id match equivalents')
eq_matches |= {'series', 'year'}
# specials
if video.is_special and 'title' in matches and 'series' in matches \
and 'year' in matches:
logger.debug('Adding special title match equivalent')
eq_matches |= {'season', 'episode'}
elif is_movie:
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
eq_matches |= {'title', 'year'}
_episode_checks(video, eq_matches, matches)
elif is_movie and 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
eq_matches |= {'title', 'year'}
matches |= eq_matches
@ -130,3 +104,26 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
score_without_hash = sum((scores.get(match, 0) for match in orig_matches | eq_matches if match != "hash"))
return score, score_without_hash
def _episode_checks(video, eq_matches, matches):
if "title" in matches:
logger.debug("Adding title match equivalent")
eq_matches.add("episode")
if "series_imdb_id" in matches:
logger.debug("Adding series_imdb_id match equivalent")
eq_matches |= {"series", "year"}
if "imdb_id" in matches:
logger.debug("Adding imdb_id match equivalents")
eq_matches |= {"series", "year", "season", "episode"}
if "tvdb_id" in matches:
logger.debug("Adding tvdb_id match equivalents")
eq_matches |= {"series", "year", "season", "episode", "title"}
if "series_tvdb_id" in matches:
logger.debug("Adding series_tvdb_id match equivalents")
eq_matches |= {"series", "year"}
# specials
if video.is_special and "title" in matches and "series" in matches and "year" in matches:
logger.debug("Adding special title match equivalent")
eq_matches |= {"season", "episode"}

View File

@ -406,20 +406,28 @@ MERGED_FORMATS = {
"Air": ("SATRip", "DVB", "PPV"),
"Disk-HD": ("HD-DVD", "Blu-ray"),
"Disk-SD": ("DVD", "VHS"),
"Web": ("Web",),
}
MERGED_FORMATS_REV = dict((v.lower(), k.lower()) for k in MERGED_FORMATS for v in MERGED_FORMATS[k])
def _has_match(video, guess, key) -> bool:
value = getattr(video, key)
if value is None:
guess_value = guess.get(key)
# To avoid extra debug calls
if guess_value is None or value is None:
return False
guess_value = guess.get(key)
if isinstance(guess_value, list):
return any(value == item for item in guess_value)
matched = any(value == item for item in guess_value)
else:
matched = value == guess_value
logger.debug("%s matched? %s (%s -> %s)", key, matched, value, guess_value)
return matched
return value == guess_value
def guess_matches(video, guess, partial=False):
@ -438,7 +446,6 @@ def guess_matches(video, guess, partial=False):
:rtype: set
"""
matches = set()
if isinstance(video, Episode):
# series
@ -498,11 +505,6 @@ def guess_matches(video, guess, partial=False):
get_equivalent_release_groups(sanitize_release_group(video.release_group))):
matches.add('release_group')
break
# resolution
if video.resolution and 'screen_size' in guess and guess['screen_size'] == video.resolution:
matches.add('resolution')
# source
if 'source' in guess:
formats = guess["source"]
@ -512,24 +514,37 @@ def guess_matches(video, guess, partial=False):
if video.source:
video_format = video.source.lower()
_video_gen_format = MERGED_FORMATS_REV.get(video_format)
if _video_gen_format:
logger.debug("Treating %s as %s the same", video_format, _video_gen_format)
matched = False
for frmt in formats:
_guess_gen_frmt = MERGED_FORMATS_REV.get(frmt.lower())
# We don't want to match a singleton
if _guess_gen_frmt is None: # If the source is not in MERGED_FORMATS
_guess_gen_frmt = guess["source"]
if _guess_gen_frmt == _video_gen_format:
matched = True
matches.add('source')
break
logger.debug("Source match found? %s: %s -> %s", matched, video.source, formats)
if "release_group" in matches and "source" not in matches:
logger.info("Release group matched but source didn't. Remnoving release group match.")
logger.info("Release group matched but source didn't. Removing release group match.")
matches.remove("release_group")
guess.update({"resolution": guess.get("screen_size")})
if _has_match(video, guess, "video_codec"):
matches.add("video_codec")
# Solve match keys for potential lists
for key in ("video_codec", "audio_codec", "edition", "streaming_service", "resolution"):
if _has_match(video, guess, key):
matches.add(key)
if _has_match(video, guess, "audio_codec"):
matches.add("audio_codec")
# Add streaming service match for non-web sources
if video.source and video.source != "Web":
matches.add("streaming_service")
# As edition tags are rare, add edition match if the video doesn't have an edition
if not video.edition:
matches.add("edition")
return matches

View File

@ -16,7 +16,8 @@ class Video(Video_):
external_subtitle_languages = None
def __init__(self, name, source=None, release_group=None, resolution=None, video_codec=None, audio_codec=None,
imdb_id=None, hashes=None, size=None, subtitle_languages=None, audio_languages=None):
imdb_id=None, hashes=None, size=None, subtitle_languages=None, audio_languages=None,
streaming_service=None, edition=None):
super(Video, self).__init__(name, source=source, release_group=release_group, resolution=resolution,
video_codec=video_codec, audio_codec=audio_codec, imdb_id=imdb_id, hashes=hashes,
size=size, subtitle_languages=subtitle_languages)
@ -25,3 +26,5 @@ class Video(Video_):
self.hints = {}
self.audio_languages = audio_languages or set()
self.external_subtitle_languages = set()
self.streaming_service = streaming_service
self.edition = edition