mirror of
https://github.com/morpheus65535/bazarr
synced 2024-12-30 19:46:25 +00:00
104 lines
3.8 KiB
Python
104 lines
3.8 KiB
Python
|
#!/usr/bin/env python
|
||
|
# -*- coding: utf-8 -*-
|
||
|
"""
|
||
|
Website property.
|
||
|
"""
|
||
|
from pkg_resources import resource_stream # @UnresolvedImport
|
||
|
from rebulk.remodule import re
|
||
|
|
||
|
from rebulk import Rebulk, Rule, RemoveMatch
|
||
|
from ..common import seps
|
||
|
from ..common.formatters import cleanup
|
||
|
from ..common.pattern import is_disabled
|
||
|
from ..common.validators import seps_surround
|
||
|
from ...reutils import build_or_pattern
|
||
|
|
||
|
|
||
|
def website(config):
    """
    Builder for rebulk object.

    Registers three regex patterns and one string pattern under the
    ``website`` name, then attaches the ``PreferTitleOverWebsite`` and
    ``ValidateWebsitePrefix`` rules.

    :param config: rule configuration; the keys ``safe_tlds``,
        ``safe_subdomains``, ``safe_prefixes`` and ``prefixes`` are read
    :type config: dict
    :return: Created Rebulk object
    :rtype: Rebulk
    """
    rebulk = Rebulk(disabled=lambda context: is_disabled(context, 'website'))
    rebulk = rebulk.regex_defaults(flags=re.IGNORECASE).string_defaults(ignore_case=True)
    rebulk.defaults(name="website")

    # Read the packaged TLD list inside a ``with`` block so the underlying
    # stream is closed deterministically instead of being left to the GC.
    with resource_stream('guessit', 'tlds-alpha-by-domain.txt') as tld_file:
        # Lines containing ``--`` are skipped, and the first remaining line is
        # dropped (presumably the file's header comment -- confirm against the
        # bundled data file).
        tlds = [
            tld.strip().decode('utf-8')
            for tld in tld_file.readlines()
            if b'--' not in tld
        ][1:]  # All registered domain extension

    safe_tlds = config['safe_tlds']  # For sure a website extension
    safe_subdomains = config['safe_subdomains']  # For sure a website subdomain
    safe_prefix = config['safe_prefixes']  # Those words before a tlds are sure
    website_prefixes = config['prefixes']

    # <safe subdomain>. ... .<any registered tld>
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)+(?:[a-z-]+\.)+(?:'+build_or_pattern(tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 children=True)
    # [<safe subdomain>.]<name>.<safe tld>
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, safe_tlds=safe_tlds, children=True)
    # [<safe subdomain>.]<name>.<safe prefix>.<any registered tld>
    rebulk.regex(r'(?:[^a-z0-9]|^)((?:'+build_or_pattern(safe_subdomains) +
                 r'\.)*[a-z-]+\.(?:'+build_or_pattern(safe_prefix) +
                 r'\.)+(?:'+build_or_pattern(tlds) +
                 r'))(?:[^a-z0-9]|$)',
                 safe_subdomains=safe_subdomains, safe_prefix=safe_prefix, tlds=tlds, children=True)

    # Private matches tagged 'website.prefix'; ValidateWebsitePrefix later
    # removes those that are not directly followed by a website match.
    rebulk.string(*website_prefixes,
                  validator=seps_surround, private=True, tags=['website.prefix'])

    class PreferTitleOverWebsite(Rule):
        """
        If found match is more likely a title, remove website.
        """
        consequence = RemoveMatch

        @staticmethod
        def valid_followers(match):
            """
            Validator for next website matches

            :param match: candidate match following a website match
            :return: True if any of the match names is season, episode or year
            :rtype: bool
            """
            return any(name in ['season', 'episode', 'year'] for name in match.names)

        def when(self, matches, context):
            """
            Collect website matches that should be dropped.

            A website match is kept when its lowercased value starts with one
            of the configured safe subdomains or safe prefixes. Otherwise it
            is removed if the next match accepted by ``valid_followers``
            exists.

            :param matches: current matches
            :param context: rule context (unused)
            :return: website matches to remove
            :rtype: list
            """
            # Loop-invariant: build the list of safe starts once per call.
            safe_starts = safe_subdomains + safe_prefix
            to_remove = []
            for website_match in matches.named('website'):
                value = website_match.value.lower()
                if any(value.startswith(safe_start) for safe_start in safe_starts):
                    continue
                suffix = matches.next(website_match, PreferTitleOverWebsite.valid_followers, 0)
                if suffix:
                    to_remove.append(website_match)
            return to_remove

    rebulk.rules(PreferTitleOverWebsite, ValidateWebsitePrefix)

    return rebulk
|
||
|
|
||
|
|
||
|
class ValidateWebsitePrefix(Rule):
    """
    Validate website prefixes

    A match tagged ``website.prefix`` is kept only when a ``website`` match
    follows it and the span between the two yields no non-empty hole.
    """
    priority = 64
    consequence = RemoveMatch

    def when(self, matches, context):
        """
        Collect the prefix matches that must be removed.

        :param matches: current matches
        :param context: rule context (unused)
        :return: prefix matches to remove
        :rtype: list
        """
        rejected = []
        for candidate in matches.tagged('website.prefix'):
            following = matches.next(candidate, predicate=lambda match: match.name == 'website', index=0)
            if following:
                # Something with a value sitting between the prefix and the
                # website means the prefix does not really belong to it.
                gap = matches.holes(candidate.end, following.start,
                                    formatter=cleanup, seps=seps, predicate=lambda match: match.value)
                if not gap:
                    continue
            rejected.append(candidate)
        return rejected
|