This commit is contained in:
Louis Vézina 2019-09-20 17:56:33 -04:00
parent e7cb2a71e2
commit a7b40eaf79
17 changed files with 75 additions and 99 deletions

View File

@ -202,7 +202,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
directory=fld,
chmod=chmod,
# formats=("srt", "vtt")
path_decoder=force_unicode
path_decoder=None
)
except Exception as e:
logging.exception('BAZARR Error saving subtitles file to disk for this file:' + path)
@ -419,7 +419,6 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
if not subtitle.is_valid():
logging.exception('BAZARR No valid subtitles file found for this file: ' + path)
return
logging.debug('BAZARR Subtitles file downloaded for this file:' + path)
try:
score = round(subtitle.score / max_score * 100, 2)
fld = get_target_folder(path)

View File

@ -17,7 +17,7 @@ class Color(_Color):
return _Color.__new__(cls, r, g, b, a)
#: Version of the pysubs2 library.
VERSION = "0.2.3"
VERSION = "0.2.4"
PY3 = sys.version_info.major == 3

View File

@ -4,6 +4,7 @@ from .subrip import SubripFormat
from .jsonformat import JSONFormat
from .substation import SubstationFormat
from .mpl2 import MPL2Format
from .tmp import TmpFormat
from .exceptions import *
#: Dict mapping file extensions to format identifiers.
@ -13,6 +14,7 @@ FILE_EXTENSION_TO_FORMAT_IDENTIFIER = {
".ssa": "ssa",
".sub": "microdvd",
".json": "json",
".txt": "tmp",
}
#: Dict mapping format identifiers to implementations (FormatBase subclasses).
@ -23,6 +25,7 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
"microdvd": MicroDVDFormat,
"json": JSONFormat,
"mpl2": MPL2Format,
"tmp": TmpFormat,
}
def get_format_class(format_):

View File

@ -66,7 +66,14 @@ class SSAFile(MutableSequence):
be detected from the file, in which case you don't need
to specify it here (when given, this argument overrides
autodetection).
kwargs: Extra options for the parser.
keep_unknown_html_tags (bool): This affects SubRip only (SRT),
for other formats this argument is ignored.
By default, HTML tags are converted to equivalent SubStation tags
(e.g. ``<i>`` to ``{\\i1}``) and any remaining tags are removed
to keep the text clean. Set this parameter to ``True``
if you want to pass through these tags (eg. ``<sub>``).
This is useful if your output format is SRT and your player
supports these tags.
Returns:
SSAFile
@ -86,6 +93,7 @@ class SSAFile(MutableSequence):
Example:
>>> subs1 = pysubs2.load("subrip-subtitles.srt")
>>> subs2 = pysubs2.load("microdvd-subtitles.sub", fps=23.976)
>>> subs3 = pysubs2.load("subrip-subtitles-with-fancy-tags.srt", keep_unknown_html_tags=True)
"""
with open(path, encoding=encoding) as fp:

View File

@ -56,7 +56,7 @@ class SSAStyle(object):
self.encoding = 1 #: Charset
for k, v in fields.items():
if k in self.FIELDS and v is not None:
if k in self.FIELDS:
setattr(self, k, v)
else:
raise ValueError("SSAStyle has no field named %r" % k)

View File

@ -31,7 +31,7 @@ class SubripFormat(FormatBase):
return "srt"
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
timestamps = [] # (start, end)
following_lines = [] # contains lists of lines following each timestamp
@ -56,15 +56,15 @@ class SubripFormat(FormatBase):
# Handle the general case.
s = "".join(lines).strip()
s = re.sub(r"\n+ *\d+ *$", "", s) # strip number of next subtitle
s = re.sub(r"< *i *>", r"{\i1}", s)
s = re.sub(r"< */ *i *>", r"{\i0}", s)
s = re.sub(r"< *s *>", r"{\s1}", s)
s = re.sub(r"< */ *s *>", r"{\s0}", s)
s = re.sub(r"< *u *>", "{\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
s = re.sub(r"< */ *u *>", "{\\u0}", s)
s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
s = re.sub(r"\r", "", s) # convert newlines
s = re.sub(r"\n", r"\N", s) # convert newlines
s = re.sub(r"< *i *>", r"{\\i1}", s)
s = re.sub(r"< */ *i *>", r"{\\i0}", s)
s = re.sub(r"< *s *>", r"{\\s1}", s)
s = re.sub(r"< */ *s *>", r"{\\s0}", s)
s = re.sub(r"< *u *>", "{\\\\u1}", s) # not r" for Python 2.7 compat, triggers unicodeescape
s = re.sub(r"< */ *u *>", "{\\\\u0}", s)
if not keep_unknown_html_tags:
s = re.sub(r"< */? *[a-zA-Z][^>]*>", "", s) # strip other HTML tags
s = re.sub(r"\n", r"\\N", s) # convert newlines
return s
subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))

View File

@ -145,7 +145,12 @@ class SubstationFormat(FormatBase):
def string_to_field(f, v):
if f in {"start", "end"}:
return timestamp_to_ms(TIMESTAMP.match(v).groups())
if v.startswith("-"):
# handle negative timestamps
v = v[1:]
return -timestamp_to_ms(TIMESTAMP.match(v).groups())
else:
return timestamp_to_ms(TIMESTAMP.match(v).groups())
elif "color" in f:
if format_ == "ass":
return ass_rgba_to_color(v)
@ -184,22 +189,22 @@ class SubstationFormat(FormatBase):
elif inside_info_section or inside_aegisub_section:
if line.startswith(";"): continue # skip comments
try:
k, v = line.split(": ", 1)
k, v = line.split(":", 1)
if inside_info_section:
subs.info[k] = v
subs.info[k] = v.strip()
elif inside_aegisub_section:
subs.aegisub_project[k] = v
subs.aegisub_project[k] = v.strip()
except ValueError:
pass
elif line.startswith("Style:"):
_, rest = line.split(": ", 1)
_, rest = line.split(":", 1)
buf = rest.strip().split(",")
name, raw_fields = buf[0], buf[1:] # splat workaround for Python 2.7
field_dict = {f: string_to_field(f, v) for f, v in zip(STYLE_FIELDS[format_], raw_fields)}
sty = SSAStyle(**field_dict)
subs.styles[name] = sty
elif line.startswith("Dialogue:") or line.startswith("Comment:"):
ev_type, rest = line.split(": ", 1)
ev_type, rest = line.split(":", 1)
raw_fields = rest.strip().split(",", len(EVENT_FIELDS[format_])-1)
field_dict = {f: string_to_field(f, v) for f, v in zip(EVENT_FIELDS[format_], raw_fields)}
field_dict["type"] = ev_type

View File

@ -49,6 +49,20 @@ def timestamp_to_ms(groups):
ms += h * 3600000
return ms
def tmptimestamp_to_ms(groups):
    """
    Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.

    Args:
        groups: Iterable of three values ``(hours, minutes, seconds)``;
            each is passed through ``int()``, so string regex groups are
            accepted. TMP timestamps carry no fractional seconds.

    Returns:
        int: The timestamp expressed in milliseconds.

    Example:
        >>> tmptimestamp_to_ms(TMPTIMESTAMP.match("0:00:01").groups())
        1000
    """
    h, m, s = map(int, groups)
    ms = s * 1000
    ms += m * 60000
    ms += h * 3600000
    return ms
def times_to_ms(h=0, m=0, s=0, ms=0):
"""
Convert hours, minutes, seconds to milliseconds.

View File

@ -1,45 +0,0 @@
# coding=utf-8
from __future__ import print_function, division, unicode_literals
import re
from numbers import Number
from pysubs2.time import times_to_ms
from .formatbase import FormatBase
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
# thanks to http://otsaloma.io/gaupol/doc/api/aeidon.files.mpl2_source.html
# One MPL2 subtitle per line: "[start][end]text". Groups 1 and 2 are the
# start/end times in tenths of a second (possibly negative); group 3 is the
# raw subtitle text. (?um) = Unicode + multiline matching.
MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*?)$")
class TXTGenericFormat(FormatBase):
    """Dispatcher for the generic ``.txt`` extension, which can hold several formats."""

    @classmethod
    def guess_format(cls, text):
        """Return the format identifier detected in ``text``, or ``None`` if unrecognized."""
        return "mpl2" if MPL2_FORMAT.match(text) else None
class MPL2Format(FormatBase):
    """Reader for the MPL2 subtitle format (``[start][end]text``, times in deciseconds)."""

    @classmethod
    def guess_format(cls, text):
        """Delegate detection to :class:`TXTGenericFormat` (both use the ``.txt`` extension)."""
        return TXTGenericFormat.guess_format(text)

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
        """Parse MPL2 subtitles from ``fp`` into ``subs.events``.

        Timestamps are in tenths of a second. Within one event, ``|``
        separates lines and a leading ``/`` marks an italic line.
        """
        def prepare_text(lines):
            out = []
            for s in lines.split("|"):
                if s.startswith("/"):
                    # A leading slash italicizes the whole line.
                    out.append(r"{\i1}%s{\i0}" % s[1:])
                    continue
                out.append(s)
            return "\n".join(out)

        subs.events = [SSAEvent(start=times_to_ms(s=float(start) / 10),
                                end=times_to_ms(s=float(end) / 10),
                                text=prepare_text(text))
                       for start, end, text in MPL2_FORMAT.findall(fp.getvalue())]

    @classmethod
    def to_file(cls, subs, fp, format_, **kwargs):
        """Writing MPL2 is not supported.

        Raises:
            NotImplementedError: Always. (The original ``raise NotImplemented``
            raised the non-exception ``NotImplemented`` constant, which in
            Python 3 surfaces as a confusing ``TypeError`` instead.)
        """
        raise NotImplementedError("writing MPL2 subtitles is not implemented")

View File

@ -854,8 +854,8 @@ def save_subtitles(file_path, subtitles, single=False, directory=None, chmod=Non
logger.debug(u"Saving %r to %r", subtitle, subtitle_path)
content = subtitle.get_modified_content(format=format, debug=debug_mods)
if content:
with open(subtitle_path, 'w') as f:
f.write(content.decode('utf-8'))
with open(subtitle_path, 'wb') as f:
f.write(content)
subtitle.storage_path = subtitle_path
else:
logger.error(u"Something went wrong when getting modified subtitle for %s", subtitle)

View File

@ -148,7 +148,7 @@ class CFSession(CloudScraper):
cache_key = "cf_data3_%s" % domain
if not self.cookies.get("cf_clearance", "", domain=domain):
cf_data = region.get(cache_key)
cf_data = str(region.get(cache_key))
if cf_data is not NO_VALUE:
cf_cookies, hdrs = cf_data
logger.debug("Trying to use old cf data for %s: %s", domain, cf_data)
@ -165,9 +165,9 @@ class CFSession(CloudScraper):
pass
else:
if cf_data and "cf_clearance" in cf_data[0] and cf_data[0]["cf_clearance"]:
if cf_data != region.get(cache_key):
if cf_data != str(region.get(cache_key)):
logger.debug("Storing cf data for %s: %s", domain, cf_data)
region.set(cache_key, cf_data)
region.set(cache_key, bytearray(cf_data, encoding='utf-8'))
elif cf_data[0]["cf_clearance"]:
logger.debug("CF Live tokens not updated")

View File

@ -257,4 +257,4 @@ def load_verification(site_name, session, callback=lambda x: None):
def store_verification(site_name, session):
region.set("%s_data" % site_name, session.cookies._cookies, session.headers["User-Agent"])
region.set("%s_data" % site_name, (session.cookies._cookies, session.headers["User-Agent"]))

View File

@ -104,11 +104,11 @@ class Addic7edProvider(_Addic7edProvider):
tries = 0
while tries < 3:
r = self.session.get(self.server_url + 'login.php', timeout=10, headers={"Referer": self.server_url})
if "grecaptcha" in r.content:
if "grecaptcha" in r.text:
logger.info('Addic7ed: Solving captcha. This might take a couple of minutes, but should only '
'happen once every so often')
site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.content).group(1)
site_key = re.search(r'grecaptcha.execute\(\'(.+?)\',', r.text).group(1)
if not site_key:
logger.error("Addic7ed: Captcha site-key not found!")
return
@ -127,11 +127,11 @@ class Addic7edProvider(_Addic7edProvider):
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10,
headers={"Referer": self.server_url + "login.php"})
if "relax, slow down" in r.content:
if "relax, slow down" in r.text:
raise TooManyRequests(self.username)
if r.status_code != 302:
if "User <b></b> doesn't exist" in r.content and tries <= 2:
if "User <b></b> doesn't exist" in r.text and tries <= 2:
logger.info("Addic7ed: Error, trying again. (%s/%s)", tries+1, 3)
tries += 1
continue
@ -208,8 +208,8 @@ class Addic7edProvider(_Addic7edProvider):
if show_cells:
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
else:
# If RegEx fails, fall back to original r.content and use 'html.parser'
soup = ParserBeautifulSoup(r.content, ['html.parser'])
# If RegEx fails, fall back to original r.text and use 'html.parser'
soup = ParserBeautifulSoup(r.text, ['html.parser'])
# populate the show ids
show_ids = {}
@ -265,7 +265,7 @@ class Addic7edProvider(_Addic7edProvider):
r = self.session.get(self.server_url + endpoint, params=params, timeout=10, headers=headers)
r.raise_for_status()
if r.content and "Sorry, your search" not in r.content:
if r.text and "Sorry, your search" not in r.text:
break
time.sleep(4)
@ -273,7 +273,7 @@ class Addic7edProvider(_Addic7edProvider):
if r.status_code == 304:
raise TooManyRequests()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])
suggestion = None
@ -315,13 +315,13 @@ class Addic7edProvider(_Addic7edProvider):
if r.status_code == 304:
raise TooManyRequests()
if not r.content:
if not r.text:
# Provider wrongful return a status of 304 Not Modified with an empty content
# raise_for_status won't raise exception for that status code
logger.error('No data returned from provider')
return []
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
soup = ParserBeautifulSoup(r.text, ['lxml', 'html.parser'])
# loop over subtitle rows
subtitles = []
@ -364,7 +364,7 @@ class Addic7edProvider(_Addic7edProvider):
if r.status_code == 304:
raise TooManyRequests()
if not r.content:
if not r.text:
# Provider wrongful return a status of 304 Not Modified with an empty content
# raise_for_status won't raise exception for that status code
logger.error('Unable to download subtitle. No data returned from provider')

View File

@ -116,7 +116,7 @@ class HosszupuskaSubtitle(Subtitle):
if video.format and self.version and video.format.lower() in self.version.lower():
matches.add('format')
# other properties
matches |= guess_matches(video, guessit(self.release_info.encode("utf-8")))
matches |= guess_matches(video, guessit(self.release_info))
return matches

View File

@ -199,7 +199,7 @@ class LegendasTVProvider(_LegendasTVProvider):
# attempt to get the releases from the cache
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
releases = region.get(cache_key, expiration_time=expiration_time)
releases = str(region.get(cache_key, expiration_time=expiration_time))
# the releases are not in cache or cache is expired
if releases == NO_VALUE:
@ -226,7 +226,7 @@ class LegendasTVProvider(_LegendasTVProvider):
releases.append(name)
# cache the releases
region.set(cache_key, releases)
region.set(cache_key, bytearray(releases, encoding='utf-8'))
# iterate over releases
for r in releases:

View File

@ -158,13 +158,5 @@ class ProviderSubtitleArchiveMixin(object):
elif subs_fallback:
matching_sub = subs_fallback[0]
try:
matching_sub_unicode = matching_sub.decode("utf-8")
except UnicodeDecodeError:
try:
matching_sub_unicode = matching_sub.decode("cp437")
except UnicodeDecodeError:
matching_sub_unicode = matching_sub.decode("utf-8", errors='replace')
logger.info(u"Using %s from the archive", matching_sub_unicode)
logger.info(u"Using %s from the archive", matching_sub)
return fix_line_ending(archive.read(matching_sub))

View File

@ -141,7 +141,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
logger.info("Creating session")
self.session = RetryingCFSession()
prev_cookies = region.get("subscene_cookies2")
prev_cookies = str(region.get("subscene_cookies2"))
if prev_cookies != NO_VALUE:
logger.debug("Re-using old subscene cookies: %r", prev_cookies)
self.session.cookies.update(prev_cookies)
@ -194,7 +194,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
del cj[cn]
logger.debug("Storing cookies: %r", cj)
region.set("subscene_cookies2", cj)
region.set("subscene_cookies2", bytearray(cj, encoding='utf-8'))
return
raise ProviderError("Something went wrong when trying to log in #1")
@ -219,9 +219,9 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
acc_filters["SelectedIds"] = selected_ids
self.filters["LanguageFilter"] = ",".join(acc_filters["SelectedIds"])
last_filters = region.get("subscene_filters")
last_filters = str(region.get("subscene_filters"))
if last_filters != acc_filters:
region.set("subscene_filters", acc_filters)
region.set("subscene_filters", bytearray(acc_filters, encoding='utf-8'))
logger.debug("Setting account filters to %r", acc_filters)
self.session.post("https://u.subscene.com/filter", acc_filters, allow_redirects=False)