Fix for subtitle files without a language code that aren't text based.
This commit is contained in:
parent 8b10b0e5e4
commit 304a5ff1b8

4 changed files with 195 additions and 42 deletions
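In short, the commit vendors the binaryornot library under libs/ and uses its is_binary() check in guess_external_subtitles() so that subtitle files that aren't text based (for example image-based VobSub .sub files) are skipped before any content analysis. A minimal sketch of the idea, assuming langdetect for the detection step; the helper name and error handling below are illustrative, not Bazarr's actual code:

    # Sketch only: guard content-based language detection with a binary check.
    import os

    from binaryornot.check import is_binary   # vendored in libs/ by this commit
    from langdetect import detect


    def guess_subtitle_language(subtitle_path):
        # Hypothetical helper, not part of Bazarr; returns a language code or None.
        if not os.path.exists(subtitle_path):
            return None
        # Image-based subtitles (e.g. VobSub .sub) are binary; feeding them to
        # langdetect would fail or return garbage, so skip them up front.
        if is_binary(subtitle_path):
            return None
        with open(subtitle_path, 'rb') as f:
            raw = f.read()
        try:
            return detect(raw.decode('utf-8', errors='replace'))
        except Exception:
            return None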
@@ -15,6 +15,7 @@ from subliminal_patch import search_external_subtitles
 from subzero.language import Language
 from bs4 import UnicodeDammit
 import six
+from binaryornot.check import is_binary
 
 from get_args import args
 from database import database
@@ -86,27 +87,6 @@ def store_subtitles(original_path, reversed_path):
                     logging.debug("BAZARR external subtitles detected: " + str(language))
                     actual_subtitles.append(
                         [str(language), path_replace_reverse(subtitle_path)])
-                else:
-                    if os.path.splitext(subtitle)[1] != ".sub":
-                        logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(reversed_path), subtitle), 'r') as f:
-                            text = f.read()
-                        try:
-                            encoding = UnicodeDammit(text)
-                            if six.PY2:
-                                text = text.decode(encoding.original_encoding)
-                            detected_language = langdetect.detect(text)
-                        except Exception as e:
-                            logging.exception(
-                                'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(reversed_path), subtitle) +
-                                ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
-                        else:
-                            if len(detected_language) > 0:
-                                logging.debug(
-                                    "BAZARR external subtitles detected and analysis guessed this language: " + str(
-                                        detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse(subtitle_path)])
 
         database.execute("UPDATE table_episodes SET subtitles=? WHERE path=?",
                          (str(actual_subtitles), original_path))
@@ -178,27 +158,6 @@ def store_subtitles_movie(original_path, reversed_path):
                 elif str(language) != 'und':
                     logging.debug("BAZARR external subtitles detected: " + str(language))
                     actual_subtitles.append([str(language), path_replace_reverse_movie(subtitle_path)])
-                else:
-                    if os.path.splitext(subtitle)[1] != ".sub":
-                        logging.debug("BAZARR falling back to file content analysis to detect language.")
-                        with open(os.path.join(os.path.dirname(reversed_path), dest_folder, subtitle), 'r') as f:
-                            text = f.read()
-                        try:
-                            encoding = UnicodeDammit(text)
-                            if six.PY2:
-                                text = text.decode(encoding.original_encoding)
-                            detected_language = langdetect.detect(text)
-                        except Exception as e:
-                            logging.exception(
-                                'BAZARR Error trying to detect language for this subtitles file: ' +
-                                os.path.join(os.path.dirname(reversed_path), subtitle) +
-                                ' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
-                        else:
-                            if len(detected_language) > 0:
-                                logging.debug(
-                                    "BAZARR external subtitles detected and analysis guessed this language: " +
-                                    str(detected_language))
-                                actual_subtitles.append([str(detected_language), path_replace_reverse_movie(subtitle_path)])
 
     database.execute("UPDATE table_movies SET subtitles=? WHERE path=?",
                      (str(actual_subtitles), original_path))
@@ -400,6 +359,10 @@ def guess_external_subtitles(dest_folder, subtitles):
         subtitle_path = os.path.join(dest_folder, subtitle)
         if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
             logging.debug("BAZARR falling back to file content analysis to detect language.")
+            if is_binary(subtitle_path):
+                logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
+                              subtitle_path)
+                continue
             detected_language = None
             with open(subtitle_path, 'r') as f:
                 text = f.read()
libs/binaryornot/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+__author__ = 'Audrey Roy'
+__email__ = 'audreyr@gmail.com'
+__version__ = '0.4.4'
libs/binaryornot/check.py (new file, 52 lines)
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+
+"""
+binaryornot.check
+-----------------
+
+Main code for checking if a file is binary or text.
+"""
+
+import logging
+import argparse
+
+from binaryornot.helpers import get_starting_chunk, is_binary_string
+
+
+logger = logging.getLogger(__name__)
+
+
+def is_binary(filename):
+    """
+    :param filename: File to check.
+    :returns: True if it's a binary file, otherwise False.
+    """
+    logger.debug('is_binary: %(filename)r', locals())
+
+    # Check if the file extension is in a list of known binary types
+    # binary_extensions = ['.pyc', ]
+    # for ext in binary_extensions:
+    #     if filename.endswith(ext):
+    #         return True
+
+    # Check if the starting chunk is a binary string
+    chunk = get_starting_chunk(filename)
+    return is_binary_string(chunk)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Check if a "
+                                                 "file passed as argument is "
+                                                 "binary or not")
+
+    parser.add_argument("filename", help="File name to check for. If "
+                                         "the file is not in the same folder, "
+                                         "include full path")
+
+    args = parser.parse_args()
+
+    print(is_binary(**vars(args)))
+
+
+if __name__ == "__main__":
+    main()
libs/binaryornot/helpers.py (new file, 135 lines)
@@ -0,0 +1,135 @@
+# -*- coding: utf-8 -*-
+
+
+"""
+binaryornot.helpers
+-------------------
+
+Helper utilities used by BinaryOrNot.
+"""
+
+import chardet
+import logging
+
+
+logger = logging.getLogger(__name__)
+
+
+def print_as_hex(s):
+    """
+    Print a string as hex bytes.
+    """
+    print(":".join("{0:x}".format(ord(c)) for c in s))
+
+
+def get_starting_chunk(filename, length=1024):
+    """
+    :param filename: File to open and get the first little chunk of.
+    :param length: Number of bytes to read, default 1024.
+    :returns: Starting chunk of bytes.
+    """
+    # Ensure we open the file in binary mode
+    try:
+        with open(filename, 'rb') as f:
+            chunk = f.read(length)
+            return chunk
+    except IOError as e:
+        print(e)
+
+
+_control_chars = b'\n\r\t\f\b'
+if bytes is str:
+    # Python 2 means we need to invoke chr() explicitly
+    _printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))
+    _printable_high_ascii = b''.join(map(chr, range(127, 256)))
+else:
+    # Python 3 means bytes accepts integer input directly
+    _printable_ascii = _control_chars + bytes(range(32, 127))
+    _printable_high_ascii = bytes(range(127, 256))
+
+
+def is_binary_string(bytes_to_check):
+    """
+    Uses a simplified version of the Perl detection algorithm,
+    based roughly on Eli Bendersky's translation to Python:
+    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/
+
+    This is biased slightly more in favour of deeming files as text
+    files than the Perl algorithm, since all ASCII compatible character
+    sets are accepted as text, not just utf-8.
+
+    :param bytes: A chunk of bytes to check.
+    :returns: True if appears to be a binary, otherwise False.
+    """
+
+    # Empty files are considered text files
+    if not bytes_to_check:
+        return False
+
+    # Now check for a high percentage of ASCII control characters
+    # Binary if control chars are > 30% of the string
+    low_chars = bytes_to_check.translate(None, _printable_ascii)
+    nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio1: %(nontext_ratio1)r', locals())
+
+    # and check for a low percentage of high ASCII characters:
+    # Binary if high ASCII chars are < 5% of the string
+    # From: https://en.wikipedia.org/wiki/UTF-8
+    # If the bytes are random, the chances of a byte with the high bit set
+    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
+    # of these without finding an invalid sequence is actually lower than the
+    # chance of the first three bytes randomly being the UTF-8 BOM.
+
+    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
+    nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
+    logger.debug('nontext_ratio2: %(nontext_ratio2)r', locals())
+
+    if nontext_ratio1 > 0.90 and nontext_ratio2 > 0.90:
+        return True
+
+    is_likely_binary = (
+        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
+        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
+    )
+    logger.debug('is_likely_binary: %(is_likely_binary)r', locals())
+
+    # then check for binary for possible encoding detection with chardet
+    detected_encoding = chardet.detect(bytes_to_check)
+    logger.debug('detected_encoding: %(detected_encoding)r', locals())
+
+    # finally use all the check to decide binary or text
+    decodable_as_unicode = False
+    if (detected_encoding['confidence'] > 0.9 and
+            detected_encoding['encoding'] != 'ascii'):
+        try:
+            try:
+                bytes_to_check.decode(encoding=detected_encoding['encoding'])
+            except TypeError:
+                # happens only on Python 2.6
+                unicode(bytes_to_check, encoding=detected_encoding['encoding'])  # noqa
+            decodable_as_unicode = True
+            logger.debug('success: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+        except LookupError:
+            logger.debug('failure: could not look up encoding %(encoding)s',
+                         detected_encoding)
+        except UnicodeDecodeError:
+            logger.debug('failure: decodable_as_unicode: '
+                         '%(decodable_as_unicode)r', locals())
+
+    logger.debug('failure: decodable_as_unicode: '
+                 '%(decodable_as_unicode)r', locals())
+    if is_likely_binary:
+        if decodable_as_unicode:
+            return False
+        else:
+            return True
+    else:
+        if decodable_as_unicode:
+            return False
+        else:
+            if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
+                # Check for NULL bytes last
+                logger.debug('has nulls:' + repr(b'\x00' in bytes_to_check))
+                return True
+            return False
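For reference, the vendored package's public surface is small. A quick usage sketch, with example file paths that are purely illustrative:

    from binaryornot.check import is_binary
    from binaryornot.helpers import get_starting_chunk, is_binary_string

    # High-level check on a path: reads the first 1024 bytes and runs the heuristics.
    print(is_binary('/tmp/example.en.srt'))         # expected False for a text subtitle

    # Lower-level: run the heuristic on bytes you already have.
    chunk = get_starting_chunk('/tmp/example.sub')  # returns None if the file can't be read
    if chunk is not None:
        print(is_binary_string(chunk))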