Fix for subtitle files without a language code that aren't text-based.

This commit is contained in:
Louis Vézina 2019-11-27 21:48:39 -05:00
parent 8b10b0e5e4
commit 304a5ff1b8
4 changed files with 195 additions and 42 deletions

View File

@ -15,6 +15,7 @@ from subliminal_patch import search_external_subtitles
from subzero.language import Language
from bs4 import UnicodeDammit
import six
from binaryornot.check import is_binary
from get_args import args
from database import database
@ -86,27 +87,6 @@ def store_subtitles(original_path, reversed_path):
logging.debug("BAZARR external subtitles detected: " + str(language))
actual_subtitles.append(
[str(language), path_replace_reverse(subtitle_path)])
else:
if os.path.splitext(subtitle)[1] != ".sub":
logging.debug("BAZARR falling back to file content analysis to detect language.")
with open(os.path.join(os.path.dirname(reversed_path), subtitle), 'r') as f:
text = f.read()
try:
encoding = UnicodeDammit(text)
if six.PY2:
text = text.decode(encoding.original_encoding)
detected_language = langdetect.detect(text)
except Exception as e:
logging.exception(
'BAZARR Error trying to detect language for this subtitles file: ' +
os.path.join(os.path.dirname(reversed_path), subtitle) +
' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
else:
if len(detected_language) > 0:
logging.debug(
"BAZARR external subtitles detected and analysis guessed this language: " + str(
detected_language))
actual_subtitles.append([str(detected_language), path_replace_reverse(subtitle_path)])
database.execute("UPDATE table_episodes SET subtitles=? WHERE path=?",
(str(actual_subtitles), original_path))
@ -178,27 +158,6 @@ def store_subtitles_movie(original_path, reversed_path):
elif str(language) != 'und':
logging.debug("BAZARR external subtitles detected: " + str(language))
actual_subtitles.append([str(language), path_replace_reverse_movie(subtitle_path)])
else:
if os.path.splitext(subtitle)[1] != ".sub":
logging.debug("BAZARR falling back to file content analysis to detect language.")
with open(os.path.join(os.path.dirname(reversed_path), dest_folder, subtitle), 'r') as f:
text = f.read()
try:
encoding = UnicodeDammit(text)
if six.PY2:
text = text.decode(encoding.original_encoding)
detected_language = langdetect.detect(text)
except Exception as e:
logging.exception(
'BAZARR Error trying to detect language for this subtitles file: ' +
os.path.join(os.path.dirname(reversed_path), subtitle) +
' You should try to delete this subtitles file manually and ask Bazarr to download it again.')
else:
if len(detected_language) > 0:
logging.debug(
"BAZARR external subtitles detected and analysis guessed this language: " +
str(detected_language))
actual_subtitles.append([str(detected_language), path_replace_reverse_movie(subtitle_path)])
database.execute("UPDATE table_movies SET subtitles=? WHERE path=?",
(str(actual_subtitles), original_path))
@ -400,6 +359,10 @@ def guess_external_subtitles(dest_folder, subtitles):
subtitle_path = os.path.join(dest_folder, subtitle)
if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
logging.debug("BAZARR falling back to file content analysis to detect language.")
if is_binary(subtitle_path):
logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
subtitle_path)
continue
detected_language = None
with open(subtitle_path, 'r') as f:
text = f.read()

View File

@ -0,0 +1,3 @@
# Package metadata for the vendored BinaryOrNot library (libs/binaryornot).
__author__ = 'Audrey Roy'
__email__ = 'audreyr@gmail.com'
__version__ = '0.4.4'

52
libs/binaryornot/check.py Normal file
View File

@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
"""
binaryornot.check
-----------------
Main code for checking if a file is binary or text.
"""
import logging
import argparse
from binaryornot.helpers import get_starting_chunk, is_binary_string
logger = logging.getLogger(__name__)
def is_binary(filename):
    """Check whether a file is binary rather than text.

    :param filename: Path of the file to check.
    :returns: True if it's a binary file, otherwise False.
    """
    logger.debug('is_binary: %(filename)r', locals())

    # Classify by sniffing the first chunk of file content instead of
    # trusting the file extension.  (A commented-out extension whitelist
    # was removed here as dead code.)
    chunk = get_starting_chunk(filename)
    return is_binary_string(chunk)
def main():
    """Command-line entry point: print whether the given file is binary."""
    parser = argparse.ArgumentParser(description="Check if a "
                                                 "file passed as argument is "
                                                 "binary or not")
    parser.add_argument("filename",
                        help="File name to check for. If "
                             "the file is not in the same folder, "
                             "include full path")
    parsed = parser.parse_args()
    # Single positional argument, so pass it through explicitly.
    print(is_binary(parsed.filename))

135
libs/binaryornot/helpers.py Normal file
View File

@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
"""
binaryornot.helpers
-------------------
Helper utilities used by BinaryOrNot.
"""
import chardet
import logging
logger = logging.getLogger(__name__)
def print_as_hex(s):
    """Print every character of *s* as a colon-separated hex byte value."""
    hex_codes = ["{0:x}".format(ord(ch)) for ch in s]
    print(":".join(hex_codes))
def get_starting_chunk(filename, length=1024):
    """Return the first *length* bytes of *filename*.

    :param filename: File to open and get the first little chunk of.
    :param length: Number of bytes to read, default 1024.
    :returns: Starting chunk of bytes, or None when the file cannot be read.
    """
    try:
        # Binary mode on purpose: classification works on raw bytes.
        with open(filename, 'rb') as stream:
            return stream.read(length)
    except IOError as err:
        # Best-effort, matching the original: report the error and fall
        # through to an implicit None (treated as falsy/"text" downstream).
        print(err)
_control_chars = b'\n\r\t\f\b'
if bytes is str:
# Python 2 means we need to invoke chr() explicitly
_printable_ascii = _control_chars + b''.join(map(chr, range(32, 127)))
_printable_high_ascii = b''.join(map(chr, range(127, 256)))
else:
# Python 3 means bytes accepts integer input directly
_printable_ascii = _control_chars + bytes(range(32, 127))
_printable_high_ascii = bytes(range(127, 256))
def is_binary_string(bytes_to_check):
    """
    Uses a simplified version of the Perl detection algorithm,
    based roughly on Eli Bendersky's translation to Python:
    http://eli.thegreenplace.net/2011/10/19/perls-guess-if-file-is-text-or-binary-implemented-in-python/

    This is biased slightly more in favour of deeming files as text
    files than the Perl algorithm, since all ASCII compatible character
    sets are accepted as text, not just utf-8.

    :param bytes_to_check: A chunk of bytes to check.
    :returns: True if appears to be a binary, otherwise False.
    """
    # Empty files are considered text files
    if not bytes_to_check:
        return False

    # Now check for a high percentage of ASCII control characters
    # Binary if control chars are > 30% of the string
    low_chars = bytes_to_check.translate(None, _printable_ascii)
    nontext_ratio1 = float(len(low_chars)) / float(len(bytes_to_check))
    logger.debug('nontext_ratio1: %(nontext_ratio1)r', locals())

    # and check for a low percentage of high ASCII characters:
    # Binary if high ASCII chars are < 5% of the string
    # From: https://en.wikipedia.org/wiki/UTF-8
    # If the bytes are random, the chances of a byte with the high bit set
    # starting a valid UTF-8 character is only 6.64%. The chances of finding 7
    # of these without finding an invalid sequence is actually lower than the
    # chance of the first three bytes randomly being the UTF-8 BOM.
    high_chars = bytes_to_check.translate(None, _printable_high_ascii)
    nontext_ratio2 = float(len(high_chars)) / float(len(bytes_to_check))
    logger.debug('nontext_ratio2: %(nontext_ratio2)r', locals())

    # Short-circuit: almost no printable bytes at all -> clearly binary.
    if nontext_ratio1 > 0.90 and nontext_ratio2 > 0.90:
        return True

    is_likely_binary = (
        (nontext_ratio1 > 0.3 and nontext_ratio2 < 0.05) or
        (nontext_ratio1 > 0.8 and nontext_ratio2 > 0.8)
    )
    logger.debug('is_likely_binary: %(is_likely_binary)r', locals())

    # then check for binary for possible encoding detection with chardet
    detected_encoding = chardet.detect(bytes_to_check)
    logger.debug('detected_encoding: %(detected_encoding)r', locals())

    # finally use all the check to decide binary or text
    decodable_as_unicode = False
    if (detected_encoding['confidence'] > 0.9 and
            detected_encoding['encoding'] != 'ascii'):
        try:
            try:
                bytes_to_check.decode(encoding=detected_encoding['encoding'])
            except TypeError:
                # happens only on Python 2.6
                unicode(bytes_to_check, encoding=detected_encoding['encoding'])  # noqa
            decodable_as_unicode = True
            logger.debug('success: decodable_as_unicode: '
                         '%(decodable_as_unicode)r', locals())
        except LookupError:
            logger.debug('failure: could not look up encoding %(encoding)s',
                         detected_encoding)
        except UnicodeDecodeError:
            # FIX: the original emitted this "failure" debug line twice in a
            # row, so it was also logged on the success path; the duplicate
            # outside the handler has been removed.
            logger.debug('failure: decodable_as_unicode: '
                         '%(decodable_as_unicode)r', locals())

    if is_likely_binary:
        if decodable_as_unicode:
            return False
        else:
            return True
    else:
        if decodable_as_unicode:
            return False
        else:
            if b'\x00' in bytes_to_check or b'\xff' in bytes_to_check:
                # Check for NULL bytes last
                logger.debug('has nulls:' + repr(b'\x00' in bytes_to_check))
                return True
        return False