mirror of
https://github.com/morpheus65535/bazarr
synced 2025-01-31 03:12:12 +00:00
Convert subtitle text to unicode before sending to guess_language
- Remove is_binary, as it tests only the first 1024 bytes and the encoding-detection confidence for the file must be above 0.9 for the file to be recognized as text. The new implementation assumes that if the file is binary, then character-encoding detection will report a confidence lower than 80% or text.decode() will raise an exception. - Skip detection of subtitle files larger than 5M
This commit is contained in:
parent
1a44dbc31a
commit
4056796eb1
1 changed files with 20 additions and 17 deletions
|
@ -13,9 +13,7 @@ import operator
|
||||||
from subliminal import core
|
from subliminal import core
|
||||||
from subliminal_patch import search_external_subtitles
|
from subliminal_patch import search_external_subtitles
|
||||||
from subzero.language import Language
|
from subzero.language import Language
|
||||||
from bs4 import UnicodeDammit
|
|
||||||
import six
|
import six
|
||||||
from binaryornot.check import is_binary
|
|
||||||
|
|
||||||
from get_args import args
|
from get_args import args
|
||||||
from database import database
|
from database import database
|
||||||
|
@ -27,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
|
||||||
from queueconfig import notifications
|
from queueconfig import notifications
|
||||||
from embedded_subs_reader import embedded_subs_reader
|
from embedded_subs_reader import embedded_subs_reader
|
||||||
import six
|
import six
|
||||||
|
import chardet
|
||||||
|
|
||||||
gc.enable()
|
gc.enable()
|
||||||
|
|
||||||
|
@ -367,25 +366,29 @@ def guess_external_subtitles(dest_folder, subtitles):
|
||||||
subtitle_path = os.path.join(dest_folder, subtitle)
|
subtitle_path = os.path.join(dest_folder, subtitle)
|
||||||
if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
|
if os.path.exists(subtitle_path) and os.path.splitext(subtitle_path)[1] in core.SUBTITLE_EXTENSIONS:
|
||||||
logging.debug("BAZARR falling back to file content analysis to detect language.")
|
logging.debug("BAZARR falling back to file content analysis to detect language.")
|
||||||
if is_binary(subtitle_path):
|
|
||||||
logging.debug("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
|
|
||||||
subtitle_path)
|
|
||||||
continue
|
|
||||||
detected_language = None
|
detected_language = None
|
||||||
|
|
||||||
if six.PY3:
|
# to improve performance, skip detection of files larger than 5M
|
||||||
with open(subtitle_path, 'r', errors='ignore') as f:
|
if os.path.getsize(subtitle_path) > 5*1024*1024:
|
||||||
text = f.read()
|
logging.debug("BAZARR subtitles file is too large to be text based. Skipping this file: " +
|
||||||
else:
|
subtitle_path)
|
||||||
with open(subtitle_path, 'r') as f:
|
continue
|
||||||
|
|
||||||
|
with open(subtitle_path, 'rb') as f:
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
encoding = UnicodeDammit(text)
|
# to improve performance, use only the first 8K to detect encoding
|
||||||
if six.PY2:
|
if len(text) > 8192: guess = chardet.detect(text[:8192])
|
||||||
text = text.decode(encoding.original_encoding)
|
else: guess = chardet.detect(text)
|
||||||
|
if guess["confidence"] < 0.8:
|
||||||
|
raise UnicodeError
|
||||||
|
text = text.decode(guess["encoding"])
|
||||||
detected_language = guess_language(text)
|
detected_language = guess_language(text)
|
||||||
except Exception as e:
|
except UnicodeError:
|
||||||
|
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
|
||||||
|
subtitle_path)
|
||||||
|
except:
|
||||||
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
|
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
|
||||||
subtitle_path + ' You should try to delete this subtitles file manually and ask '
|
subtitle_path + ' You should try to delete this subtitles file manually and ask '
|
||||||
'Bazarr to download it again.')
|
'Bazarr to download it again.')
|
||||||
|
|
Loading…
Reference in a new issue