Merge pull request #890 from josdion/development

improve character encoding detection
This commit is contained in:
morpheus65535 2020-03-26 15:04:58 -04:00 committed by GitHub
commit 42160dc0e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 5 additions and 3 deletions

View File

@@ -379,10 +379,12 @@ def guess_external_subtitles(dest_folder, subtitles):
 try:
     # to improve performance, use only the first 32K to detect encoding
-    if len(text) > 32768: guess = chardet.detect(text[:32768])
-    else: guess = chardet.detect(text)
-    if guess["confidence"] < 0.8:
+    guess = chardet.detect(text[:32768])
+    logging.debug('BAZARR detected encoding %r', guess)
+    if guess["confidence"] < 0.6:
         raise UnicodeError
+    if guess["confidence"] < 0.8 or guess["encoding"] == "ascii":
+        guess["encoding"] = "utf-8"
     text = text.decode(guess["encoding"])
     detected_language = guess_language(text)
 except UnicodeError: