Replaced chardet with charamel to improve character encoding detection.

This commit is contained in:
Louis Vézina 2020-09-23 10:28:44 -04:00
parent 3726433fe4
commit efafe4a126
2 changed files with 20 additions and 19 deletions

View File

@ -4,7 +4,7 @@ import os
import re import re
import logging import logging
import chardet from charamel import Detector
from bs4 import UnicodeDammit from bs4 import UnicodeDammit
from config import settings from config import settings
@ -163,9 +163,10 @@ def force_unicode(s):
try: try:
s = s.decode("utf-8") s = s.decode("utf-8")
except UnicodeDecodeError: except UnicodeDecodeError:
t = chardet.detect(s) detector = Detector()
t = detector.detect(s)
try: try:
s = s.decode(t["encoding"]) s = s.decode(t)
except UnicodeDecodeError: except UnicodeDecodeError:
s = UnicodeDammit(s).unicode_markup s = UnicodeDammit(s).unicode_markup
return s return s

View File

@ -16,7 +16,7 @@ from helper import path_mappings, get_subtitle_destination_folder
from embedded_subs_reader import embedded_subs_reader from embedded_subs_reader import embedded_subs_reader
from event_handler import event_stream from event_handler import event_stream
import chardet from charamel import Detector
gc.enable() gc.enable()
@ -413,18 +413,16 @@ def guess_external_subtitles(dest_folder, subtitles):
text = f.read() text = f.read()
try: try:
guess = chardet.detect(text) text = text.decode('utf-8')
except UnicodeDecodeError:
detector = Detector()
guess = detector.detect(text)
logging.debug('BAZARR detected encoding %r', guess) logging.debug('BAZARR detected encoding %r', guess)
text = text.decode(guess["encoding"]) text = text.decode(guess)
detected_language = guess_language(text) detected_language = guess_language(text)
except (UnicodeDecodeError, TypeError):
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
subtitle_path)
except: except:
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' + pass
subtitle_path + ' You should try to delete this subtitles file manually and ask ' finally:
'Bazarr to download it again.')
else:
if detected_language: if detected_language:
logging.debug("BAZARR external subtitles detected and guessed this language: " + str( logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
detected_language)) detected_language))
@ -442,13 +440,15 @@ def guess_external_subtitles(dest_folder, subtitles):
text = f.read() text = f.read()
try: try:
guess = chardet.detect(text) text = text.decode('utf-8')
except UnicodeDecodeError:
detector = Detector()
guess = detector.detect(text)
logging.debug('BAZARR detected encoding %r', guess) logging.debug('BAZARR detected encoding %r', guess)
text = text.decode(guess["encoding"]) text = text.decode(guess)
except (UnicodeDecodeError, TypeError): except:
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " + pass
subtitle_path) finally:
else:
if bool(re.search(hi_regex, text)): if bool(re.search(hi_regex, text)):
subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True) subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)
return subtitles return subtitles