mirror of
https://github.com/morpheus65535/bazarr
synced 2024-12-27 01:57:33 +00:00
Replaced chardet with charamel to improve character encoding detection.
This commit is contained in:
parent
3726433fe4
commit
efafe4a126
2 changed files with 20 additions and 19 deletions
|
@@ -4,7 +4,7 @@ import os
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
import chardet
|
from charamel import Detector
|
||||||
from bs4 import UnicodeDammit
|
from bs4 import UnicodeDammit
|
||||||
|
|
||||||
from config import settings
|
from config import settings
|
||||||
|
@@ -163,9 +163,10 @@ def force_unicode(s):
|
||||||
try:
|
try:
|
||||||
s = s.decode("utf-8")
|
s = s.decode("utf-8")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
t = chardet.detect(s)
|
detector = Detector()
|
||||||
|
t = detector.detect(s)
|
||||||
try:
|
try:
|
||||||
s = s.decode(t["encoding"])
|
s = s.decode(t)
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
s = UnicodeDammit(s).unicode_markup
|
s = UnicodeDammit(s).unicode_markup
|
||||||
return s
|
return s
|
||||||
|
|
|
@@ -16,7 +16,7 @@ from helper import path_mappings, get_subtitle_destination_folder
|
||||||
|
|
||||||
from embedded_subs_reader import embedded_subs_reader
|
from embedded_subs_reader import embedded_subs_reader
|
||||||
from event_handler import event_stream
|
from event_handler import event_stream
|
||||||
import chardet
|
from charamel import Detector
|
||||||
|
|
||||||
gc.enable()
|
gc.enable()
|
||||||
|
|
||||||
|
@@ -413,18 +413,16 @@ def guess_external_subtitles(dest_folder, subtitles):
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
guess = chardet.detect(text)
|
text = text.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
detector = Detector()
|
||||||
|
guess = detector.detect(text)
|
||||||
logging.debug('BAZARR detected encoding %r', guess)
|
logging.debug('BAZARR detected encoding %r', guess)
|
||||||
text = text.decode(guess["encoding"])
|
text = text.decode(guess)
|
||||||
detected_language = guess_language(text)
|
detected_language = guess_language(text)
|
||||||
except (UnicodeDecodeError, TypeError):
|
|
||||||
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
|
|
||||||
subtitle_path)
|
|
||||||
except:
|
except:
|
||||||
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
|
pass
|
||||||
subtitle_path + ' You should try to delete this subtitles file manually and ask '
|
finally:
|
||||||
'Bazarr to download it again.')
|
|
||||||
else:
|
|
||||||
if detected_language:
|
if detected_language:
|
||||||
logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
|
logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
|
||||||
detected_language))
|
detected_language))
|
||||||
|
@@ -442,13 +440,15 @@ def guess_external_subtitles(dest_folder, subtitles):
|
||||||
text = f.read()
|
text = f.read()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
guess = chardet.detect(text)
|
text = text.decode('utf-8')
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
detector = Detector()
|
||||||
|
guess = detector.detect(text)
|
||||||
logging.debug('BAZARR detected encoding %r', guess)
|
logging.debug('BAZARR detected encoding %r', guess)
|
||||||
text = text.decode(guess["encoding"])
|
text = text.decode(guess)
|
||||||
except (UnicodeDecodeError, TypeError):
|
except:
|
||||||
logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
|
pass
|
||||||
subtitle_path)
|
finally:
|
||||||
else:
|
|
||||||
if bool(re.search(hi_regex, text)):
|
if bool(re.search(hi_regex, text)):
|
||||||
subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)
|
subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
Loading…
Reference in a new issue