Replaced chardet with charamel to improve character encoding detection.

2024-12-27 01:57:33 +00:00 · 2020-09-23 10:28:44 -04:00 · 2020-09-23 10:28:44 -04:00 · efafe4a126
commit efafe4a126
parent 3726433fe4
2 changed files with 20 additions and 19 deletions
--- a/bazarr/helper.py
+++ b/bazarr/helper.py
@@ -4,7 +4,7 @@ import os
 import re
 import logging
-import chardet
+from charamel import Detector
 from bs4 import UnicodeDammit
 from config import settings
@@ -163,9 +163,10 @@ def force_unicode(s):
        try:
            s = s.decode("utf-8")
        except UnicodeDecodeError:
-            t = chardet.detect(s)
+            detector = Detector()
            t = detector.detect(s)
            try:
-                s = s.decode(t["encoding"])
+                s = s.decode(t)
            except UnicodeDecodeError:
                s = UnicodeDammit(s).unicode_markup
    return s
--- a/bazarr/list_subtitles.py
+++ b/bazarr/list_subtitles.py
@@ -16,7 +16,7 @@ from helper import path_mappings, get_subtitle_destination_folder
 from embedded_subs_reader import embedded_subs_reader
 from event_handler import event_stream
-import chardet
+from charamel import Detector
 gc.enable()
@@ -413,18 +413,16 @@ def guess_external_subtitles(dest_folder, subtitles):
                    text = f.read()
                try:
-                    guess = chardet.detect(text)
+                    text = text.decode('utf-8')
                except UnicodeDecodeError:
                    detector = Detector()
                    guess = detector.detect(text)
                    logging.debug('BAZARR detected encoding %r', guess)
-                    text = text.decode(guess["encoding"])
+                    text = text.decode(guess)
                    detected_language = guess_language(text)
                except (UnicodeDecodeError, TypeError):
                    logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
                                      subtitle_path)
                except:
-                    logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
+                    pass
-                                      subtitle_path + ' You should try to delete this subtitles file manually and ask '
+                finally:
                                                      'Bazarr to download it again.')
                else:
                    if detected_language:
                        logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
                            detected_language))
@@ -442,13 +440,15 @@ def guess_external_subtitles(dest_folder, subtitles):
                text = f.read()
            try:
-                guess = chardet.detect(text)
+                text = text.decode('utf-8')
            except UnicodeDecodeError:
                detector = Detector()
                guess = detector.detect(text)
                logging.debug('BAZARR detected encoding %r', guess)
-                text = text.decode(guess["encoding"])
+                text = text.decode(guess)
-            except (UnicodeDecodeError, TypeError):
+            except:
-                logging.exception("BAZARR subtitles file doesn't seems to be text based. Skipping this file: " +
+                pass
-                                  subtitle_path)
+            finally:
            else:
                if bool(re.search(hi_regex, text)):
                    subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)
    return subtitles