From a8840c2fb60df70f75d6652e330855411b40403c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=A2irts=20Kokars?= <girts.kokars@gmail.com>
Date: Wed, 8 May 2019 15:17:01 +0300
Subject: [PATCH] override subtitle guess_encoding method to not include
 language-specific encodings and go straight to chardet as it seems to yield
 far better results

---
 libs/subliminal_patch/providers/nekur.py      | 43 ++++++++++++++++---
 libs/subliminal_patch/providers/subtitriid.py | 42 +++++++++++++++---
 2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/libs/subliminal_patch/providers/nekur.py b/libs/subliminal_patch/providers/nekur.py
index 859025865..f8950d1f2 100644
--- a/libs/subliminal_patch/providers/nekur.py
+++ b/libs/subliminal_patch/providers/nekur.py
@@ -8,8 +8,8 @@ from rarfile import RarFile, is_rarfile
 
 from guessit import guessit
 from requests import Session
-from bs4 import NavigableString
-from ftfy import fix_text
+import chardet
+from bs4 import NavigableString, UnicodeDammit
 from subzero.language import Language
 
 from subliminal_patch.providers import Provider
@@ -38,7 +38,6 @@ class NekurSubtitle(Subtitle):
         self.fps = fps
         self.notes = notes
         self.matches = None
-        # self.encoding = 'utf-16'
 
     @property
     def id(self):
@@ -66,6 +65,39 @@ class NekurSubtitle(Subtitle):
         self.matches = matches
         return matches
 
+    def guess_encoding(self):
+        """Guess the encoding of the subtitle content using chardet.
+
+        Overrides the default guess_encoding so that language-specific
+        encodings are not tried; chardet detection seems to yield better
+        results. Falls back to bs4's UnicodeDammit when chardet finds nothing.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raises ValueError: if neither chardet nor bs4 detects an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            encoding = UnicodeDammit(self.content).original_encoding
+            logger.info("bs4 detected encoding: %s", encoding)
+            if not encoding:
+                # interpolate the message; ValueError does not format its args
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % self)
+
+        self._guessed_encoding = encoding
+        return encoding
+
 
 class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
     """Nekur Provider."""
@@ -172,7 +204,4 @@ class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
 
                 raise ProviderError('Unidentified archive type')
 
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
-            # fix content encoding (utf-16 encoded by default)
-            fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
-            subtitle.content = fixed_subtitle_content
+            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
diff --git a/libs/subliminal_patch/providers/subtitriid.py b/libs/subliminal_patch/providers/subtitriid.py
index cec25d37d..d21ec7324 100644
--- a/libs/subliminal_patch/providers/subtitriid.py
+++ b/libs/subliminal_patch/providers/subtitriid.py
@@ -7,7 +7,8 @@ from zipfile import ZipFile, is_zipfile
 from rarfile import RarFile, is_rarfile
 
 from requests import Session
-from ftfy import fix_text
+import chardet
+from bs4 import UnicodeDammit
 from subzero.language import Language
 
 from subliminal_patch.providers import Provider
@@ -33,7 +34,6 @@ class SubtitriIdSubtitle(Subtitle):
         self.year = year
         self.imdb_id = imdb_id
         self.matches = None
-        # self.encoding = 'utf-16'
 
     @property
     def id(self):
@@ -55,6 +55,39 @@ class SubtitriIdSubtitle(Subtitle):
         self.matches = matches
         return matches
 
+    def guess_encoding(self):
+        """Guess the encoding of the subtitle content using chardet.
+
+        Overrides the default guess_encoding so that language-specific
+        encodings are not tried; chardet detection seems to yield better
+        results. Falls back to bs4's UnicodeDammit when chardet finds nothing.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raises ValueError: if neither chardet nor bs4 detects an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            encoding = UnicodeDammit(self.content).original_encoding
+            logger.info("bs4 detected encoding: %s", encoding)
+            if not encoding:
+                # interpolate the message; ValueError does not format its args
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % self)
+
+        self._guessed_encoding = encoding
+        return encoding
+
 
 class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
     """subtitri.id.lv Provider."""
@@ -155,7 +188,4 @@ class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
 
                 raise ProviderError('Unidentified archive type')
 
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
-            # fix content encoding (utf-16 encoded by default)
-            fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
-            subtitle.content = fixed_subtitle_content
+            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)