override subtitle guess_encoding method to not include language-specific encodings and go straight to chardet as it seems to yield far better results

2019-05-08 15:17:01 +03:00 · 2019-05-08 15:17:01 +03:00 · a8840c2fb6
parent 1f8469f83a
commit a8840c2fb6
2 changed files with 72 additions and 13 deletions
--- a/libs/subliminal_patch/providers/nekur.py
+++ b/libs/subliminal_patch/providers/nekur.py
@ -8,8 +8,8 @@ from rarfile import RarFile, is_rarfile
 from guessit import guessit
 from requests import Session
-from bs4 import NavigableString
+import chardet
-from ftfy import fix_text
+from bs4 import NavigableString, UnicodeDammit
 from subzero.language import Language
 from subliminal_patch.providers import Provider
@ -38,7 +38,6 @@ class NekurSubtitle(Subtitle):
        self.fps = fps
        self.notes = notes
        self.matches = None
        # self.encoding = 'utf-16'
    @property
    def id(self):
@ -66,6 +65,39 @@ class NekurSubtitle(Subtitle):
        self.matches = matches
        return matches
    def guess_encoding(self):
        """Guess the encoding of the subtitle content using chardet.

        Overrides the default subtitle ``guess_encoding`` method so that no
        language-specific encodings are guessed; chardet encoding detection
        seems to yield better results. When chardet reports no encoding,
        bs4's ``UnicodeDammit`` is tried as a fallback. A successful guess is
        cached in ``self._guessed_encoding`` and returned on later calls.

        :return: the guessed encoding.
        :rtype: str
        :raise ValueError: if neither chardet nor bs4 detects an encoding.
        """
        # reuse the result of an earlier successful detection
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            encoding = UnicodeDammit(self.content).original_encoding
            logger.info("bs4 detected encoding: %s", encoding)

        if not encoding:
            # interpolate the subtitle into the message; passing it as a second
            # exception argument would leave the '%s' placeholder unformatted
            raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

        self._guessed_encoding = encoding
        return encoding
 class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
    """Nekur Provider."""
@ -172,7 +204,4 @@ class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
                raise ProviderError('Unidentified archive type')
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
+            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
            # fix content encoding (utf-16 encoded by default)
            fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
            subtitle.content = fixed_subtitle_content
--- a/libs/subliminal_patch/providers/subtitriid.py
+++ b/libs/subliminal_patch/providers/subtitriid.py
@ -7,7 +7,8 @@ from zipfile import ZipFile, is_zipfile
 from rarfile import RarFile, is_rarfile
 from requests import Session
-from ftfy import fix_text
+import chardet
 from bs4 import UnicodeDammit
 from subzero.language import Language
 from subliminal_patch.providers import Provider
@ -33,7 +34,6 @@ class SubtitriIdSubtitle(Subtitle):
        self.year = year
        self.imdb_id = imdb_id
        self.matches = None
        # self.encoding = 'utf-16'
    @property
    def id(self):
@ -55,6 +55,39 @@ class SubtitriIdSubtitle(Subtitle):
        self.matches = matches
        return matches
    def guess_encoding(self):
        """Guess the encoding of the subtitle content using chardet.

        Overrides the default subtitle ``guess_encoding`` method so that no
        language-specific encodings are guessed; chardet encoding detection
        seems to yield better results. When chardet reports no encoding,
        bs4's ``UnicodeDammit`` is tried as a fallback. A successful guess is
        cached in ``self._guessed_encoding`` and returned on later calls.

        :return: the guessed encoding.
        :rtype: str
        :raise ValueError: if neither chardet nor bs4 detects an encoding.
        """
        # reuse the result of an earlier successful detection
        if self._guessed_encoding:
            return self._guessed_encoding

        logger.info('Guessing encoding for language %s', self.language)

        # guess/detect encoding using chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            encoding = UnicodeDammit(self.content).original_encoding
            logger.info("bs4 detected encoding: %s", encoding)

        if not encoding:
            # interpolate the subtitle into the message; passing it as a second
            # exception argument would leave the '%s' placeholder unformatted
            raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

        self._guessed_encoding = encoding
        return encoding
 class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
    """subtitri.id.lv Provider."""
@ -155,7 +188,4 @@ class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
                raise ProviderError('Unidentified archive type')
-            subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
+            subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
            # fix content encoding (utf-16 encoded by default)
            fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
            subtitle.content = fixed_subtitle_content