From a8840c2fb60df70f75d6652e330855411b40403c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=A2irts=20Kokars?=
Date: Wed, 8 May 2019 15:17:01 +0300
Subject: [PATCH] override subtitle guess_encoding method to not include
 language-specific encodings and go straight to chardet as it seems to yield
 far better results

---
 libs/subliminal_patch/providers/nekur.py      | 43 ++++++++++++++++---
 libs/subliminal_patch/providers/subtitriid.py | 42 +++++++++++++++---
 2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/libs/subliminal_patch/providers/nekur.py b/libs/subliminal_patch/providers/nekur.py
index 859025865..f8950d1f2 100644
--- a/libs/subliminal_patch/providers/nekur.py
+++ b/libs/subliminal_patch/providers/nekur.py
@@ -8,8 +8,8 @@ from rarfile import RarFile, is_rarfile
 
 from guessit import guessit
 from requests import Session
-from bs4 import NavigableString
-from ftfy import fix_text
+import chardet
+from bs4 import NavigableString, UnicodeDammit
 from subzero.language import Language
 
 from subliminal_patch.providers import Provider
@@ -38,7 +38,6 @@ class NekurSubtitle(Subtitle):
         self.fps = fps
         self.notes = notes
         self.matches = None
-        # self.encoding = 'utf-16'
 
     @property
     def id(self):
@@ -66,6 +65,39 @@ class NekurSubtitle(Subtitle):
         self.matches = matches
         return matches
 
+    def guess_encoding(self):
+        """Guess encoding using chardet, falling back to bs4.
+
+        Overrides the default subtitle guess_encoding method to skip guessing
+        language-specific encodings; chardet seems to yield better results.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raise ValueError: if neither chardet nor bs4 detects an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            encoding = UnicodeDammit(self.content).original_encoding
+            logger.info("bs4 detected encoding: %s", encoding)
+
+            if not encoding:
+                # exceptions do not interpolate extra args; format eagerly
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))
+
+        self._guessed_encoding = encoding
+        return encoding
+
 
 class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
     """Nekur Provider."""
@@ -172,7 +204,4 @@ class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
 
             raise ProviderError('Unidentified archive type')
 
-        subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
-        # fix content encoding (utf-16 encoded by default)
-        fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
-        subtitle.content = fixed_subtitle_content
+        subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
diff --git a/libs/subliminal_patch/providers/subtitriid.py b/libs/subliminal_patch/providers/subtitriid.py
index cec25d37d..d21ec7324 100644
--- a/libs/subliminal_patch/providers/subtitriid.py
+++ b/libs/subliminal_patch/providers/subtitriid.py
@@ -7,7 +7,8 @@ from zipfile import ZipFile, is_zipfile
 from rarfile import RarFile, is_rarfile
 
 from requests import Session
-from ftfy import fix_text
+import chardet
+from bs4 import UnicodeDammit
 from subzero.language import Language
 
 from subliminal_patch.providers import Provider
@@ -33,7 +34,6 @@ class SubtitriIdSubtitle(Subtitle):
         self.year = year
         self.imdb_id = imdb_id
         self.matches = None
-        # self.encoding = 'utf-16'
 
     @property
     def id(self):
@@ -55,6 +55,39 @@ class SubtitriIdSubtitle(Subtitle):
         self.matches = matches
         return matches
 
+    def guess_encoding(self):
+        """Guess encoding using chardet, falling back to bs4.
+
+        Overrides the default subtitle guess_encoding method to skip guessing
+        language-specific encodings; chardet seems to yield better results.
+
+        :return: the guessed encoding.
+        :rtype: str
+        :raise ValueError: if neither chardet nor bs4 detects an encoding.
+
+        """
+        if self._guessed_encoding:
+            return self._guessed_encoding
+
+        logger.info('Guessing encoding for language %s', self.language)
+
+        # guess/detect encoding using chardet
+        encoding = chardet.detect(self.content)['encoding']
+        logger.info('Chardet found encoding %s', encoding)
+
+        if not encoding:
+            # fallback on bs4
+            logger.info('Falling back to bs4 detection')
+            encoding = UnicodeDammit(self.content).original_encoding
+            logger.info("bs4 detected encoding: %s", encoding)
+
+            if not encoding:
+                # exceptions do not interpolate extra args; format eagerly
+                raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))
+
+        self._guessed_encoding = encoding
+        return encoding
+
 
 class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
     """subtitri.id.lv Provider."""
@@ -155,7 +188,4 @@ class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
 
             raise ProviderError('Unidentified archive type')
 
-        subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
-        # fix content encoding (utf-16 encoded by default)
-        fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
-        subtitle.content = fixed_subtitle_content
+        subtitle.content = self.get_subtitle_from_archive(subtitle, archive)