mirror of https://github.com/morpheus65535/bazarr
Override the subtitle guess_encoding method so that it skips language-specific encoding guesses and goes straight to chardet, which seems to yield far better results
This commit is contained in:
parent
1f8469f83a
commit
a8840c2fb6
|
@ -8,8 +8,8 @@ from rarfile import RarFile, is_rarfile
|
||||||
|
|
||||||
from guessit import guessit
|
from guessit import guessit
|
||||||
from requests import Session
|
from requests import Session
|
||||||
from bs4 import NavigableString
|
import chardet
|
||||||
from ftfy import fix_text
|
from bs4 import NavigableString, UnicodeDammit
|
||||||
from subzero.language import Language
|
from subzero.language import Language
|
||||||
|
|
||||||
from subliminal_patch.providers import Provider
|
from subliminal_patch.providers import Provider
|
||||||
|
@ -38,7 +38,6 @@ class NekurSubtitle(Subtitle):
|
||||||
self.fps = fps
|
self.fps = fps
|
||||||
self.notes = notes
|
self.notes = notes
|
||||||
self.matches = None
|
self.matches = None
|
||||||
# self.encoding = 'utf-16'
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def id(self):
|
def id(self):
|
||||||
|
@ -66,6 +65,39 @@ class NekurSubtitle(Subtitle):
|
||||||
self.matches = matches
|
self.matches = matches
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
def guess_encoding(self):
    """Guess the encoding of the subtitle content using chardet.

    Overrides the default subtitle ``guess_encoding`` method so that no
    language-specific encodings are tried; chardet detection seems to yield
    better results. When chardet reports no encoding, bs4's
    ``UnicodeDammit`` is used as a fallback. A successful guess is cached in
    ``self._guessed_encoding`` and returned directly on later calls.

    :return: the guessed encoding.
    :rtype: str
    :raises ValueError: if neither chardet nor bs4 detects an encoding.

    """
    # reuse a previous successful guess
    if self._guessed_encoding:
        return self._guessed_encoding

    logger.info('Guessing encoding for language %s', self.language)

    # guess/detect encoding using chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)

    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        dammit = UnicodeDammit(self.content)
        logger.info("bs4 detected encoding: %s", dammit.original_encoding)

        encoding = dammit.original_encoding
        if not encoding:
            # Interpolate the message here: exceptions do not apply
            # logging-style lazy %-formatting to extra arguments.
            raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

    self._guessed_encoding = encoding
    return encoding
|
||||||
|
|
||||||
|
|
||||||
class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
|
class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
|
||||||
"""Nekur Provider."""
|
"""Nekur Provider."""
|
||||||
|
@ -172,7 +204,4 @@ class NekurProvider(Provider, ProviderSubtitleArchiveMixin):
|
||||||
|
|
||||||
raise ProviderError('Unidentified archive type')
|
raise ProviderError('Unidentified archive type')
|
||||||
|
|
||||||
subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
|
subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
|
||||||
# fix content encoding (utf-16 encoded by default)
|
|
||||||
fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
|
|
||||||
subtitle.content = fixed_subtitle_content
|
|
||||||
|
|
|
@ -7,7 +7,8 @@ from zipfile import ZipFile, is_zipfile
|
||||||
from rarfile import RarFile, is_rarfile
|
from rarfile import RarFile, is_rarfile
|
||||||
|
|
||||||
from requests import Session
|
from requests import Session
|
||||||
from ftfy import fix_text
|
import chardet
|
||||||
|
from bs4 import UnicodeDammit
|
||||||
from subzero.language import Language
|
from subzero.language import Language
|
||||||
|
|
||||||
from subliminal_patch.providers import Provider
|
from subliminal_patch.providers import Provider
|
||||||
|
@ -33,7 +34,6 @@ class SubtitriIdSubtitle(Subtitle):
|
||||||
self.year = year
|
self.year = year
|
||||||
self.imdb_id = imdb_id
|
self.imdb_id = imdb_id
|
||||||
self.matches = None
|
self.matches = None
|
||||||
# self.encoding = 'utf-16'
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def id(self):
|
def id(self):
|
||||||
|
@ -55,6 +55,39 @@ class SubtitriIdSubtitle(Subtitle):
|
||||||
self.matches = matches
|
self.matches = matches
|
||||||
return matches
|
return matches
|
||||||
|
|
||||||
|
def guess_encoding(self):
    """Guess the encoding of the subtitle content using chardet.

    Overrides the default subtitle ``guess_encoding`` method so that no
    language-specific encodings are tried; chardet detection seems to yield
    better results. When chardet reports no encoding, bs4's
    ``UnicodeDammit`` is used as a fallback. A successful guess is cached in
    ``self._guessed_encoding`` and returned directly on later calls.

    :return: the guessed encoding.
    :rtype: str
    :raises ValueError: if neither chardet nor bs4 detects an encoding.

    """
    # reuse a previous successful guess
    if self._guessed_encoding:
        return self._guessed_encoding

    logger.info('Guessing encoding for language %s', self.language)

    # guess/detect encoding using chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)

    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        dammit = UnicodeDammit(self.content)
        logger.info("bs4 detected encoding: %s", dammit.original_encoding)

        encoding = dammit.original_encoding
        if not encoding:
            # Interpolate the message here: exceptions do not apply
            # logging-style lazy %-formatting to extra arguments.
            raise ValueError(u"Couldn't guess the proper encoding for %s" % (self,))

    self._guessed_encoding = encoding
    return encoding
|
||||||
|
|
||||||
|
|
||||||
class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
|
class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
|
||||||
"""subtitri.id.lv Provider."""
|
"""subtitri.id.lv Provider."""
|
||||||
|
@ -155,7 +188,4 @@ class SubtitriIdProvider(Provider, ProviderSubtitleArchiveMixin):
|
||||||
|
|
||||||
raise ProviderError('Unidentified archive type')
|
raise ProviderError('Unidentified archive type')
|
||||||
|
|
||||||
subtitle_content = self.get_subtitle_from_archive(subtitle, archive)
|
subtitle.content = self.get_subtitle_from_archive(subtitle, archive)
|
||||||
# fix content encoding (utf-16 encoded by default)
|
|
||||||
fixed_subtitle_content = fix_text(subtitle_content.decode('utf-16'), {'uncurl_quotes': False, 'fix_character_width': False}).encode(encoding='utf-8')
|
|
||||||
subtitle.content = fixed_subtitle_content
|
|
||||||
|
|
Loading…
Reference in New Issue