bazarr/libs/subliminal_patch/subtitle.py

# coding=utf-8


from __future__ import absolute_import
import logging
import traceback

import re
import types

import chardet
import pysrt
import pysubs2
from bs4 import UnicodeDammit
from pysubs2 import SSAStyle
from pysubs2.subrip import parse_tags, MAX_REPRESENTABLE_TIME
from pysubs2.time import ms_to_times
from subzero.modification import SubtitleModifications
from subzero.language import Language
from subliminal import Subtitle as Subtitle_
from subliminal.subtitle import Episode, Movie, sanitize_release_group, get_equivalent_release_groups
from subliminal_patch.utils import sanitize
from ftfy import fix_text
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from six import text_type

BOMS = (
    (BOM_UTF8, "UTF-8"),
    (BOM_UTF32_BE, "UTF-32-BE"),
    (BOM_UTF32_LE, "UTF-32-LE"),
    (BOM_UTF16_BE, "UTF-16-BE"),
    (BOM_UTF16_LE, "UTF-16-LE"),
)

logger = logging.getLogger(__name__)


ftfy_defaults = {
    "uncurl_quotes": False,
    "fix_character_width": False,
}


class Subtitle(Subtitle_):
    storage_path = None
    release_info = None
    matches = {}
    hash_verifiable = False
    hearing_impaired_verifiable = False
    mods = None
    plex_media_fps = None
    skip_wrong_fps = False
    wrong_fps = False
    wrong_series = False
    wrong_season_ep = False
    is_pack = False
    asked_for_release_group = None
    asked_for_episode = None
    uploader = None # string - uploader username

    pack_data = None
    _guessed_encoding = None
    _is_valid = False
    use_original_format = False
    format = "srt" # default format is srt

    def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None, mods=None, original_format=False):
        # set subtitle language to hi if it's hearing_impaired
        if hearing_impaired:
            language = Language.rebuild(language, hi=True)

        super(Subtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link,
                                       encoding=encoding)
        self.mods = mods
        self._is_valid = False
        self.use_original_format = original_format

    def __repr__(self):
        return '<%s %r [%s:%s]>' % (
            self.__class__.__name__, self.page_link, self.language, self._guessed_encoding)

    @property
    def text(self):
        """Content as string

        If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding`

        """
        if not self.content:
            return

        if not isinstance(self.content, text_type):
            return self.content.decode(self.get_encoding(), errors='replace')

        return self.content

    @property
    def numeric_id(self):
        raise NotImplemented

    def get_fps(self):
        """
        :return: frames per second or None if not supported
        :rtype: float
        """
        return None

    def make_picklable(self):
        """
        some subtitle instances might have unpicklable objects stored; clean them up here
        :return: self
        """
        return self

    def get_encoding(self):
        return self.guess_encoding()

    def set_encoding(self, encoding):
        ge = self.get_encoding()
        if encoding == ge:
            return

        unicontent = self.text
        logger.debug("Changing encoding: to %s, from %s", encoding, ge)
        self.content = unicontent.encode(encoding)
        self._guessed_encoding = encoding

    def normalize(self):
        """
        Set encoding to UTF-8 and normalize line endings
        :return:
        """
        self.set_encoding("utf-8")

        # normalize line endings
        self.content = self.content.replace(b"\r\n", b"\n").replace(b'\r', b'\n')

    def _check_bom(self, data):
        return [encoding for bom, encoding in BOMS if data.startswith(bom)]

    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet.

        :return: the guessed encoding.
        :rtype: str

        """
        if self._guessed_encoding:
            return self._guessed_encoding

        if self.encoding:
            # check provider encoding and use it only if it is valid
            try:
                self.content.decode(self.encoding)
                self._guessed_encoding = self.encoding
                return self._guessed_encoding
            except:
                # provider specified encoding is invalid, fallback to guessing
                pass

        logger.info('Guessing encoding for language %s', self.language)

        encodings = ['utf-8']

        # check UTF BOMs
        bom_encodings = self._check_bom(self.content)
        if bom_encodings:
            encodings = list(set(enc.lower() for enc in bom_encodings + encodings))

        # add language-specific encodings
        # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages

        if self.language.alpha3 == 'zho':
            encodings.extend(['cp936', 'gb2312', 'gbk', 'hz', 'iso2022_jp_2', 'cp950', 'big5hkscs', 'big5',
                              'gb18030', 'utf-16'])
        elif self.language.alpha3 == 'jpn':
            encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
                              'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ])
        elif self.language.alpha3 == 'tha':
            encodings.extend(['tis-620', 'cp874'])

        # arabian/farsi
        elif self.language.alpha3 in ('ara', 'fas', 'per'):
            encodings.extend(['windows-1256', 'utf-16', 'utf-16le', 'ascii', 'iso-8859-6'])
        elif self.language.alpha3 == 'heb':
            encodings.extend(['windows-1255', 'iso-8859-8'])
        elif self.language.alpha3 == 'tur':
            encodings.extend(['windows-1254', 'iso-8859-9', 'iso-8859-3'])

        # Greek
        elif self.language.alpha3 in ('grc', 'gre', 'ell'):
            encodings.extend(['windows-1253', 'cp1253', 'cp737', 'iso8859-7', 'cp875', 'cp869', 'iso2022_jp_2',
                              'mac_greek'])

        # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
        # Romanian and Albanian
        elif self.language.alpha3 in ('pol', 'cze', 'ces', 'slk', 'slo', 'slv', 'hun', 'bos', 'hbs', 'hrv', 'rsb',
                                      'ron', 'rum', 'sqi', 'alb'):

            encodings.extend(['windows-1250', 'iso-8859-2'])

            # Eastern European Group 1
            if self.language.alpha3 == "slv":
                encodings.append('iso-8859-4')

            # Albanian
            elif self.language.alpha3 in ("sqi", "alb"):
                encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-1', 'iso-8859-9'])

        # Bulgarian, Serbian and Macedonian, Ukranian and Russian
        elif self.language.alpha3 in ('bul', 'srp', 'mkd', 'mac', 'rus', 'ukr'):
            # Eastern European Group 2
            if self.language.alpha3 in ('bul', 'mkd', 'mac', 'rus', 'ukr'):
                encodings.extend(['windows-1251', 'iso-8859-5'])

            elif self.language.alpha3 == 'srp':
                if self.language.script == "Latn":
                    encodings.extend(['windows-1250', 'iso-8859-2'])
                elif self.language.script == "Cyrl":
                    encodings.extend(['windows-1251', 'iso-8859-5'])
                else:
                    encodings.extend(['windows-1250', 'windows-1251', 'iso-8859-2', 'iso-8859-5'])

        else:
            # Western European (windows-1252) / Northern European
            encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-9', 'iso-8859-4', 'iso-8859-1'])

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)

            except UnicodeDecodeError:
                pass
            else:
                logger.info('Guessed encoding %s', encoding)
                self._guessed_encoding = encoding
                return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            a = UnicodeDammit(self.content)

            logger.info("bs4 detected encoding: %s", a.original_encoding)

            if a.original_encoding:
                self._guessed_encoding = a.original_encoding
                return a.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s", self)

        self._guessed_encoding = encoding
        return encoding

    def is_valid(self):
        """Check if a :attr:`text` is a valid SubRip format. Note that orignal format will pypass the checking

        :return: whether or not the subtitle is valid.
        :rtype: bool

        """
        if self._is_valid:
            return True

        text = self.text
        if not text:
            return False

        # valid srt
        try:
            pysrt.from_string(text, error_handling=pysrt.ERROR_RAISE)
        except Exception:
            logger.error("PySRT-parsing failed, trying pysubs2")
        else:
            self._is_valid = True
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            try:
                # in case of microdvd, try parsing the fps from the subtitle
                subs = pysubs2.SSAFile.from_string(text)
                if subs.format == "microdvd":
                    logger.info("Got FPS from MicroDVD subtitle: %s", subs.fps)
                else:
                    logger.info("Got format: %s", subs.format)
                    if self.use_original_format:
                        self.format = subs.format
                        self._is_valid = True
                        logger.debug("Using original format")
                        return True

            except pysubs2.UnknownFPSError:
                # if parsing failed, use frame rate from provider
                sub_fps = self.get_fps()
                if not isinstance(sub_fps, float) or sub_fps < 10.0:
                    # or use our media file's fps as a fallback
                    sub_fps = self.plex_media_fps
                    logger.info("No FPS info in subtitle. Using our own media FPS for the MicroDVD subtitle: %s",
                                self.plex_media_fps)
                subs = pysubs2.SSAFile.from_string(text, fps=sub_fps)

            unicontent = self.pysubs2_to_unicode(subs)
            self.content = unicontent.encode(self.get_encoding())
        except:
            logger.exception("Couldn't convert subtitle %s to .srt format: %s", self, traceback.format_exc())
            return False

        self._is_valid = True
        return True

    @classmethod
    def pysubs2_to_unicode(cls, sub, format="srt"):
        """
        this is a modified version of pysubs2.SubripFormat.to_file with special handling for drawing tags in ASS
        :param sub:
        :param format:
        :return:
        """
        def ms_to_timestamp(ms, mssep=","):
            """Convert ms to 'HH:MM:SS,mmm'"""
            # XXX throw on overflow/underflow?
            if ms < 0: ms = 0
            if ms > MAX_REPRESENTABLE_TIME: ms = MAX_REPRESENTABLE_TIME
            h, m, s, ms = ms_to_times(ms)
            return "%02d:%02d:%02d%s%03d" % (h, m, s, mssep, ms)

        def prepare_text(text, style):
            body = []
            for fragment, sty in parse_tags(text, style, sub.styles):
                fragment = fragment.replace(r"\h", u" ")
                fragment = fragment.replace(r"\n", u"\n")
                fragment = fragment.replace(r"\N", u"\n")
                if sty.drawing:
                    raise pysubs2.ContentNotUsable

                if format == "srt":
                    if sty.italic:
                        fragment = u"<i>%s</i>" % fragment
                    if sty.underline:
                        fragment = u"<u>%s</u>" % fragment
                    if sty.strikeout:
                        fragment = u"<s>%s</s>" % fragment
                elif format == "vtt":
                    if sty.bold:
                        fragment = u"<b>%s</b>" % fragment
                    if sty.italic:
                        fragment = u"<i>%s</i>" % fragment
                    if sty.underline:
                        fragment = u"<u>%s</u>" % fragment

                body.append(fragment)

            return re.sub(u"\n+", u"\n", u"".join(body).strip())

        visible_lines = (line for line in sub if not line.is_comment)

        out = []
        mssep = ","

        if format == "vtt":
            out.append("WEBVTT\n\n")
            mssep = "."

        for i, line in enumerate(visible_lines, 1):
            start = ms_to_timestamp(line.start, mssep=mssep)
            end = ms_to_timestamp(line.end, mssep=mssep)
            try:
                text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
            except pysubs2.ContentNotUsable:
                continue

            out.append(u"%d\n" % i)
            out.append(u"%s --> %s\n" % (start, end))
            out.append(u"%s%s" % (text, "\n\n"))

        return u"".join(out)

    def get_modified_content(self, format="srt", debug=False):
        """
        :return: string
        """
        if not self.mods:
            return fix_text(self.content.decode(encoding=self.get_encoding()), **ftfy_defaults).encode(
                encoding=self.get_encoding())

        submods = SubtitleModifications(debug=debug)
        if submods.load(content=self.text, language=self.language):
            logger.info("Applying mods: %s", self.mods)
            submods.modify(*self.mods)
            self.mods = submods.mods_used

            content = fix_text(self.pysubs2_to_unicode(submods.f, format=format), **ftfy_defaults)\
                .encode(encoding=self.get_encoding())
            submods.f = None
            del submods
            return content
        return None


class ModifiedSubtitle(Subtitle):
    id = None


MERGED_FORMATS = {
    "TV": ("HDTV", "SDTV", "AHDTV", "Ultra HDTV"),
    "Air": ("SATRip", "DVB", "PPV", "Digital TV"),
    "Disk-HD": ("HD-DVD", "Blu-ray", "Ultra HD Blu-ray"),
    "Disk-SD": ("DVD", "VHS"),
    "Web": ("Web",),
}

MERGED_FORMATS_REV = dict((v.lower(), k.lower()) for k in MERGED_FORMATS for v in MERGED_FORMATS[k])

def _has_match(video, guess, key) -> bool:
    value = getattr(video, key)
    guess_value = guess.get(key)

    # To avoid extra debug calls
    if guess_value is None or value is None:
        return False

    if isinstance(guess_value, list):
        matched = any(value == item for item in guess_value)
    else:
        matched = value == guess_value

    logger.debug("%s matched? %s (%s -> %s)", key, matched, value, guess_value)

    return matched


def guess_matches(video, guess, partial=False):
    """Get matches between a `video` and a `guess`.

    If a guess is `partial`, the absence information won't be counted as a match.

    Patch: add multiple release group and formats handling

    :param video: the video.
    :type video: :class:`~subliminal.video.Video`
    :param guess: the guess.
    :type guess: dict
    :param bool partial: whether or not the guess is partial.
    :return: matches between the `video` and the `guess`.
    :rtype: set

    """
    matches = set()
    if isinstance(video, Episode):
        # series
        if video.series and 'title' in guess:
            titles = guess["title"]
            if not isinstance(titles, list):
                titles = [titles]

            for title in titles:
                if sanitize(title) in (sanitize(name) for name in [video.series] + video.alternative_series):
                    matches.add('series')

        # title
        if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title):
            matches.add('title')

        # season
        if video.season and 'season' in guess and guess['season'] == video.season:
            matches.add('season')

        # episode
        # Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
        # Most providers only support single-ep, so make sure it contains only 1 episode
        # In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
        if video.episode and 'episode' in guess:
            episode_guess = guess['episode']
            episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
            if episode == video.episode:
                matches.add('episode')

        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')

        # count "no year" as an information
        if not partial and video.original_series and 'year' not in guess:
            matches.add('year')

    elif isinstance(video, Movie):
        # year
        if video.year and 'year' in guess and guess['year'] == video.year:
            matches.add('year')
        # title
        if video.title and 'title' in guess and sanitize(guess['title']) in (
                    sanitize(name) for name in [video.title] + video.alternative_titles):
            matches.add('title')

    # release_group
    if 'release_group' in guess:
        release_groups = guess["release_group"]
        if not isinstance(release_groups, list):
            release_groups = [release_groups]

        if video.release_group:
            for release_group in release_groups:
                if (sanitize_release_group(release_group) in
                        get_equivalent_release_groups(sanitize_release_group(video.release_group))):
                    matches.add('release_group')
                    break
    # source
    if 'source' in guess:
        formats = guess["source"]
        if not isinstance(formats, list):
            formats = [formats]

        if video.source:
            video_format = video.source.lower()
            _video_gen_format = MERGED_FORMATS_REV.get(video_format)
            matched = False
            for frmt in formats:
                _guess_gen_frmt = MERGED_FORMATS_REV.get(frmt.lower())
                # We don't want to match a singleton
                if _guess_gen_frmt is None: # If the source is not in MERGED_FORMATS
                    _guess_gen_frmt = guess["source"]

                if _guess_gen_frmt == _video_gen_format:
                    matched = True
                    matches.add('source')
                    break

            logger.debug("Source match found? %s: %s -> %s", matched, video.source, formats)

        if "release_group" in matches and "source" not in matches:
            logger.info("Release group matched but source didn't. Removing release group match.")
            matches.remove("release_group")

    guess.update({"resolution": guess.get("screen_size")})

    # Solve match keys for potential lists
    for key in ("video_codec", "audio_codec", "edition", "streaming_service", "resolution"):
        if _has_match(video, guess, key):
            matches.add(key)

    # Add streaming service match for non-web sources
    if video.source and video.source != "Web":
        matches.add("streaming_service")

    # As edition tags are rare, add edition match if the video doesn't have an edition
    if not video.edition:
        matches.add("edition")

    return matches