bazarr/libs/subliminal_patch/providers/zimuku.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import
import base64
import io
import logging
import os
import zipfile
import re
import copy
from PIL import Image

try:
    from urlparse import urljoin
except ImportError:
    from urllib.parse import urljoin

import rarfile
from babelfish import language_converters
from subzero.language import Language
from guessit import guessit
from requests import Session
from six import text_type
from random import randint

from python_anticaptcha import AnticaptchaClient, ImageToTextTask
from subliminal.providers import ParserBeautifulSoup
from subliminal_patch.providers import Provider
from subliminal.subtitle import (
    SUBTITLE_EXTENSIONS,
    fix_line_ending
)
from subliminal_patch.subtitle import (
    Subtitle,
    guess_matches
)
from .utils import FIRST_THOUSAND_OR_SO_USER_AGENTS as AGENT_LIST
from subliminal.video import Episode, Movie

logger = logging.getLogger(__name__)

language_converters.register('zimuku = subliminal_patch.converters.zimuku:zimukuConverter')

supported_languages = list(language_converters['zimuku'].to_zimuku.keys())


class ZimukuSubtitle(Subtitle):
    """Zimuku Subtitle."""

    provider_name = "zimuku"

    def __init__(self, language, page_link, version, session, year):
        super(ZimukuSubtitle, self).__init__(language, page_link=page_link)
        self.version = version
        self.release_info = version
        self.hearing_impaired = False
        self.encoding = "utf-8"
        self.session = session
        self.year = year

    @property
    def id(self):
        return self.page_link

    def get_matches(self, video):
        matches = set()

        if video.year == self.year:
            matches.add('year')

        # episode
        if isinstance(video, Episode):
            info = guessit(self.version, {"type": "episode"})
            # other properties
            matches |= guess_matches(video, info)

            # add year to matches if video doesn't have a year but series, season and episode are matched
            if not video.year and all(item in matches for item in ['series', 'season', 'episode']):
                matches |= {'year'}
        # movie
        elif isinstance(video, Movie):
            # other properties
            matches |= guess_matches(video, guessit(self.version, {"type": "movie"}))

        return matches


def string_to_hex(s):
    val = ""
    for i in s:
        val += hex(ord(i))[2:]
    return val


class ZimukuProvider(Provider):
    """Zimuku Provider."""

    languages = {Language(*l) for l in supported_languages}
    video_types = (Episode, Movie)
    logger.info(str(supported_languages))

    server_url = "https://so.zimuku.org"
    search_url = "/search?q={}"

    subtitle_class = ZimukuSubtitle

    def __init__(self):
        self.session = None

    verify_token = ""
    code = ""
    location_re = re.compile(
        r'self\.location = "(.*)" \+ stringToHex\(text\)')
    verification_image_re = re.compile(r'<img.*?src="data:image/bmp;base64,(.*?)".*?>')

    def yunsuo_bypass(self, url, *args, **kwargs):
        def parse_verification_image(image_content: str):

            def bmp_to_image(base64_str, img_type='png'):
                img_data = base64.b64decode(base64_str)
                img = Image.open(io.BytesIO(img_data))
                img = img.convert("RGB")
                img_fp = io.BytesIO()
                img.save(img_fp, img_type)
                img_fp.seek(0)
                return img_fp

            fp = bmp_to_image(image_content)
            task = ImageToTextTask(fp)
            client = AnticaptchaClient(os.environ.get('ANTICAPTCHA_ACCOUNT_KEY'))
            job = client.createTask(task)
            job.join()
            return job.get_captcha_text()

        i = -1
        while True:
            i += 1
            r = self.session.get(url, *args, **kwargs)
            if r.status_code == 404:
                # mock js script logic
                tr = self.location_re.findall(r.text)
                verification_image = self.verification_image_re.findall(r.text)
                self.code = parse_verification_image(verification_image[0])
                self.session.cookies.set("srcurl", string_to_hex(r.url))
                if tr:
                    verify_resp = self.session.get(
                        urljoin(self.server_url, tr[0] + string_to_hex(self.code)), allow_redirects=False)
                    if verify_resp.status_code == 302 \
                            and self.session.cookies.get("security_session_verify") is not None:
                        pass
                    continue
            if len(self.location_re.findall(r.text)) == 0:
                self.verify_token = string_to_hex(self.code)
                return r

    def initialize(self):
        self.session = Session()
        self.session.headers["User-Agent"] = AGENT_LIST[randint(0, len(AGENT_LIST) - 1)]

    def terminate(self):
        self.session.close()

    def _parse_episode_page(self, link, year):
        r = self.yunsuo_bypass(link)
        bs_obj = ParserBeautifulSoup(
            r.content.decode("utf-8", "ignore"), ["html.parser"]
        )
        subs_body = bs_obj.find("tbody")
        subs = []
        for sub in subs_body.find_all("tr"):
            a = sub.find("a")
            name = _extract_name(a.text)
            name = os.path.splitext(name)[
                0
            ]  # remove ext because it can be an archive type

            language = Language("eng")
            for img in sub.find("td", class_="tac lang").find_all("img"):
                if (
                        "china" in img.attrs["src"]
                        and "hongkong" in img.attrs["src"]
                ):
                    language = Language("zho").add(Language('zho', 'TW', None))
                    logger.debug("language:" + str(language))
                elif (
                        "china" in img.attrs["src"]
                        or "jollyroger" in img.attrs["src"]
                ):
                    language = Language("zho")
                elif "hongkong" in img.attrs["src"]:
                    language = Language('zho', 'TW', None)
                    break
            sub_page_link = urljoin(self.server_url, a.attrs["href"])
            backup_session = copy.deepcopy(self.session)
            backup_session.headers["Referer"] = link

            subs.append(
                self.subtitle_class(language, sub_page_link, name, backup_session, year)
            )

        return subs

    def query(self, keyword, season=None, episode=None, year=None):
        params = keyword
        if season:
            params += ".S{season:02d}".format(season=season)
        elif year:
            params += " {:4d}".format(year)

        logger.debug("Searching subtitles %r", params)
        subtitles = []
        search_link = urljoin(self.server_url, text_type(self.search_url).format(params))

        r = self.yunsuo_bypass(search_link, timeout=30)
        r.raise_for_status()

        if not r.content:
            logger.debug("No data returned from provider")
            return []

        html = r.content.decode("utf-8", "ignore")
        # parse window location
        pattern = r"url\s*=\s*'([^']*)'\s*\+\s*url"
        parts = re.findall(pattern, html)
        redirect_url = search_link
        while parts:
            parts.reverse()
            redirect_url = urljoin(self.server_url, "".join(parts))
            r = self.session.get(redirect_url, timeout=30)
            html = r.content.decode("utf-8", "ignore")
            parts = re.findall(pattern, html)
        logger.debug("search url located: " + redirect_url)

        soup = ParserBeautifulSoup(
            r.content.decode("utf-8", "ignore"), ["lxml", "html.parser"]
        )

        # non-shooter result page
        if soup.find("div", {"class": "item"}):
            logger.debug("enter a non-shooter page")
            for item in soup.find_all("div", {"class": "item"}):
                title_a = item.find("p", class_="tt clearfix").find("a")
                subs_year = year
                if season:
                    # episode year in zimuku is the season's year not show's year
                    actual_subs_year = re.findall(r"\d{4}", title_a.text) or None
                    if actual_subs_year:
                        subs_year = int(actual_subs_year[0]) - season + 1
                    title = title_a.text
                    season_cn1 = re.search("第(.*)季", title)
                    if not season_cn1:
                        season_cn1 = "一"
                    else:
                        season_cn1 = season_cn1.group(1).strip()
                    season_cn2 = num_to_cn(str(season))
                    if season_cn1 != season_cn2:
                        continue
                episode_link = urljoin(self.server_url, title_a.attrs["href"])
                new_subs = self._parse_episode_page(episode_link, subs_year)
                subtitles += new_subs

        # NOTE: shooter result pages are ignored due to the existence of zimuku provider

        return subtitles

    def list_subtitles(self, video, languages):
        if isinstance(video, Episode):
            titles = [video.series] + video.alternative_series
        elif isinstance(video, Movie):
            titles = [video.title] + video.alternative_titles
        else:
            titles = []

        subtitles = []
        # query for subtitles with the show_id
        for title in titles:
            if isinstance(video, Episode):
                subtitles += [
                    s
                    for s in self.query(
                        title,
                        season=video.season,
                        episode=video.episode,
                        year=video.year,
                    )
                    if s.language in languages
                ]
            elif isinstance(video, Movie):
                subtitles += [
                    s
                    for s in self.query(title, year=video.year)
                    if s.language in languages
                ]

        return subtitles

    def download_subtitle(self, subtitle):
        def _get_archive_download_link(yunsuopass, sub_page_link):
            res = yunsuopass(sub_page_link)
            bs_obj = ParserBeautifulSoup(
                res.content.decode("utf-8", "ignore"), ["html.parser"]
            )
            down_page_link = bs_obj.find("a", {"id": "down1"}).attrs["href"]
            down_page_link = urljoin(sub_page_link, down_page_link)
            res = yunsuopass(down_page_link)
            bs_obj = ParserBeautifulSoup(
                res.content.decode("utf-8", "ignore"), ["html.parser"]
            )
            return urljoin(sub_page_link, bs_obj.find("a", {"rel": "nofollow"}).attrs["href"])

        # download the subtitle
        logger.info("Downloading subtitle %r", subtitle)
        download_link = _get_archive_download_link(self.yunsuo_bypass, subtitle.page_link)
        r = self.yunsuo_bypass(download_link, headers={'Referer': subtitle.page_link}, timeout=30)
        r.raise_for_status()
        try:
            filename = r.headers["Content-Disposition"]
        except KeyError:
            logger.debug("Unable to parse subtitles filename. Dropping this subtitles.")
            return

        if not r.content:
            logger.debug("Unable to download subtitle. No data returned from provider")
            return

        archive_stream = io.BytesIO(r.content)
        archive = None
        if rarfile.is_rarfile(archive_stream):
            logger.debug("Identified rar archive")
            if ".rar" not in filename:
                logger.debug(
                    ".rar should be in the downloaded file name: {}".format(filename)
                )
                return
            archive = rarfile.RarFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        elif zipfile.is_zipfile(archive_stream):
            logger.debug("Identified zip archive")
            if ".zip" not in filename:
                logger.debug(
                    ".zip should be in the downloaded file name: {}".format(filename)
                )
                return
            archive = zipfile.ZipFile(archive_stream)
            subtitle_content = _get_subtitle_from_archive(archive)
        else:
            is_sub = ""
            for sub_ext in SUBTITLE_EXTENSIONS:
                if sub_ext in filename:
                    is_sub = sub_ext
                    break
            if not is_sub:
                logger.debug(
                    "unknown subtitle ext int downloaded file name: {}".format(filename)
                )
                return
            logger.debug("Identified {} file".format(is_sub))
            subtitle_content = r.content

        if subtitle_content:
            subtitle.content = fix_line_ending(subtitle_content)
        else:
            logger.debug("Could not extract subtitle from %r", archive)


def _get_subtitle_from_archive(archive):
    extract_subname, max_score = "", -1

    for subname in archive.namelist():
        # discard hidden files
        if os.path.split(subname)[-1].startswith("."):
            continue

        # discard non-subtitle files
        if not subname.lower().endswith(SUBTITLE_EXTENSIONS):
            continue

        # prefer ass/ssa/srt subtitles with double languages or simplified/traditional chinese
        score = ("ass" in subname or "ssa" in subname or "srt" in subname) * 1
        if "简体" in subname or "chs" in subname or ".gb." in subname:
            score += 2
        if "繁体" in subname or "cht" in subname or ".big5." in subname:
            score += 2
        if "chs.eng" in subname or "chs&eng" in subname or "cht.eng" in subname or "cht&eng" in subname:
            score += 2
        if "中英" in subname or "简英" in subname or "繁英" in subname or "双语" in subname or "简体&英文" in subname or "繁体&英文" in subname:
            score += 4
        logger.debug("subtitle {}, score: {}".format(subname, score))
        if score > max_score:
            max_score = score
            extract_subname = subname

    return archive.read(extract_subname) if max_score != -1 else None


def _extract_name(name):
    """ filter out Chinese characters from subtitle names """
    name, suffix = os.path.splitext(name)
    c_pattern = "[\u4e00-\u9fff]"
    e_pattern = "[a-zA-Z]"
    c_indices = [m.start(0) for m in re.finditer(c_pattern, name)]
    e_indices = [m.start(0) for m in re.finditer(e_pattern, name)]

    target, discard = e_indices, c_indices

    if len(target) == 0:
        return ""

    first_target, last_target = target[0], target[-1]
    first_discard = discard[0] if discard else -1
    last_discard = discard[-1] if discard else -1
    if last_discard < first_target:
        new_name = name[first_target:]
    elif last_target < first_discard:
        new_name = name[:first_discard]
    else:
        # try to find maximum continous part
        result, start, end = [0, 1], -1, 0
        while end < len(name):
            while end not in e_indices and end < len(name):
                end += 1
            if end == len(name):
                break
            start = end
            while end not in c_indices and end < len(name):
                end += 1
            if end - start > result[1] - result[0]:
                result = [start, end]
            start = end
            end += 1
        new_name = name[result[0]: result[1]]
    new_name = new_name.strip() + suffix
    return new_name


def num_to_cn(number):
    """ convert numbers(1-99) to Chinese """
    assert number.isdigit() and 1 <= int(number) <= 99

    trans_map = {n: c for n, c in zip("123456789", "一二三四五六七八九")}

    if len(number) == 1:
        return trans_map[number]
    else:
        part1 = "十" if number[0] == "1" else trans_map[number[0]] + "十"
        part2 = trans_map[number[1]] if number[1] != "0" else ""
        return part1 + part2