# mylar/lib/comictaggerlib/issueidentifier.py


"""A class to automatically identify a comic archive"""
# Copyright 2012-2014 Anthony Beville
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import StringIO
#import math
#import urllib2
#import urllib
try:
    from PIL import Image
    from PIL import WebPImagePlugin
    pil_available = True
except ImportError:
    pil_available = False
from genericmetadata import GenericMetadata
from comicvinetalker import ComicVineTalker, ComicVineTalkerException
from imagehasher import ImageHasher
from imagefetcher import ImageFetcher, ImageFetcherException
from issuestring import IssueString
import utils
#from settings import ComicTaggerSettings
#from comicvinecacher import ComicVineCacher
class IssueIdentifierNetworkError(Exception):
    pass


class IssueIdentifierCancelled(Exception):
    pass


class IssueIdentifier:
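    """Attempt to identify a comic archive by matching its cover against Comic Vine.

    Minimal usage sketch (assumes the caller supplies a ComicArchive and a
    settings object exposing id_length_delta_thresh and id_publisher_blacklist;
    `show_cover` is a hypothetical callback, not part of this module):

        ii = IssueIdentifier(comic_archive, settings)
        ii.setCoverURLCallback(show_cover)  # optional: preview fetched covers
        matches = ii.search()
        if ii.search_result == IssueIdentifier.ResultOneGoodMatch:
            best = matches[0]
    """
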
    ResultNoMatches = 0
    ResultFoundMatchButBadCoverScore = 1
    ResultFoundMatchButNotFirstPage = 2
    ResultMultipleMatchesWithBadImageScores = 3
    ResultOneGoodMatch = 4
    ResultMultipleGoodMatches = 5
    def __init__(self, comic_archive, settings):
        self.comic_archive = comic_archive
        self.image_hasher = 1

        self.onlyUseAdditionalMetaData = False

        # a decent hamming score, good enough to call it a match
        self.min_score_thresh = 16

        # for alternate covers, be more stringent, since we're a bit more
        # scattershot in comparisons
        self.min_alternate_score_thresh = 12

        # the min distance a hamming score must be to separate itself from
        # closest neighbor
        self.min_score_distance = 4

        # a very strong hamming score, almost certainly the same image
        self.strong_score_thresh = 8
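        # (The perceptual hashes compared here are nominally 64-bit, so these
        # hamming distances run from 0 for identical covers to 64 for
        # completely different ones -- an assumption based on ImageHasher's
        # default 8x8 hash size.)
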
        # used to eliminate series names that are too long based on our search
        # string
        self.length_delta_thresh = settings.id_length_delta_thresh

        # used to eliminate unlikely publishers
        self.publisher_blacklist = [
            s.strip().lower() for s in settings.id_publisher_blacklist.split(',')]

        self.additional_metadata = GenericMetadata()
        self.output_function = IssueIdentifier.defaultWriteOutput
        self.callback = None
        self.coverUrlCallback = None
        self.search_result = self.ResultNoMatches
        self.cover_page_index = 0
        self.cancel = False
        self.waitAndRetryOnRateLimit = False
    def setScoreMinThreshold(self, thresh):
        self.min_score_thresh = thresh

    def setScoreMinDistance(self, distance):
        self.min_score_distance = distance

    def setAdditionalMetadata(self, md):
        self.additional_metadata = md

    def setNameLengthDeltaThreshold(self, delta):
        self.length_delta_thresh = delta

    def setPublisherBlackList(self, blacklist):
        self.publisher_blacklist = blacklist

    def setHasherAlgorithm(self, algo):
        self.image_hasher = algo

    def setOutputFunction(self, func):
        self.output_function = func

    def calculateHash(self, image_data):
        if self.image_hasher == '3':
            return ImageHasher(data=image_data).dct_average_hash()
        elif self.image_hasher == '2':
            return ImageHasher(data=image_data).average_hash2()
        else:
            return ImageHasher(data=image_data).average_hash()
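
    # Example (for illustration only): two cover hashes differing in 5 bits
    # have a hamming distance of 5, which is below strong_score_thresh (8),
    # so getIssueCoverMatchScore() treats that as an almost-certain match and
    # stops comparing further covers.
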
    def getAspectRatio(self, image_data):
        try:
            im = Image.open(StringIO.StringIO(image_data))
            w, h = im.size
            return float(h) / float(w)
        except:
            return 1.5
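
    # For a two-page spread (wider than it is tall) the front cover is assumed
    # to be the right-hand half, so cropCover() keeps the (w/2, 0, w, h)
    # region and returns it re-encoded as PNG data for hashing.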
    def cropCover(self, image_data):
        im = Image.open(StringIO.StringIO(image_data))
        w, h = im.size
        try:
            cropped_im = im.crop((int(w / 2), 0, w, h))
        except Exception as e:
            sys.exc_clear()
            print "cropCover() error:", e
            return None

        output = StringIO.StringIO()
        cropped_im.save(output, format="PNG")
        cropped_image_data = output.getvalue()
        output.close()
        return cropped_image_data

    def setProgressCallback(self, cb_func):
        self.callback = cb_func

    def setCoverURLCallback(self, cb_func):
        self.coverUrlCallback = cb_func
    def getSearchKeys(self):
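        """Collect the search keys from the available metadata sources.

        Returns a dict with 'series', 'issue_number', 'month', 'year' and
        'issue_count' entries; any of them may be None if no source
        (additional metadata, embedded tags, or the filename) supplied it.
        """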
        ca = self.comic_archive
        search_keys = dict()
        search_keys['series'] = None
        search_keys['issue_number'] = None
        search_keys['month'] = None
        search_keys['year'] = None
        search_keys['issue_count'] = None

        if ca is None:
            return

        if self.onlyUseAdditionalMetaData:
            search_keys['series'] = self.additional_metadata.series
            search_keys['issue_number'] = self.additional_metadata.issue
            search_keys['year'] = self.additional_metadata.year
            search_keys['month'] = self.additional_metadata.month
            search_keys['issue_count'] = self.additional_metadata.issueCount
            return search_keys

        # see if the archive has any useful meta data for searching with
        if ca.hasCIX():
            internal_metadata = ca.readCIX()
        elif ca.hasCBI():
            internal_metadata = ca.readCBI()
        else:
            internal_metadata = ca.readCBI()

        # try to get some metadata from filename
        md_from_filename = ca.metadataFromFilename()
        # preference order:
        # 1. Additional metadata
        # 2. Internal metadata
        # 3. Filename metadata
        if self.additional_metadata.series is not None:
            search_keys['series'] = self.additional_metadata.series
        elif internal_metadata.series is not None:
            search_keys['series'] = internal_metadata.series
        else:
            search_keys['series'] = md_from_filename.series

        if self.additional_metadata.issue is not None:
            search_keys['issue_number'] = self.additional_metadata.issue
        elif internal_metadata.issue is not None:
            search_keys['issue_number'] = internal_metadata.issue
        else:
            search_keys['issue_number'] = md_from_filename.issue

        if self.additional_metadata.year is not None:
            search_keys['year'] = self.additional_metadata.year
        elif internal_metadata.year is not None:
            search_keys['year'] = internal_metadata.year
        else:
            search_keys['year'] = md_from_filename.year

        if self.additional_metadata.month is not None:
            search_keys['month'] = self.additional_metadata.month
        elif internal_metadata.month is not None:
            search_keys['month'] = internal_metadata.month
        else:
            search_keys['month'] = md_from_filename.month

        if self.additional_metadata.issueCount is not None:
            search_keys['issue_count'] = self.additional_metadata.issueCount
        elif internal_metadata.issueCount is not None:
            search_keys['issue_count'] = internal_metadata.issueCount
        else:
            search_keys['issue_count'] = md_from_filename.issueCount

        return search_keys
    @staticmethod
    def defaultWriteOutput(text):
        sys.stdout.write(text)
        sys.stdout.flush()

    def log_msg(self, msg, newline=True):
        self.output_function(msg)
        if newline:
            self.output_function("\n")

    def getIssueCoverMatchScore(
            self,
            comicVine,
            issue_id,
            primary_img_url,
            primary_thumb_url,
            page_url,
            localCoverHashList,
            useRemoteAlternates=False,
            useLog=True):
        # localCoverHashList is a list of pre-calculated cover hashes.
        # useRemoteAlternates - indicates to use alternate covers from CV
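        # Returns the best-scoring entry: a dict with 'score' (the lowest
        # hamming distance found), plus the matching remote cover's 'url'
        # and 'hash'.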
        try:
            url_image_data = ImageFetcher().fetch(
                primary_thumb_url, blocking=True)
        except ImageFetcherException:
            self.log_msg(
                "Network issue while fetching cover image from Comic Vine. Aborting...")
            raise IssueIdentifierNetworkError

        if self.cancel:
            raise IssueIdentifierCancelled

        # alert the GUI, if needed
        if self.coverUrlCallback is not None:
            self.coverUrlCallback(url_image_data)

        remote_cover_list = []
        item = dict()
        item['url'] = primary_img_url
        item['hash'] = self.calculateHash(url_image_data)
        remote_cover_list.append(item)

        if self.cancel:
            raise IssueIdentifierCancelled

        if useRemoteAlternates:
            alt_img_url_list = comicVine.fetchAlternateCoverURLs(
                issue_id, page_url)
            for alt_url in alt_img_url_list:
                try:
                    alt_url_image_data = ImageFetcher().fetch(
                        alt_url, blocking=True)
                except ImageFetcherException:
                    self.log_msg(
                        "Network issue while fetching alt. cover image from Comic Vine. Aborting...")
                    raise IssueIdentifierNetworkError

                if self.cancel:
                    raise IssueIdentifierCancelled

                # alert the GUI, if needed
                if self.coverUrlCallback is not None:
                    self.coverUrlCallback(alt_url_image_data)

                item = dict()
                item['url'] = alt_url
                item['hash'] = self.calculateHash(alt_url_image_data)
                remote_cover_list.append(item)

                if self.cancel:
                    raise IssueIdentifierCancelled

        if useLog and useRemoteAlternates:
            self.log_msg(
                "[{0} alt. covers]".format(len(remote_cover_list) - 1), False)
        if useLog:
            self.log_msg("[ ", False)

        score_list = []
        done = False
        for local_cover_hash in localCoverHashList:
            for remote_cover_item in remote_cover_list:
                score = ImageHasher.hamming_distance(
                    local_cover_hash, remote_cover_item['hash'])
                score_item = dict()
                score_item['score'] = score
                score_item['url'] = remote_cover_item['url']
                score_item['hash'] = remote_cover_item['hash']
                score_list.append(score_item)
                if useLog:
                    self.log_msg("{0}".format(score), False)

                if score <= self.strong_score_thresh:
                    # such a good score, we can quit now, since for sure we
                    # have a winner
                    done = True
                    break
            if done:
                break

        if useLog:
            self.log_msg(" ]", False)

        best_score_item = min(score_list, key=lambda x: x['score'])
        return best_score_item
    # def validate(self, issue_id):
    #     # create hash list
    #     score = self.getIssueMatchScore(issue_id, hash_list, useRemoteAlternates = True)
    #     if score < 20:
    #         return True
    #     else:
    #         return False
    def search(self):
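        """Search Comic Vine for this archive and score candidate issues.

        First-round matching hashes the archive's cover against each
        candidate issue's primary cover image; if every score is weak, a
        second round also compares a few extra archive pages and Comic
        Vine's alternate covers.  Populates self.match_list and
        self.search_result and returns the (possibly empty) match list.
        """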
        ca = self.comic_archive
        self.match_list = []
        self.cancel = False
        self.search_result = self.ResultNoMatches

        if not pil_available:
            self.log_msg(
                "Python Imaging Library (PIL) is not available and is needed for issue identification.")
            return self.match_list
        if not ca.seemsToBeAComicArchive():
            self.log_msg(
                "Sorry, but " + str(ca.path) + " is not a comic archive!")
            return self.match_list
        cover_image_data = ca.getPage(self.cover_page_index)
        cover_hash = self.calculateHash(cover_image_data)

        # check the aspect ratio
        # if it's wider than it is high, it's probably a two page spread
        # if so, crop it and calculate a second hash
        narrow_cover_hash = None
        aspect_ratio = self.getAspectRatio(cover_image_data)
        if aspect_ratio < 1.0:
            right_side_image_data = self.cropCover(cover_image_data)
            if right_side_image_data is not None:
                narrow_cover_hash = self.calculateHash(right_side_image_data)

        #self.log_msg("Cover hash = {0:016x}".format(cover_hash))

        keys = self.getSearchKeys()
        # normalize the issue number
        keys['issue_number'] = IssueString(keys['issue_number']).asString()

        # we need, at minimum, a series and issue number
        if keys['series'] is None or keys['issue_number'] is None:
            self.log_msg("Not enough info for a search!")
            return []

        self.log_msg("Going to search for:")
        self.log_msg("\tSeries: " + keys['series'])
        self.log_msg("\tIssue: " + keys['issue_number'])
        if keys['issue_count'] is not None:
            self.log_msg("\tCount: " + str(keys['issue_count']))
        if keys['year'] is not None:
            self.log_msg("\tYear: " + str(keys['year']))
        if keys['month'] is not None:
            self.log_msg("\tMonth: " + str(keys['month']))
        #self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist))

        comicVine = ComicVineTalker()
        comicVine.wait_for_rate_limit = self.waitAndRetryOnRateLimit
        comicVine.setLogFunc(self.output_function)

        # self.log_msg(("Searching for " + keys['series'] + "...")
        self.log_msg(u"Searching for {0} #{1} ...".format(
            keys['series'], keys['issue_number']))
        try:
            cv_search_results = comicVine.searchForSeries(keys['series'])
        except ComicVineTalkerException:
            self.log_msg(
                "Network issue while searching for series. Aborting...")
            return []

        #self.log_msg("Found " + str(len(cv_search_results)) + " initial results")
        if self.cancel:
            return []

        if cv_search_results is None:
            return []

        series_second_round_list = []

        #self.log_msg("Removing results with too long names, banned publishers, or future start dates")
        for item in cv_search_results:
            length_approved = False
            publisher_approved = True
            date_approved = True

            # remove any series that starts after the issue year
            if keys['year'] is not None and str(
                    keys['year']).isdigit() and item['start_year'] is not None and str(
                    item['start_year']).isdigit():
                if int(keys['year']) < int(item['start_year']):
                    date_approved = False
            # assume that our search name is close to the actual name, say
            # within, e.g., 5 chars
            shortened_key = utils.removearticles(keys['series'])
            shortened_item_name = utils.removearticles(item['name'])
            if len(shortened_item_name) < (
                    len(shortened_key) + self.length_delta_thresh):
                length_approved = True
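            # Worked example (assuming a length_delta_thresh of 5): searching
            # for "Hulk" (4 chars) keeps candidate names shorter than 9 chars
            # after articles are removed, so "She-Hulk" (8) passes while
            # "Incredible Hulk" (15) is dropped.
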
            # remove any series from publishers on the blacklist
            if item['publisher'] is not None:
                publisher = item['publisher']['name']
                if publisher is not None and publisher.lower(
                ) in self.publisher_blacklist:
                    publisher_approved = False

            if length_approved and publisher_approved and date_approved:
                series_second_round_list.append(item)

        self.log_msg(
            "Searching in " + str(len(series_second_round_list)) + " series")

        if self.callback is not None:
            self.callback(0, len(series_second_round_list))

        # now sort the list by name length
        series_second_round_list.sort(
            key=lambda x: len(x['name']), reverse=False)

        # build a list of volume IDs
        volume_id_list = list()
        for series in series_second_round_list:
            volume_id_list.append(series['id'])

        try:
            issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear(
                volume_id_list,
                keys['issue_number'],
                keys['year'])
        except ComicVineTalkerException:
            self.log_msg(
                "Network issue while searching for series details. Aborting...")
            return []

        if issue_list is None:
            return []

        shortlist = list()
        # now re-associate the issues and volumes
        for issue in issue_list:
            for series in series_second_round_list:
                if series['id'] == issue['volume']['id']:
                    shortlist.append((series, issue))
                    break

        if keys['year'] is None:
            self.log_msg(u"Found {0} series that have an issue #{1}".format(
                len(shortlist), keys['issue_number']))
        else:
            self.log_msg(
                u"Found {0} series that have an issue #{1} from {2}".format(
                    len(shortlist),
                    keys['issue_number'],
                    keys['year']))

        # now we have a shortlist of volumes with the desired issue number
        # Do first round of cover matching
        counter = len(shortlist)
        for series, issue in shortlist:
            if self.callback is not None:
                self.callback(counter, len(shortlist) * 3)
                counter += 1

            self.log_msg(u"Examining covers for ID: {0} {1} ({2}) ...".format(
                series['id'],
                series['name'],
                series['start_year']), newline=False)

            # parse out the cover date
            day, month, year = comicVine.parseDateStr(issue['cover_date'])

            # Now check the cover match against the primary image
            hash_list = [cover_hash]
            if narrow_cover_hash is not None:
                hash_list.append(narrow_cover_hash)

            try:
                image_url = issue['image']['super_url']
                thumb_url = issue['image']['thumb_url']
                page_url = issue['site_detail_url']

                score_item = self.getIssueCoverMatchScore(
                    comicVine,
                    issue['id'],
                    image_url,
                    thumb_url,
                    page_url,
                    hash_list,
                    useRemoteAlternates=False)
            except:
                self.match_list = []
                return self.match_list

            match = dict()
            match['series'] = u"{0} ({1})".format(
                series['name'], series['start_year'])
            match['distance'] = score_item['score']
            match['issue_number'] = keys['issue_number']
            match['cv_issue_count'] = series['count_of_issues']
            match['url_image_hash'] = score_item['hash']
            match['issue_title'] = issue['name']
            match['issue_id'] = issue['id']
            match['volume_id'] = series['id']
            match['month'] = month
            match['year'] = year
            match['publisher'] = None
            if series['publisher'] is not None:
                match['publisher'] = series['publisher']['name']
            match['image_url'] = image_url
            match['thumb_url'] = thumb_url
            match['page_url'] = page_url
            match['description'] = issue['description']

            self.match_list.append(match)

            self.log_msg(" --> {0}".format(match['distance']), newline=False)
            self.log_msg("")

        if len(self.match_list) == 0:
self.log_msg(":-(no matches!")
            self.search_result = self.ResultNoMatches
            return self.match_list

        # sort list by image match scores
        self.match_list.sort(key=lambda k: k['distance'])

        l = []
        for i in self.match_list:
            l.append(i['distance'])

        self.log_msg("Compared to covers in {0} issue(s):".format(
            len(self.match_list)), newline=False)
        self.log_msg(str(l))

        def print_match(item):
            self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format(
                item['series'],
                item['issue_number'],
                item['issue_title'],
                item['month'],
                item['year'],
                item['distance']))

        best_score = self.match_list[0]['distance']

        if best_score >= self.min_score_thresh:
            # we have 1 or more low-confidence matches (all bad cover scores)
            # look at a few more pages in the archive, and also alternate
            # covers online
            self.log_msg(
                "Very weak scores for the cover. Analyzing alternate pages and covers...")
            hash_list = [cover_hash]
            if narrow_cover_hash is not None:
                hash_list.append(narrow_cover_hash)
            for i in range(1, min(3, ca.getNumberOfPages())):
                image_data = ca.getPage(i)
                page_hash = self.calculateHash(image_data)
                hash_list.append(page_hash)

            second_match_list = []
            counter = 2 * len(self.match_list)
            for m in self.match_list:
                if self.callback is not None:
                    self.callback(counter, len(self.match_list) * 3)
                    counter += 1
                self.log_msg(
                    u"Examining alternate covers for ID: {0} {1} ...".format(
                        m['volume_id'],
                        m['series']),
                    newline=False)
                try:
                    score_item = self.getIssueCoverMatchScore(
                        comicVine,
                        m['issue_id'],
                        m['image_url'],
                        m['thumb_url'],
                        m['page_url'],
                        hash_list,
                        useRemoteAlternates=True)
                except:
                    self.match_list = []
                    return self.match_list

                self.log_msg("--->{0}".format(score_item['score']))
                self.log_msg("")

                if score_item['score'] < self.min_alternate_score_thresh:
                    second_match_list.append(m)
                    m['distance'] = score_item['score']

            if len(second_match_list) == 0:
                if len(self.match_list) == 1:
                    self.log_msg("No matching pages in the issue.")
                    self.log_msg(
                        u"--------------------------------------------------------------------------")
                    print_match(self.match_list[0])
                    self.log_msg(
                        u"--------------------------------------------------------------------------")
                    self.search_result = self.ResultFoundMatchButBadCoverScore
                else:
                    self.log_msg(
                        u"--------------------------------------------------------------------------")
                    self.log_msg(
                        u"Multiple bad cover matches! Need to use other info...")
                    self.log_msg(
                        u"--------------------------------------------------------------------------")
                    self.search_result = self.ResultMultipleMatchesWithBadImageScores
                return self.match_list
            else:
                # We did good, found something!
                self.log_msg("Success in secondary/alternate cover matching!")

                self.match_list = second_match_list
                # sort new list by image match scores
                self.match_list.sort(key=lambda k: k['distance'])
                best_score = self.match_list[0]['distance']
                self.log_msg(
                    "[Second round cover matching: best score = {0}]".format(best_score))
                # now drop down into the rest of the processing

        if self.callback is not None:
            self.callback(99, 100)

        # now pare down list, remove any item more than specified distant from
        # the top scores
        for item in reversed(self.match_list):
            if item['distance'] > best_score + self.min_score_distance:
                self.match_list.remove(item)
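        # e.g. if the best remaining distance is 6 and min_score_distance is 4
        # (the default set in __init__), any match with a distance greater
        # than 10 is dropped here.
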
        # One more test for the case choosing limited series first issue vs a trade with the same cover:
        # if we have a given issue count > 1 and the volume from CV has
        # count==1, remove it from match list
        if len(self.match_list) >= 2 and keys[
                'issue_count'] is not None and keys['issue_count'] != 1:
            new_list = list()
            for match in self.match_list:
                if match['cv_issue_count'] != 1:
                    new_list.append(match)
                else:
                    self.log_msg(
                        "Removing volume {0} [{1}] from consideration (only 1 issue)".format(
                            match['series'],
                            match['volume_id']))

            if len(new_list) > 0:
                self.match_list = new_list

        if len(self.match_list) == 1:
            self.log_msg(
                u"--------------------------------------------------------------------------")
            print_match(self.match_list[0])
            self.log_msg(
                u"--------------------------------------------------------------------------")
            self.search_result = self.ResultOneGoodMatch

        elif len(self.match_list) == 0:
            self.log_msg(
                u"--------------------------------------------------------------------------")
            self.log_msg("No matches found :(")
            self.log_msg(
                u"--------------------------------------------------------------------------")
            self.search_result = self.ResultNoMatches

        else:
            # we've got multiple good matches:
            self.log_msg("More than one likely candidate.")
            self.search_result = self.ResultMultipleGoodMatches
            self.log_msg(
                u"--------------------------------------------------------------------------")
            for item in self.match_list:
                print_match(item)
            self.log_msg(
                u"--------------------------------------------------------------------------")

        return self.match_list