"""A class to automatically identify a comic archive""" # Copyright 2012-2014 Anthony Beville # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import sys import StringIO #import math #import urllib2 #import urllib try: from PIL import Image from PIL import WebPImagePlugin pil_available = True except ImportError: pil_available = False from genericmetadata import GenericMetadata from comicvinetalker import ComicVineTalker, ComicVineTalkerException from imagehasher import ImageHasher from imagefetcher import ImageFetcher, ImageFetcherException from issuestring import IssueString import utils #from settings import ComicTaggerSettings #from comicvinecacher import ComicVineCacher class IssueIdentifierNetworkError(Exception): pass class IssueIdentifierCancelled(Exception): pass class IssueIdentifier: ResultNoMatches = 0 ResultFoundMatchButBadCoverScore = 1 ResultFoundMatchButNotFirstPage = 2 ResultMultipleMatchesWithBadImageScores = 3 ResultOneGoodMatch = 4 ResultMultipleGoodMatches = 5 def __init__(self, comic_archive, settings): self.comic_archive = comic_archive self.image_hasher = 1 self.onlyUseAdditionalMetaData = False # a decent hamming score, good enough to call it a match self.min_score_thresh = 16 # for alternate covers, be more stringent, since we're a bit more # scattershot in comparisons self.min_alternate_score_thresh = 12 # the min distance a hamming score must be to separate itself from # closest neighbor self.min_score_distance = 4 # a very strong hamming score, almost certainly the same image self.strong_score_thresh = 8 # used to eliminate series names that are too long based on our search # string self.length_delta_thresh = settings.id_length_delta_thresh # used to eliminate unlikely publishers self.publisher_blacklist = [ s.strip().lower() for s in settings.id_publisher_blacklist.split(',')] self.additional_metadata = GenericMetadata() self.output_function = IssueIdentifier.defaultWriteOutput self.callback = None self.coverUrlCallback = None self.search_result = self.ResultNoMatches self.cover_page_index = 0 self.cancel = False self.waitAndRetryOnRateLimit = False def setScoreMinThreshold(self, thresh): self.min_score_thresh = thresh def setScoreMinDistance(self, distance): self.min_score_distance = distance def setAdditionalMetadata(self, md): self.additional_metadata = md def setNameLengthDeltaThreshold(self, delta): self.length_delta_thresh = delta def setPublisherBlackList(self, blacklist): self.publisher_blacklist = blacklist def setHasherAlgorithm(self, algo): self.image_hasher = algo pass def setOutputFunction(self, func): self.output_function = func pass def calculateHash(self, image_data): if self.image_hasher == '3': return ImageHasher(data=image_data).dct_average_hash() elif self.image_hasher == '2': return ImageHasher(data=image_data).average_hash2() else: return ImageHasher(data=image_data).average_hash() def getAspectRatio(self, image_data): try: im = Image.open(StringIO.StringIO(image_data)) w, h = im.size return float(h) / float(w) except: return 1.5 def cropCover(self, image_data): im = Image.open(StringIO.StringIO(image_data)) w, h = im.size try: cropped_im = im.crop((int(w / 2), 0, w, h)) except Exception as e: sys.exc_clear() print "cropCover() error:", e return None output = StringIO.StringIO() cropped_im.save(output, format="PNG") cropped_image_data = output.getvalue() output.close() return cropped_image_data def setProgressCallback(self, cb_func): self.callback = cb_func def setCoverURLCallback(self, cb_func): self.coverUrlCallback = cb_func def getSearchKeys(self): ca = self.comic_archive search_keys = dict() search_keys['series'] = None search_keys['issue_number'] = None search_keys['month'] = None search_keys['year'] = None search_keys['issue_count'] = None if ca is None: return if self.onlyUseAdditionalMetaData: search_keys['series'] = self.additional_metadata.series search_keys['issue_number'] = self.additional_metadata.issue search_keys['year'] = self.additional_metadata.year search_keys['month'] = self.additional_metadata.month search_keys['issue_count'] = self.additional_metadata.issueCount return search_keys # see if the archive has any useful meta data for searching with if ca.hasCIX(): internal_metadata = ca.readCIX() elif ca.hasCBI(): internal_metadata = ca.readCBI() else: internal_metadata = ca.readCBI() # try to get some metadata from filename md_from_filename = ca.metadataFromFilename() # preference order: # 1. Additional metadata # 1. Internal metadata # 1. Filename metadata if self.additional_metadata.series is not None: search_keys['series'] = self.additional_metadata.series elif internal_metadata.series is not None: search_keys['series'] = internal_metadata.series else: search_keys['series'] = md_from_filename.series if self.additional_metadata.issue is not None: search_keys['issue_number'] = self.additional_metadata.issue elif internal_metadata.issue is not None: search_keys['issue_number'] = internal_metadata.issue else: search_keys['issue_number'] = md_from_filename.issue if self.additional_metadata.year is not None: search_keys['year'] = self.additional_metadata.year elif internal_metadata.year is not None: search_keys['year'] = internal_metadata.year else: search_keys['year'] = md_from_filename.year if self.additional_metadata.month is not None: search_keys['month'] = self.additional_metadata.month elif internal_metadata.month is not None: search_keys['month'] = internal_metadata.month else: search_keys['month'] = md_from_filename.month if self.additional_metadata.issueCount is not None: search_keys['issue_count'] = self.additional_metadata.issueCount elif internal_metadata.issueCount is not None: search_keys['issue_count'] = internal_metadata.issueCount else: search_keys['issue_count'] = md_from_filename.issueCount return search_keys @staticmethod def defaultWriteOutput(text): sys.stdout.write(text) sys.stdout.flush() def log_msg(self, msg, newline=True): self.output_function(msg) if newline: self.output_function("\n") def getIssueCoverMatchScore( self, comicVine, issue_id, primary_img_url, primary_thumb_url, page_url, localCoverHashList, useRemoteAlternates=False, useLog=True): # localHashes is a list of pre-calculated hashs. # useRemoteAlternates - indicates to use alternate covers from CV try: url_image_data = ImageFetcher().fetch( primary_thumb_url, blocking=True) except ImageFetcherException: self.log_msg( "Network issue while fetching cover image from Comic Vine. Aborting...") raise IssueIdentifierNetworkError if self.cancel: raise IssueIdentifierCancelled # alert the GUI, if needed if self.coverUrlCallback is not None: self.coverUrlCallback(url_image_data) remote_cover_list = [] item = dict() item['url'] = primary_img_url item['hash'] = self.calculateHash(url_image_data) remote_cover_list.append(item) if self.cancel: raise IssueIdentifierCancelled if useRemoteAlternates: alt_img_url_list = comicVine.fetchAlternateCoverURLs( issue_id, page_url) for alt_url in alt_img_url_list: try: alt_url_image_data = ImageFetcher().fetch( alt_url, blocking=True) except ImageFetcherException: self.log_msg( "Network issue while fetching alt. cover image from Comic Vine. Aborting...") raise IssueIdentifierNetworkError if self.cancel: raise IssueIdentifierCancelled # alert the GUI, if needed if self.coverUrlCallback is not None: self.coverUrlCallback(alt_url_image_data) item = dict() item['url'] = alt_url item['hash'] = self.calculateHash(alt_url_image_data) remote_cover_list.append(item) if self.cancel: raise IssueIdentifierCancelled if useLog and useRemoteAlternates: self.log_msg( "[{0} alt. covers]".format(len(remote_cover_list) - 1), False) if useLog: self.log_msg("[ ", False) score_list = [] done = False for local_cover_hash in localCoverHashList: for remote_cover_item in remote_cover_list: score = ImageHasher.hamming_distance( local_cover_hash, remote_cover_item['hash']) score_item = dict() score_item['score'] = score score_item['url'] = remote_cover_item['url'] score_item['hash'] = remote_cover_item['hash'] score_list.append(score_item) if useLog: self.log_msg("{0}".format(score), False) if score <= self.strong_score_thresh: # such a good score, we can quit now, since for sure we # have a winner done = True break if done: break if useLog: self.log_msg(" ]", False) best_score_item = min(score_list, key=lambda x: x['score']) return best_score_item # def validate(self, issue_id): # create hash list # score = self.getIssueMatchScore(issue_id, hash_list, useRemoteAlternates = True) # if score < 20: # return True # else: # return False def search(self): ca = self.comic_archive self.match_list = [] self.cancel = False self.search_result = self.ResultNoMatches if not pil_available: self.log_msg( "Python Imaging Library (PIL) is not available and is needed for issue identification.") return self.match_list if not ca.seemsToBeAComicArchive(): self.log_msg( "Sorry, but " + opts.filename + " is not a comic archive!") return self.match_list cover_image_data = ca.getPage(self.cover_page_index) cover_hash = self.calculateHash(cover_image_data) # check the aspect ratio # if it's wider than it is high, it's probably a two page spread # if so, crop it and calculate a second hash narrow_cover_hash = None aspect_ratio = self.getAspectRatio(cover_image_data) if aspect_ratio < 1.0: right_side_image_data = self.cropCover(cover_image_data) if right_side_image_data is not None: narrow_cover_hash = self.calculateHash(right_side_image_data) #self.log_msg("Cover hash = {0:016x}".format(cover_hash)) keys = self.getSearchKeys() # normalize the issue number keys['issue_number'] = IssueString(keys['issue_number']).asString() # we need, at minimum, a series and issue number if keys['series'] is None or keys['issue_number'] is None: self.log_msg("Not enough info for a search!") return [] self.log_msg("Going to search for:") self.log_msg("\tSeries: " + keys['series']) self.log_msg("\tIssue: " + keys['issue_number']) if keys['issue_count'] is not None: self.log_msg("\tCount: " + str(keys['issue_count'])) if keys['year'] is not None: self.log_msg("\tYear: " + str(keys['year'])) if keys['month'] is not None: self.log_msg("\tMonth: " + str(keys['month'])) #self.log_msg("Publisher Blacklist: " + str(self.publisher_blacklist)) comicVine = ComicVineTalker() comicVine.wait_for_rate_limit = self.waitAndRetryOnRateLimit comicVine.setLogFunc(self.output_function) # self.log_msg(("Searching for " + keys['series'] + "...") self.log_msg(u"Searching for {0} #{1} ...".format( keys['series'], keys['issue_number'])) try: cv_search_results = comicVine.searchForSeries(keys['series']) except ComicVineTalkerException: self.log_msg( "Network issue while searching for series. Aborting...") return [] #self.log_msg("Found " + str(len(cv_search_results)) + " initial results") if self.cancel: return [] if cv_search_results is None: return [] series_second_round_list = [] #self.log_msg("Removing results with too long names, banned publishers, or future start dates") for item in cv_search_results: length_approved = False publisher_approved = True date_approved = True # remove any series that starts after the issue year if keys['year'] is not None and str( keys['year']).isdigit() and item['start_year'] is not None and str( item['start_year']).isdigit(): if int(keys['year']) < int(item['start_year']): date_approved = False # assume that our search name is close to the actual name, say # within ,e.g. 5 chars shortened_key = utils.removearticles(keys['series']) shortened_item_name = utils.removearticles(item['name']) if len(shortened_item_name) < ( len(shortened_key) + self.length_delta_thresh): length_approved = True # remove any series from publishers on the blacklist if item['publisher'] is not None: publisher = item['publisher']['name'] if publisher is not None and publisher.lower( ) in self.publisher_blacklist: publisher_approved = False if length_approved and publisher_approved and date_approved: series_second_round_list.append(item) self.log_msg( "Searching in " + str(len(series_second_round_list)) + " series") if self.callback is not None: self.callback(0, len(series_second_round_list)) # now sort the list by name length series_second_round_list.sort( key=lambda x: len(x['name']), reverse=False) # build a list of volume IDs volume_id_list = list() for series in series_second_round_list: volume_id_list.append(series['id']) try: issue_list = comicVine.fetchIssuesByVolumeIssueNumAndYear( volume_id_list, keys['issue_number'], keys['year']) except ComicVineTalkerException: self.log_msg( "Network issue while searching for series details. Aborting...") return [] if issue_list is None: return [] shortlist = list() # now re-associate the issues and volumes for issue in issue_list: for series in series_second_round_list: if series['id'] == issue['volume']['id']: shortlist.append((series, issue)) break if keys['year'] is None: self.log_msg(u"Found {0} series that have an issue #{1}".format( len(shortlist), keys['issue_number'])) else: self.log_msg( u"Found {0} series that have an issue #{1} from {2}".format( len(shortlist), keys['issue_number'], keys['year'])) # now we have a shortlist of volumes with the desired issue number # Do first round of cover matching counter = len(shortlist) for series, issue in shortlist: if self.callback is not None: self.callback(counter, len(shortlist) * 3) counter += 1 self.log_msg(u"Examining covers for ID: {0} {1} ({2}) ...".format( series['id'], series['name'], series['start_year']), newline=False) # parse out the cover date day, month, year = comicVine.parseDateStr(issue['cover_date']) # Now check the cover match against the primary image hash_list = [cover_hash] if narrow_cover_hash is not None: hash_list.append(narrow_cover_hash) try: image_url = issue['image']['super_url'] thumb_url = issue['image']['thumb_url'] page_url = issue['site_detail_url'] score_item = self.getIssueCoverMatchScore( comicVine, issue['id'], image_url, thumb_url, page_url, hash_list, useRemoteAlternates=False) except: self.match_list = [] return self.match_list match = dict() match['series'] = u"{0} ({1})".format( series['name'], series['start_year']) match['distance'] = score_item['score'] match['issue_number'] = keys['issue_number'] match['cv_issue_count'] = series['count_of_issues'] match['url_image_hash'] = score_item['hash'] match['issue_title'] = issue['name'] match['issue_id'] = issue['id'] match['volume_id'] = series['id'] match['month'] = month match['year'] = year match['publisher'] = None if series['publisher'] is not None: match['publisher'] = series['publisher']['name'] match['image_url'] = image_url match['thumb_url'] = thumb_url match['page_url'] = page_url match['description'] = issue['description'] self.match_list.append(match) self.log_msg(" --> {0}".format(match['distance']), newline=False) self.log_msg("") if len(self.match_list) == 0: self.log_msg(":-(no matches!") self.search_result = self.ResultNoMatches return self.match_list # sort list by image match scores self.match_list.sort(key=lambda k: k['distance']) l = [] for i in self.match_list: l.append(i['distance']) self.log_msg("Compared to covers in {0} issue(s):".format( len(self.match_list)), newline=False) self.log_msg(str(l)) def print_match(item): self.log_msg(u"-----> {0} #{1} {2} ({3}/{4}) -- score: {5}".format( item['series'], item['issue_number'], item['issue_title'], item['month'], item['year'], item['distance'])) best_score = self.match_list[0]['distance'] if best_score >= self.min_score_thresh: # we have 1 or more low-confidence matches (all bad cover scores) # look at a few more pages in the archive, and also alternate # covers online self.log_msg( "Very weak scores for the cover. Analyzing alternate pages and covers...") hash_list = [cover_hash] if narrow_cover_hash is not None: hash_list.append(narrow_cover_hash) for i in range(1, min(3, ca.getNumberOfPages())): image_data = ca.getPage(i) page_hash = self.calculateHash(image_data) hash_list.append(page_hash) second_match_list = [] counter = 2 * len(self.match_list) for m in self.match_list: if self.callback is not None: self.callback(counter, len(self.match_list) * 3) counter += 1 self.log_msg( u"Examining alternate covers for ID: {0} {1} ...".format( m['volume_id'], m['series']), newline=False) try: score_item = self.getIssueCoverMatchScore( comicVine, m['issue_id'], m['image_url'], m['thumb_url'], m['page_url'], hash_list, useRemoteAlternates=True) except: self.match_list = [] return self.match_list self.log_msg("--->{0}".format(score_item['score'])) self.log_msg("") if score_item['score'] < self.min_alternate_score_thresh: second_match_list.append(m) m['distance'] = score_item['score'] if len(second_match_list) == 0: if len(self.match_list) == 1: self.log_msg("No matching pages in the issue.") self.log_msg( u"--------------------------------------------------------------------------") print_match(self.match_list[0]) self.log_msg( u"--------------------------------------------------------------------------") self.search_result = self.ResultFoundMatchButBadCoverScore else: self.log_msg( u"--------------------------------------------------------------------------") self.log_msg( u"Multiple bad cover matches! Need to use other info...") self.log_msg( u"--------------------------------------------------------------------------") self.search_result = self.ResultMultipleMatchesWithBadImageScores return self.match_list else: # We did good, found something! self.log_msg("Success in secondary/alternate cover matching!") self.match_list = second_match_list # sort new list by image match scores self.match_list.sort(key=lambda k: k['distance']) best_score = self.match_list[0]['distance'] self.log_msg( "[Second round cover matching: best score = {0}]".format(best_score)) # now drop down into the rest of the processing if self.callback is not None: self.callback(99, 100) # now pare down list, remove any item more than specified distant from # the top scores for item in reversed(self.match_list): if item['distance'] > best_score + self.min_score_distance: self.match_list.remove(item) # One more test for the case choosing limited series first issue vs a trade with the same cover: # if we have a given issue count > 1 and the volume from CV has # count==1, remove it from match list if len(self.match_list) >= 2 and keys[ 'issue_count'] is not None and keys['issue_count'] != 1: new_list = list() for match in self.match_list: if match['cv_issue_count'] != 1: new_list.append(match) else: self.log_msg( "Removing volume {0} [{1}] from consideration (only 1 issue)".format( match['series'], match['volume_id'])) if len(new_list) > 0: self.match_list = new_list if len(self.match_list) == 1: self.log_msg( u"--------------------------------------------------------------------------") print_match(self.match_list[0]) self.log_msg( u"--------------------------------------------------------------------------") self.search_result = self.ResultOneGoodMatch elif len(self.match_list) == 0: self.log_msg( u"--------------------------------------------------------------------------") self.log_msg("No matches found :(") self.log_msg( u"--------------------------------------------------------------------------") self.search_result = self.ResultNoMatches else: # we've got multiple good matches: self.log_msg("More than one likely candidate.") self.search_result = self.ResultMultipleGoodMatches self.log_msg( u"--------------------------------------------------------------------------") for item in self.match_list: print_match(item) self.log_msg( u"--------------------------------------------------------------------------") return self.match_list