# This file is part of Mylar. # # Mylar is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public # License for more details. # # You should have received a copy of the GNU General Public License # along with Mylar. If not, see . import sys import os import re import time import logger import string import urllib2 import lib.feedparser import mylar import platform from mylar.helpers import cvapi_check from bs4 import BeautifulSoup as Soup import httplib def patch_http_response_read(func): def inner(*args): try: return func(*args) except httplib.IncompleteRead, e: return e.partial return inner httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read) if platform.python_version() == '2.7.6': httplib.HTTPConnection._http_vsn = 10 httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' def pulldetails(comicid, type, issueid=None, offset=1, arclist=None, comicidlist=None): #import easy to use xml parser called minidom: from xml.dom.minidom import parseString if mylar.COMICVINE_API == 'None' or mylar.COMICVINE_API is None or mylar.COMICVINE_API == mylar.DEFAULT_CVAPI: logger.warn('You have not specified your own ComicVine API key - alot of things will be limited. Get your own @ http://api.comicvine.com.') comicapi = mylar.DEFAULT_CVAPI else: comicapi = mylar.COMICVINE_API if type == 'comic': if not comicid.startswith('4050-'): comicid = '4050-' + comicid PULLURL = mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck,aliases' elif type == 'issue': if mylar.CV_ONLY: cv_type = 'issues' if arclist is None: searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date' else: searchset = 'filter=id:' + (arclist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume' else: cv_type = 'volume/' + str(comicid) searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date' PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset) elif type == 'firstissue': #this is used ONLY for CV_ONLY PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date' elif type == 'storyarc': PULLURL = mylar.CVURL + 'story_arcs/?api_key=' + str(comicapi) + '&format=xml&filter=name:' + str(issueid) + '&field_list=cover_date' elif type == 'comicyears': PULLURL = mylar.CVURL + 'volumes/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(comicidlist) + '&field_list=name,id,start_year,publisher&offset=' + str(offset) #logger.info('PULLURL: ' + PULLURL) #CV API Check here. if mylar.CVAPI_COUNT == 0 or mylar.CVAPI_COUNT >= mylar.CVAPI_MAX: cvapi_check() #download the file: file = urllib2.urlopen(PULLURL) #increment CV API counter. mylar.CVAPI_COUNT += 1 #convert to string: data = file.read() #close file because we dont need it anymore: file.close() #parse the xml you downloaded dom = parseString(data) return dom def getComic(comicid, type, issueid=None, arc=None, arcid=None, arclist=None, comicidlist=None): if type == 'issue': offset = 1 issue = {} ndic = [] issuechoice = [] comicResults = [] firstdate = '2099-00-00' #let's find out how many results we get from the query... if comicid is None: #if comicid is None, it's coming from the story arc search results. id = arcid islist = arclist else: id = comicid islist = None searched = pulldetails(id, 'issue', None, 0, islist) if searched is None: return False totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText logger.fdebug("there are " + str(totalResults) + " search results...") if not totalResults: return False countResults = 0 while (countResults < int(totalResults)): logger.fdebug("querying " + str(countResults)) if countResults > 0: #new api - have to change to page # instead of offset count offsetcount = countResults searched = pulldetails(id, 'issue', None, offsetcount, islist) issuechoice, tmpdate = GetIssuesInfo(id, searched, arcid) if tmpdate < firstdate: firstdate = tmpdate ndic = ndic + issuechoice #search results are limited to 100 and by pagination now...let's account for this. countResults = countResults + 100 issue['issuechoice'] = ndic issue['firstdate'] = firstdate return issue elif type == 'comic': dom = pulldetails(comicid, 'comic', None, 1) return GetComicInfo(comicid, dom) elif type == 'firstissue': dom = pulldetails(comicid, 'firstissue', issueid, 1) return GetFirstIssue(issueid, dom) elif type == 'storyarc': dom = pulldetails(arc, 'storyarc', None, 1) return GetComicInfo(issueid, dom) elif type == 'comicyears': #used by the story arc searcher when adding a given arc to poll each ComicID in order to populate the Series Year. #this grabs each issue based on issueid, and then subsets the comicid for each to be used later. #set the offset to 0, since we're doing a filter. dom = pulldetails(arcid, 'comicyears', offset=0, comicidlist=comicidlist) return GetSeriesYears(dom) def GetComicInfo(comicid, dom, safechk=None): if safechk is None: #safetycheck when checking comicvine. If it times out, increment the chk on retry attempts up until 5 tries then abort. safechk = 1 elif safechk > 4: logger.error('Unable to add / refresh the series due to inablity to retrieve data from ComicVine. You might want to try abit later and/or make sure ComicVine is up.') return #comicvine isn't as up-to-date with issue counts.. #so this can get really buggered, really fast. tracks = dom.getElementsByTagName('issue') try: cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText except: cntit = len(tracks) trackcnt = len(tracks) logger.fdebug("number of issues I counted: " + str(trackcnt)) logger.fdebug("number of issues CV says it has: " + str(cntit)) # if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason if int(trackcnt) != int(cntit): cntit = trackcnt vari = "yes" else: vari = "no" logger.fdebug("vari is set to: " + str(vari)) #if str(trackcnt) != str(int(cntit)+2): # cntit = int(cntit) + 1 comic = {} comicchoice = [] cntit = int(cntit) #retrieve the first xml tag (data) #that the parser finds with name tagName: # to return the parent name of the node : dom.getElementsByTagName('name')[0].parentNode.nodeName # where [0] denotes the number of the name field(s) # where nodeName denotes the parentNode : ComicName = results, publisher = publisher, issues = issue try: names = len(dom.getElementsByTagName('name')) n = 0 while (n < names): if dom.getElementsByTagName('name')[n].parentNode.nodeName == 'results': try: comic['ComicName'] = dom.getElementsByTagName('name')[n].firstChild.wholeText comic['ComicName'] = comic['ComicName'].rstrip() except: logger.error('There was a problem retrieving the given data from ComicVine. Ensure that www.comicvine.com is accessible AND that you have provided your OWN ComicVine API key.') return elif dom.getElementsByTagName('name')[n].parentNode.nodeName == 'publisher': try: comic['ComicPublisher'] = dom.getElementsByTagName('name')[n].firstChild.wholeText except: comic['ComicPublisher'] = "Unknown" n += 1 except: logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible') return try: comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText except: comic['ComicYear'] = '0000' try: comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText except: #this should never be an exception. If it is, it's probably due to CV timing out - so let's sleep for abit then retry. logger.warn('Unable to retrieve URL for volume. This is usually due to a timeout to CV, or going over the API. Retrying again in 10s.') time.sleep(10) safechk +=1 GetComicInfo(comicid, dom, safechk) desdeck = 0 #the description field actually holds the Volume# - so let's grab it try: descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText comic_desc = drophtml(descchunk) desdeck +=1 except: comic_desc = 'None' #sometimes the deck has volume labels try: deckchunk = dom.getElementsByTagName('deck')[0].firstChild.wholeText comic_deck = deckchunk desdeck +=1 except: comic_deck = 'None' #comic['ComicDescription'] = comic_desc try: comic['Aliases'] = dom.getElementsByTagName('aliases')[0].firstChild.wholeText #logger.fdebug('Aliases: ' + str(aliases)) except: comic['Aliases'] = 'None' comic['ComicVersion'] = 'noversion' #logger.info('comic_desc:' + comic_desc) #logger.info('comic_deck:' + comic_deck) #logger.info('desdeck: ' + str(desdeck)) while (desdeck > 0): if desdeck == 1: if comic_desc == 'None': comicDes = comic_deck[:30] else: #extract the first 60 characters comicDes = comic_desc[:60].replace('New 52', '') elif desdeck == 2: #extract the characters from the deck comicDes = comic_deck[:30].replace('New 52', '') else: break i = 0 while (i < 2): if 'volume' in comicDes.lower(): #found volume - let's grab it. v_find = comicDes.lower().find('volume') #arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #) #increased to 10 to allow for text numbering (+5 max) #sometimes it's volume 5 and ocassionally it's fifth volume. if i == 0: vfind = comicDes[v_find:v_find +15] #if it's volume 5 format basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('volume X format - ' + str(i) + ': ' + vfind) else: vfind = comicDes[:v_find] # if it's fifth volume format basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('X volume format - ' + str(i) + ': ' + vfind) volconv = '' for nums in basenums: if nums in vfind.lower(): sconv = basenums[nums] vfind = re.sub(nums, sconv, vfind.lower()) break #logger.info('volconv: ' + str(volconv)) #now we attempt to find the character position after the word 'volume' if i == 0: volthis = vfind.lower().find('volume') volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;) elif i == 1: volthis = vfind.lower().find('volume') vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;) if '(' in vfind: #bracket detected in versioning' vfindit = re.findall('[^()]+', vfind) vfind = vfindit[0] vf = re.findall('[^<>]+', vfind) ledigit = re.sub("[^0-9]", "", vf[0]) if ledigit != '': comic['ComicVersion'] = ledigit logger.fdebug("Volume information found! Adding to series record : volume " + comic['ComicVersion']) break i += 1 else: i += 1 if comic['ComicVersion'] == 'noversion': logger.fdebug('comic[ComicVersion]:' + str(comic['ComicVersion'])) desdeck -= 1 else: break if vari == "yes": comic['ComicIssues'] = str(cntit) else: comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText comic['ComicImageALT'] = dom.getElementsByTagName('small_url')[0].firstChild.wholeText comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText # print ("fistIss:" + str(comic['FirstIssueID'])) # comicchoice.append({ # 'ComicName': comic['ComicName'], # 'ComicYear': comic['ComicYear'], # 'Comicid': comicid, # 'ComicURL': comic['ComicURL'], # 'ComicIssues': comic['ComicIssues'], # 'ComicImage': comic['ComicImage'], # 'ComicVolume': ParseVol, # 'ComicPublisher': comic['ComicPublisher'] # }) # comic['comicchoice'] = comicchoice return comic def GetIssuesInfo(comicid, dom, arcid=None): subtracks = dom.getElementsByTagName('issue') if not mylar.CV_ONLY: cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText logger.fdebug("issues I've counted: " + str(len(subtracks))) logger.fdebug("issues CV says it has: " + str(int(cntiss))) if int(len(subtracks)) != int(cntiss): logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks))) cntiss = len(subtracks) # assume count of issues is wrong, go with ACTUAL physical api count cntiss = int(cntiss) n = cntiss -1 else: n = int(len(subtracks)) tempissue = {} issuech = [] firstdate = '2099-00-00' for subtrack in subtracks: if not mylar.CV_ONLY: if (dom.getElementsByTagName('name')[n].firstChild) is not None: issue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText else: issue['Issue_Name'] = 'None' issue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText issue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText issuech.append({ 'Issue_ID': issue['Issue_ID'], 'Issue_Number': issue['Issue_Number'], 'Issue_Name': issue['Issue_Name'] }) else: try: totnames = len(subtrack.getElementsByTagName('name')) tot = 0 while (tot < totnames): if subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'volume': tempissue['ComicName'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText elif subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'issue': try: tempissue['Issue_Name'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText except: tempissue['Issue_Name'] = None tot += 1 except: tempissue['ComicName'] = 'None' try: totids = len(subtrack.getElementsByTagName('id')) idt = 0 while (idt < totids): if subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'volume': tempissue['Comic_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText elif subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'issue': tempissue['Issue_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText idt += 1 except: tempissue['Issue_Name'] = 'None' try: tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText except: tempissue['CoverDate'] = '0000-00-00' try: tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText except: tempissue['StoreDate'] = '0000-00-00' try: tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText except: logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.') if arcid is None: issuech.append({ 'Comic_ID': comicid, 'Issue_ID': tempissue['Issue_ID'], 'Issue_Number': tempissue['Issue_Number'], 'Issue_Date': tempissue['CoverDate'], 'Store_Date': tempissue['StoreDate'], 'Issue_Name': tempissue['Issue_Name'] }) else: issuech.append({ 'ArcID': arcid, 'ComicName': tempissue['ComicName'], 'ComicID': tempissue['Comic_ID'], 'IssueID': tempissue['Issue_ID'], 'Issue_Number': tempissue['Issue_Number'], 'Issue_Date': tempissue['CoverDate'], 'Store_Date': tempissue['StoreDate'], 'Issue_Name': tempissue['Issue_Name'] }) if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00': firstdate = tempissue['CoverDate'] n-= 1 #issue['firstdate'] = firstdate return issuech, firstdate def GetFirstIssue(issueid, dom): #if the Series Year doesn't exist, get the first issue and take the date from that try: first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText except: first_year = '0000' return first_year the_year = first_year[:4] the_month = first_year[5:7] the_date = the_year + '-' + the_month return the_year def GetSeriesYears(dom): #used by the 'add a story arc' option to individually populate the Series Year for each series within the given arc. #series year is required for alot of functionality. series = dom.getElementsByTagName('volume') tempseries = {} serieslist = [] for dm in series: try: totids = len(dm.getElementsByTagName('id')) idc = 0 while (idc < totids): if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'volume': tempseries['ComicID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText idc+=1 except: logger.warn('There was a problem retrieving a comicid for a series within the arc. This will have to manually corrected most likely.') tempseries['ComicID'] = 'None' tempseries['Series'] = 'None' tempseries['Publisher'] = 'None' try: totnames = len(dm.getElementsByTagName('name')) namesc = 0 while (namesc < totnames): if dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'volume': tempseries['Series'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText elif dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'publisher': tempseries['Publisher'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText namesc+=1 except: logger.warn('There was a problem retrieving a Series Name or Publisher for a series within the arc. This will have to manually corrected.') try: tempseries['SeriesYear'] = dm.getElementsByTagName('start_year')[0].firstChild.wholeText except: logger.warn('There was a problem retrieving the start year for a particular series within the story arc.') tempseries['SeriesYear'] = '0000' serieslist.append({"ComicID": tempseries['ComicID'], "ComicName": tempseries['Series'], "SeriesYear": tempseries['SeriesYear'], "Publisher": tempseries['Publisher']}) return serieslist def drophtml(html): from bs4 import BeautifulSoup soup = BeautifulSoup(html) text_parts = soup.findAll(text=True) #print ''.join(text_parts) return ''.join(text_parts)