#  This file is part of Mylar.
#
#  Mylar is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
#  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
#  License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with Mylar.  If not, see <http://www.gnu.org/licenses/>.

import sys
import os
import re
import time
import logger
import string
import urllib2
import lib.feedparser
import mylar
import platform
from bs4 import BeautifulSoup as Soup
import httplib
import lib.requests as requests


def patch_http_response_read(func):
    """Wrap ``func`` so that an ``httplib.IncompleteRead`` yields the partial data.

    The returned wrapper forwards all positional and keyword arguments to
    ``func``; if ``func`` raises ``httplib.IncompleteRead`` the bytes received
    so far (``e.partial``) are returned instead of propagating the error.
    """
    def inner(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except httplib.IncompleteRead as e:
            return e.partial
    return inner

httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read)

# On Python 2.7.6 specifically, force connections down to HTTP/1.0.
if platform.python_version() == '2.7.6':
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# Word -> digit tables used when looking for a volume number in free text.
# The first table is applied to the "volume X" form, the second to "X volume".
_VOLUME_AFTER_WORDS = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
# NOTE(review): 'nineth' looks like a misspelling of 'ninth'; kept as-is so
# matching behaviour is unchanged.
_VOLUME_BEFORE_WORDS = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}


def pulldetails(comicid, type, issueid=None, offset=1, arclist=None, comicidlist=None):
    """Build the ComicVine request for ``type``, fetch it and parse the XML.

    Parameters:
        comicid    -- volume id; used by the 'comic' and 'issue' queries.
        type       -- one of 'comic', 'issue', 'firstissue', 'storyarc',
                      'comicyears' or 'import'.
        issueid    -- value placed in the filter of the 'firstissue' and
                      'storyarc' queries.
        offset     -- appended as ``&offset=`` to the 'issue', 'comicyears'
                      and 'import' queries.
        arclist    -- '|'-separated issue ids; when given, the CV_ONLY 'issue'
                      query filters by id instead of by volume.
        comicidlist -- id filter for 'comicyears' / 'import' (a string for
                      'import', since it is concatenated directly).

    Returns the parsed ``xml.dom.minidom`` document, or None when ``type`` is
    not recognised or the HTTP request raises.  XML parse errors raised by
    ``parseString`` are not caught here.
    """
    #import easy to use xml parser called minidom:
    from xml.dom.minidom import parseString

    if mylar.COMICVINE_API == 'None' or mylar.COMICVINE_API is None or mylar.COMICVINE_API == mylar.DEFAULT_CVAPI:
        logger.warn('You have not specified your own ComicVine API key - alot of things will be limited. Get your own @ http://api.comicvine.com.')
        comicapi = mylar.DEFAULT_CVAPI
    else:
        comicapi = mylar.COMICVINE_API

    if type == 'comic':
        # str() first so a numeric id does not blow up on .startswith().
        comicid = str(comicid)
        if not comicid.startswith('4050-'):
            comicid = '4050-' + comicid
        PULLURL = mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck,aliases'
    elif type == 'issue':
        if mylar.CV_ONLY:
            cv_type = 'issues'
            if arclist is None:
                searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date'
            else:
                searchset = 'filter=id:' + (arclist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume'
        else:
            cv_type = 'volume/' + str(comicid)
            searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date'
        PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset)
    elif type == 'firstissue':
        #this is used ONLY for CV_ONLY
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date'
    elif type == 'storyarc':
        # NOTE(review): getComic passes the arc as `comicid` and None as
        # `issueid`, yet the filter below uses `issueid` - confirm which one
        # is intended.
        PULLURL = mylar.CVURL + 'story_arcs/?api_key=' + str(comicapi) + '&format=xml&filter=name:' + str(issueid) + '&field_list=cover_date'
    elif type == 'comicyears':
        PULLURL = mylar.CVURL + 'volumes/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(comicidlist) + '&field_list=name,id,start_year,publisher&offset=' + str(offset)
    elif type == 'import':
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + (comicidlist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume' + '&offset=' + str(offset)
    else:
        # Previously an unknown type fell through and failed later with an
        # unbound PULLURL; report it and bail out the same way a failed
        # request does.
        logger.warn('Unknown ComicVine query type: %s' % (type))
        return

    #CV API rate restriction - always wait at least 2 seconds between requests.
    if mylar.CVAPI_RATE is None or mylar.CVAPI_RATE < 2:
        time.sleep(2)
    else:
        time.sleep(mylar.CVAPI_RATE)

    #download the file:
    #set payload to None for now...
    payload = None
    # NOTE(review): TLS certificate verification is disabled for this request.
    verify = False

    try:
        r = requests.get(PULLURL, params=payload, verify=verify, headers=mylar.CV_HEADERS)
    except Exception as e:
        logger.warn('Error fetching data from ComicVine: %s' % (e))
        return

    logger.fdebug('cv status code : ' + str(r.status_code))
    dom = parseString(r.content)

    return dom


def getComic(comicid, type, issueid=None, arc=None, arcid=None, arclist=None, comicidlist=None):
    """Fetch data from ComicVine and hand it to the matching parser.

    ``type`` selects the operation:
        'issue'      -- page through every issue of ``comicid`` (or, when
                        ``comicid`` is None, the issues named in ``arclist``)
                        and return ``{'issuechoice': [...], 'firstdate': str}``;
                        returns False when a request fails or no result count
                        is reported.
        'comic'      -- return ``GetComicInfo`` for the volume.
        'firstissue' -- return ``GetFirstIssue`` for ``issueid``.
        'storyarc'   -- return ``GetComicInfo`` for the arc query.
        'comicyears' -- return ``GetSeriesYears`` for ``comicidlist``.
        'import'     -- look up the issue ids in ``comicidlist`` (a sequence)
                        100 at a time and return the combined
                        ``GetImportList`` results.
    Any other ``type`` returns None.
    """
    if type == 'issue':
        ndic = []
        firstdate = '2099-00-00'
        #let's find out how many results we get from the query...
        if comicid is None:
            #if comicid is None, it's coming from the story arc search results.
            query_id = arcid
            #since the arclist holds the issueids, and the pertinent reading order - we need to strip out the reading order so this works.
            # partition() keeps the whole entry when it has no comma (find()
            # returned -1 there and silently dropped the last character).
            islist = '|'.join(ac.partition(',')[0] for ac in arclist.split('|'))
        else:
            query_id = comicid
            islist = None

        searched = pulldetails(query_id, 'issue', None, 0, islist)
        if searched is None:
            return False
        totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText
        logger.fdebug("there are " + str(totalResults) + " search results...")
        if not totalResults:
            return False

        total = int(totalResults)
        countResults = 0
        while countResults < total:
            logger.fdebug("querying range from " + str(countResults) + " to " + str(countResults + 100))
            if countResults > 0:
                #the first page was already fetched above; fetch the next one.
                searched = pulldetails(query_id, 'issue', None, countResults, islist)
                if searched is None:
                    # A failed follow-up page used to crash inside
                    # GetIssuesInfo; treat it like a failed first page.
                    logger.warn('Unable to retrieve issue results from ComicVine at offset ' + str(countResults))
                    return False
            issuechoice, tmpdate = GetIssuesInfo(query_id, searched, arcid)
            if tmpdate < firstdate:
                firstdate = tmpdate
            ndic = ndic + issuechoice
            #search results are limited to 100 and by pagination now...let's account for this.
            countResults = countResults + 100

        issue = {}
        issue['issuechoice'] = ndic
        issue['firstdate'] = firstdate
        return issue

    elif type == 'comic':
        dom = pulldetails(comicid, 'comic', None, 1)
        return GetComicInfo(comicid, dom)
    elif type == 'firstissue':
        dom = pulldetails(comicid, 'firstissue', issueid, 1)
        return GetFirstIssue(issueid, dom)
    elif type == 'storyarc':
        dom = pulldetails(arc, 'storyarc', None, 1)
        return GetComicInfo(issueid, dom)
    elif type == 'comicyears':
        #used by the story arc searcher when adding a given arc to poll each ComicID in order to populate the Series Year.
        #this grabs each issue based on issueid, and then subsets the comicid for each to be used later.
        #set the offset to 0, since we're doing a filter.
        dom = pulldetails(arcid, 'comicyears', offset=0, comicidlist=comicidlist)
        return GetSeriesYears(dom)
    elif type == 'import':
        #used by the importer when doing a scan with metatagging enabled. If metatagging comes back true, then there's an IssueID present
        #within the tagging (with CT). This compiles all of the IssueID's during a scan (in 100's), and returns the corresponding CV data
        #related to the given IssueID's - namely ComicID, Name, Volume (more at some point, but those are the important ones).
        import_list = []
        logger.fdebug('comicidlist:' + str(comicidlist))

        #break it up by 100 per api hit
        id_count = 0
        while id_count < len(comicidlist):
            # Slicing clamps the final batch to the end of the list; the old
            # index arithmetic ran past it whenever the list length was not
            # a multiple of 100.
            batch = comicidlist[id_count:id_count + 100]
            tmpidlist = '|'.join(str(cid) for cid in batch)
            logger.info('tmpidlist: ' + str(tmpidlist))

            searched = pulldetails(None, 'import', offset=0, comicidlist=tmpidlist)
            if searched is None:
                # request failed - return whatever has been gathered so far.
                break
            import_list += GetImportList(searched)
            id_count += 100

        return import_list


def _extract_volume(comicDes):
    """Return the volume number found in ``comicDes`` as a digit string, or None.

    Two passes are made: the first looks at the text starting at the word
    'volume' ("volume 5"), the second at the text preceding it
    ("fifth volume").  Number words are swapped for digits before the
    characters next to 'volume' are inspected.
    """
    lowered = comicDes.lower()
    if 'volume' not in lowered:
        return None
    v_find = lowered.find('volume')

    for i in range(2):
        #arbitrarily grab the next chars after 'volume' (allowing for text numbering)
        #sometimes it's volume 5 and ocassionally it's fifth volume.
        if i == 0:
            vfind = comicDes[v_find:v_find + 15]   #if it's volume 5 format
            basenums = _VOLUME_AFTER_WORDS
            logger.fdebug('volume X format - ' + str(i) + ': ' + vfind)
        else:
            vfind = comicDes[:v_find]   # if it's fifth volume format
            basenums = _VOLUME_BEFORE_WORDS
            logger.fdebug('X volume format - ' + str(i) + ': ' + vfind)

        # only the first table entry found in the text is substituted.
        for nums in basenums:
            if nums in vfind.lower():
                vfind = re.sub(nums, basenums[nums], vfind.lower())
                break

        #now we attempt to find the character position around the word 'volume'
        volthis = vfind.lower().find('volume')
        if i == 0:
            volthis = volthis + 6  # add on the actual word to the position so that we can grab the subsequent digit
            vfind = vfind[volthis:volthis + 4]  # grab the next 4 characters
        else:
            vfind = vfind[volthis - 4:volthis]  # grab the 4 characters before the position

        if '(' in vfind:
            #bracket detected in versioning
            vfindit = re.findall('[^()]+', vfind)
            if not vfindit:
                # nothing but brackets - nothing to read on this pass.
                continue
            vfind = vfindit[0]

        vf = re.findall('[^<>]+', vfind)
        if not vf:
            continue
        ledigit = re.sub("[^0-9]", "", vf[0])
        if ledigit != '':
            return ledigit

    return None


def GetComicInfo(comicid, dom, safechk=None):
    """Extract series-level details from a ComicVine volume document.

    Parameters:
        comicid -- passed through unchanged to the retry call.
        dom     -- parsed XML document (None is treated as a failed fetch).
        safechk -- retry counter; None on the first call, and the function
                   gives up once it exceeds 4.

    Returns a dict with the keys ComicName / ComicPublisher (when present in
    the document), ComicYear, ComicURL, Aliases, ComicVersion, ComicIssues,
    ComicImage, ComicImageALT and FirstIssueID, or None when the data could
    not be retrieved.
    """
    if safechk is None:
        #safetycheck when checking comicvine. If it times out, increment the chk on retry attempts up until 5 tries then abort.
        safechk = 1
    elif safechk > 4:
        logger.error('Unable to add / refresh the series due to inablity to retrieve data from ComicVine. You might want to try abit later and/or make sure ComicVine is up.')
        return

    if dom is None:
        # pulldetails returns None when the request itself failed.
        logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible')
        return

    #comicvine isn't as up-to-date with issue counts..
    #so this can get really buggered, really fast.
    tracks = dom.getElementsByTagName('issue')
    try:
        cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
    except Exception:
        cntit = len(tracks)
    trackcnt = len(tracks)
    logger.fdebug("number of issues I counted: " + str(trackcnt))
    logger.fdebug("number of issues CV says it has: " + str(cntit))
    # if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason
    if int(trackcnt) != int(cntit):
        cntit = trackcnt
        vari = "yes"
    else:
        vari = "no"
    logger.fdebug("vari is set to: " + str(vari))

    comic = {}
    cntit = int(cntit)

    # Every <name> element is inspected; its parent tag tells us what it names:
    # parent 'results' -> the series name, parent 'publisher' -> the publisher.
    try:
        for name_node in dom.getElementsByTagName('name'):
            parent = name_node.parentNode.nodeName
            if parent == 'results':
                try:
                    comic['ComicName'] = name_node.firstChild.wholeText
                    comic['ComicName'] = comic['ComicName'].rstrip()
                except Exception:
                    logger.error('There was a problem retrieving the given data from ComicVine. Ensure that www.comicvine.com is accessible AND that you have provided your OWN ComicVine API key.')
                    return
            elif parent == 'publisher':
                try:
                    comic['ComicPublisher'] = name_node.firstChild.wholeText
                except Exception:
                    comic['ComicPublisher'] = "Unknown"
    except Exception:
        logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible')
        return

    try:
        comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText
    except Exception:
        comic['ComicYear'] = '0000'

    try:
        # the site_detail_url at index `trackcnt` (one past the issue count) is used.
        comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText
    except Exception:
        #this should never be an exception. If it is, it's probably due to CV timing out - so let's sleep for abit then retry.
        logger.warn('Unable to retrieve URL for volume. This is usually due to a timeout to CV, or going over the API. Retrying again in 10s.')
        time.sleep(10)
        # Return the retry's result: previously it was discarded and the code
        # carried on without a ComicURL.
        # NOTE(review): the retry re-parses the same `dom`; no new request is made.
        return GetComicInfo(comicid, dom, safechk + 1)

    desdeck = 0
    #the description field actually holds the Volume# - so let's grab it
    try:
        descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText
        comic_desc = drophtml(descchunk)
        desdeck += 1
    except Exception:
        comic_desc = 'None'

    #sometimes the deck has volume labels
    try:
        comic_deck = dom.getElementsByTagName('deck')[0].firstChild.wholeText
        desdeck += 1
    except Exception:
        comic_deck = 'None'

    try:
        comic['Aliases'] = dom.getElementsByTagName('aliases')[0].firstChild.wholeText
    except Exception:
        comic['Aliases'] = 'None'

    comic['ComicVersion'] = 'noversion'
    # desdeck counts how many of description/deck were found. With both
    # present the deck is searched first (desdeck == 2), then the description.
    while desdeck > 0:
        if desdeck == 1:
            if comic_desc == 'None':
                comicDes = comic_deck[:30]
            else:
                #extract the first 60 characters
                comicDes = comic_desc[:60].replace('New 52', '')
        elif desdeck == 2:
            #extract the first 30 characters from the deck
            comicDes = comic_deck[:30].replace('New 52', '')
        else:
            break

        version = _extract_volume(comicDes)
        if version is not None:
            comic['ComicVersion'] = version
            logger.fdebug("Volume information found! Adding to series record : volume " + comic['ComicVersion'])
            break

        logger.fdebug('comic[ComicVersion]:' + str(comic['ComicVersion']))
        desdeck -= 1

    if vari == "yes":
        comic['ComicIssues'] = str(cntit)
    else:
        comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText

    comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText
    comic['ComicImageALT'] = dom.getElementsByTagName('small_url')[0].firstChild.wholeText

    comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText

    return comic


def GetIssuesInfo(comicid, dom, arcid=None):
    """Collect per-issue details from one page of ComicVine results.

    Parameters:
        comicid -- stored as 'Comic_ID' on each entry when ``arcid`` is None.
        dom     -- parsed XML document containing <issue> elements.
        arcid   -- when given, entries use the story-arc key layout
                   (ArcID / ComicName / ComicID / IssueID ...).

    Returns ``(issues, firstdate)``: a list of dicts, one per <issue>, and the
    earliest cover date seen ('2099-00-00' when none was found).  Fields that
    are missing for an issue are reported with a placeholder value.
    """
    subtracks = dom.getElementsByTagName('issue')
    if not mylar.CV_ONLY:
        cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
        logger.fdebug("issues I've counted: " + str(len(subtracks)))
        logger.fdebug("issues CV says it has: " + str(int(cntiss)))

        if int(len(subtracks)) != int(cntiss):
            logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks)))
            cntiss = len(subtracks)  # assume count of issues is wrong, go with ACTUAL physical api count
        cntiss = int(cntiss)
        n = cntiss - 1
    else:
        n = int(len(subtracks))

    issuech = []
    firstdate = '2099-00-00'
    for subtrack in subtracks:
        if not mylar.CV_ONLY:
            # Document-wide lookups indexed by n, walking backwards.
            # (This branch used to write to an undefined `issue` dict.)
            tempissue = {}
            if (dom.getElementsByTagName('name')[n].firstChild) is not None:
                tempissue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
            else:
                tempissue['Issue_Name'] = 'None'

            tempissue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText
            tempissue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText

            issuech.append({
                'Issue_ID': tempissue['Issue_ID'],
                'Issue_Number': tempissue['Issue_Number'],
                'Issue_Name': tempissue['Issue_Name']
                })
        else:
            # Fresh defaults for every issue so a missing field can neither
            # raise KeyError nor inherit the previous issue's value.
            tempissue = {
                'ComicName': 'None',
                'Comic_ID': None,
                'Issue_ID': None,
                'Issue_Name': None,
                'Issue_Number': None,
            }

            try:
                for name_node in subtrack.getElementsByTagName('name'):
                    parent = name_node.parentNode.nodeName
                    if parent == 'volume':
                        tempissue['ComicName'] = name_node.firstChild.wholeText
                    elif parent == 'issue':
                        try:
                            tempissue['Issue_Name'] = name_node.firstChild.wholeText
                        except Exception:
                            tempissue['Issue_Name'] = None
            except Exception:
                tempissue['ComicName'] = 'None'

            try:
                for id_node in subtrack.getElementsByTagName('id'):
                    parent = id_node.parentNode.nodeName
                    if parent == 'volume':
                        tempissue['Comic_ID'] = id_node.firstChild.wholeText
                    elif parent == 'issue':
                        tempissue['Issue_ID'] = id_node.firstChild.wholeText
            except Exception:
                # NOTE(review): resetting Issue_Name (not an id field) here is
                # carried over unchanged from the previous implementation.
                tempissue['Issue_Name'] = 'None'

            try:
                tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText
            except Exception:
                tempissue['CoverDate'] = '0000-00-00'
            try:
                tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText
            except Exception:
                tempissue['StoreDate'] = '0000-00-00'
            try:
                tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText
            except Exception:
                logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')

            if arcid is None:
                issuech.append({
                    'Comic_ID': comicid,
                    'Issue_ID': tempissue['Issue_ID'],
                    'Issue_Number': tempissue['Issue_Number'],
                    'Issue_Date': tempissue['CoverDate'],
                    'Store_Date': tempissue['StoreDate'],
                    'Issue_Name': tempissue['Issue_Name']
                    })
            else:
                issuech.append({
                    'ArcID': arcid,
                    'ComicName': tempissue['ComicName'],
                    'ComicID': tempissue['Comic_ID'],
                    'IssueID': tempissue['Issue_ID'],
                    'Issue_Number': tempissue['Issue_Number'],
                    'Issue_Date': tempissue['CoverDate'],
                    'Store_Date': tempissue['StoreDate'],
                    'Issue_Name': tempissue['Issue_Name']
                    })

            # ISO-style date strings compare correctly as plain strings.
            if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00':
                firstdate = tempissue['CoverDate']
        n -= 1

    return issuech, firstdate


def GetFirstIssue(issueid, dom):
    """Return the year (first four characters) of the first <cover_date> in ``dom``.

    Returns '0000' when no cover date can be read (including ``dom`` being
    None).  ``issueid`` is accepted for interface compatibility and not used.
    """
    #if the Series Year doesn't exist, get the first issue and take the date from that
    try:
        first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText
    except Exception:
        return '0000'

    return first_year[:4]


def GetSeriesYears(dom):
    """Build a list of series summaries from a ComicVine volumes document.

    Returns a list of dicts with the keys ComicID, ComicName, SeriesYear and
    Publisher, one per <volume> element.  Values that cannot be read are
    reported as 'None' ('0000' for the year).  A ``dom`` of None yields an
    empty list.
    """
    #used by the 'add a story arc' option to individually populate the Series Year for each series within the given arc.
    #series year is required for alot of functionality.
    serieslist = []
    if dom is None:
        # pulldetails returns None when the request failed.
        return serieslist

    for dm in dom.getElementsByTagName('volume'):
        # Fresh defaults per volume so nothing leaks in from the previous one.
        tempseries = {'ComicID': 'None', 'Series': 'None', 'Publisher': 'None'}

        try:
            for id_node in dm.getElementsByTagName('id'):
                if id_node.parentNode.nodeName == 'volume':
                    tempseries['ComicID'] = id_node.firstChild.wholeText
        except Exception:
            logger.warn('There was a problem retrieving a comicid for a series within the arc. This will have to manually corrected most likely.')
            tempseries['ComicID'] = 'None'

        try:
            for name_node in dm.getElementsByTagName('name'):
                parent = name_node.parentNode.nodeName
                if parent == 'volume':
                    tempseries['Series'] = name_node.firstChild.wholeText
                elif parent == 'publisher':
                    tempseries['Publisher'] = name_node.firstChild.wholeText
        except Exception:
            logger.warn('There was a problem retrieving a Series Name or Publisher for a series within the arc. This will have to manually corrected.')

        try:
            tempseries['SeriesYear'] = dm.getElementsByTagName('start_year')[0].firstChild.wholeText
        except Exception:
            logger.warn('There was a problem retrieving the start year for a particular series within the story arc.')
            tempseries['SeriesYear'] = '0000'

        serieslist.append({"ComicID": tempseries['ComicID'],
                           "ComicName": tempseries['Series'],
                           "SeriesYear": tempseries['SeriesYear'],
                           "Publisher": tempseries['Publisher']})

    return serieslist


def GetImportList(results):
    """Build a list of issue summaries from a ComicVine issues document.

    Returns a list of dicts with the keys ComicID, IssueID, ComicName,
    Issue_Name and Issue_Number, one per <issue> element.  Fields that cannot
    be read are reported as None ('None' for ComicName).
    """
    serieslist = []
    for implist in results.getElementsByTagName('issue'):
        # Fresh defaults per issue: a missing field used to raise KeyError on
        # the first entry or silently reuse the previous entry's value.
        tempseries = {
            'ComicID': None,
            'IssueID': None,
            'ComicName': 'None',
            'Issue_Name': None,
            'Issue_Number': None,
        }

        try:
            for id_node in implist.getElementsByTagName('id'):
                parent = id_node.parentNode.nodeName
                if parent == 'volume':
                    tempseries['ComicID'] = id_node.firstChild.wholeText
                elif parent == 'issue':
                    tempseries['IssueID'] = id_node.firstChild.wholeText
        except Exception:
            tempseries['ComicID'] = None

        try:
            for name_node in implist.getElementsByTagName('name'):
                parent = name_node.parentNode.nodeName
                if parent == 'volume':
                    tempseries['ComicName'] = name_node.firstChild.wholeText
                elif parent == 'issue':
                    try:
                        tempseries['Issue_Name'] = name_node.firstChild.wholeText
                    except Exception:
                        tempseries['Issue_Name'] = None
        except Exception:
            tempseries['ComicName'] = 'None'

        try:
            tempseries['Issue_Number'] = implist.getElementsByTagName('issue_number')[0].firstChild.wholeText
        except Exception:
            logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')

        logger.info('tempseries:' + str(tempseries))
        serieslist.append({"ComicID": tempseries['ComicID'],
                           "IssueID": tempseries['IssueID'],
                           "ComicName": tempseries['ComicName'],
                           "Issue_Name": tempseries['Issue_Name'],
                           "Issue_Number": tempseries['Issue_Number']})

    return serieslist


def drophtml(html):
    """Return the text content of ``html`` with all markup removed."""
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)

    # every text node in document order, joined without separators.
    text_parts = soup.findAll(text=True)
    return ''.join(text_parts)