# This file is part of Mylar. # # Mylar is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the # implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public # License for more details. # # You should have received a copy of the GNU General Public License # along with Mylar. If not, see . import sys import os import re import time import logger import string import urllib2 import lib.feedparser import mylar import platform from bs4 import BeautifulSoup as Soup from xml.parsers.expat import ExpatError import httplib import requests def patch_http_response_read(func): def inner(*args): try: return func(*args) except httplib.IncompleteRead, e: return e.partial return inner httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read) if platform.python_version() == '2.7.6': httplib.HTTPConnection._http_vsn = 10 httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' def pulldetails(comicid, type, issueid=None, offset=1, arclist=None, comicidlist=None): #import easy to use xml parser called minidom: from xml.dom.minidom import parseString if mylar.CONFIG.COMICVINE_API == 'None' or mylar.CONFIG.COMICVINE_API is None: logger.warn('You have not specified your own ComicVine API key - it\'s a requirement. Get your own @ http://api.comicvine.com.') return else: comicapi = mylar.CONFIG.COMICVINE_API if type == 'comic': if not comicid.startswith('4050-'): comicid = '4050-' + comicid PULLURL = mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck,aliases' elif type == 'issue': if mylar.CONFIG.CV_ONLY: cv_type = 'issues' if arclist is None: searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date' else: searchset = 'filter=id:' + (arclist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume' else: cv_type = 'volume/' + str(comicid) searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date' PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset) elif any([type == 'image', type == 'firstissue']): #this is used ONLY for CV_ONLY PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date,image' elif type == 'storyarc': PULLURL = mylar.CVURL + 'story_arcs/?api_key=' + str(comicapi) + '&format=xml&filter=name:' + str(issueid) + '&field_list=cover_date' elif type == 'comicyears': PULLURL = mylar.CVURL + 'volumes/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(comicidlist) + '&field_list=name,id,start_year,publisher,description,deck,aliases&offset=' + str(offset) elif type == 'import': PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + (comicidlist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume' + '&offset=' + str(offset) elif type == 'update_dates': PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + (comicidlist)+ '&field_list=date_last_updated, id, issue_number, store_date, cover_date, name, volume ' + '&offset=' + str(offset) #logger.info('CV.PULLURL: ' + PULLURL) #new CV API restriction - one api request / second. if mylar.CONFIG.CVAPI_RATE is None or mylar.CONFIG.CVAPI_RATE < 2: time.sleep(2) else: time.sleep(mylar.CONFIG.CVAPI_RATE) #download the file: #set payload to None for now... payload = None try: r = requests.get(PULLURL, params=payload, verify=mylar.CONFIG.CV_VERIFY, headers=mylar.CV_HEADERS) except Exception, e: logger.warn('Error fetching data from ComicVine: %s' % (e)) return #logger.fdebug('cv status code : ' + str(r.status_code)) try: dom = parseString(r.content) except ExpatError: if u'Abnormal Traffic Detected' in r.content: logger.error('ComicVine has banned this server\'s IP address because it exceeded the API rate limit.') else: logger.warn('[WARNING] ComicVine is not responding correctly at the moment. This is usually due to some problems on their end. If you re-try things again in a few moments, things might work') return except Exception as e: logger.warn('[ERROR] Error returned from CV: %s' % e) return else: return dom def getComic(comicid, type, issueid=None, arc=None, arcid=None, arclist=None, comicidlist=None): if type == 'issue': offset = 1 issue = {} ndic = [] issuechoice = [] comicResults = [] firstdate = '2099-00-00' #let's find out how many results we get from the query... if comicid is None: #if comicid is None, it's coming from the story arc search results. id = arcid #since the arclist holds the issueids, and the pertinent reading order - we need to strip out the reading order so this works. aclist = '' if arclist.startswith('M'): islist = arclist[1:] else: for ac in arclist.split('|'): aclist += ac[:ac.find(',')] + '|' if aclist.endswith('|'): aclist = aclist[:-1] islist = aclist else: id = comicid islist = None searched = pulldetails(id, 'issue', None, 0, islist) if searched is None: return False totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText logger.fdebug("there are " + str(totalResults) + " search results...") if not totalResults: return False countResults = 0 while (countResults < int(totalResults)): logger.fdebug("querying range from " + str(countResults) + " to " + str(countResults + 100)) if countResults > 0: #new api - have to change to page # instead of offset count offsetcount = countResults searched = pulldetails(id, 'issue', None, offsetcount, islist) issuechoice, tmpdate = GetIssuesInfo(id, searched, arcid) if tmpdate < firstdate: firstdate = tmpdate ndic = ndic + issuechoice #search results are limited to 100 and by pagination now...let's account for this. countResults = countResults + 100 issue['issuechoice'] = ndic issue['firstdate'] = firstdate return issue elif type == 'comic': dom = pulldetails(comicid, 'comic', None, 1) return GetComicInfo(comicid, dom) elif any([type == 'image', type == 'firstissue']): dom = pulldetails(comicid, type, issueid, 1) return Getissue(issueid, dom, type) elif type == 'storyarc': dom = pulldetails(arc, 'storyarc', None, 1) return GetComicInfo(issueid, dom) elif type == 'comicyears': #used by the story arc searcher when adding a given arc to poll each ComicID in order to populate the Series Year & volume (hopefully). #this grabs each issue based on issueid, and then subsets the comicid for each to be used later. #set the offset to 0, since we're doing a filter. dom = pulldetails(arcid, 'comicyears', offset=0, comicidlist=comicidlist) return GetSeriesYears(dom) elif type == 'import': #used by the importer when doing a scan with metatagging enabled. If metatagging comes back true, then there's an IssueID present #within the tagging (with CT). This compiles all of the IssueID's during a scan (in 100's), and returns the corresponding CV data #related to the given IssueID's - namely ComicID, Name, Volume (more at some point, but those are the important ones). offset = 1 id_count = 0 import_list = [] logger.fdebug('comicidlist:' + str(comicidlist)) while id_count < len(comicidlist): #break it up by 100 per api hit #do the first 100 regardless in_cnt = 0 if id_count + 100 <= len(comicidlist): endcnt = id_count + 100 else: endcnt = len(comicidlist) for i in range(id_count, endcnt): if in_cnt == 0: tmpidlist = str(comicidlist[i]) else: tmpidlist += '|' + str(comicidlist[i]) in_cnt +=1 logger.fdebug('tmpidlist: ' + str(tmpidlist)) searched = pulldetails(None, 'import', offset=0, comicidlist=tmpidlist) if searched is None: break else: tGIL = GetImportList(searched) import_list += tGIL id_count +=100 return import_list elif type == 'update_dates': dom = pulldetails(None, 'update_dates', offset=1, comicidlist=comicidlist) return UpdateDates(dom) def GetComicInfo(comicid, dom, safechk=None): if safechk is None: #safetycheck when checking comicvine. If it times out, increment the chk on retry attempts up until 5 tries then abort. safechk = 1 elif safechk > 4: logger.error('Unable to add / refresh the series due to inablity to retrieve data from ComicVine. You might want to try abit later and/or make sure ComicVine is up.') return #comicvine isn't as up-to-date with issue counts.. #so this can get really buggered, really fast. tracks = dom.getElementsByTagName('issue') try: cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText except: cntit = len(tracks) trackcnt = len(tracks) logger.fdebug("number of issues I counted: " + str(trackcnt)) logger.fdebug("number of issues CV says it has: " + str(cntit)) # if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason if int(trackcnt) != int(cntit): cntit = trackcnt vari = "yes" else: vari = "no" logger.fdebug("vari is set to: " + str(vari)) #if str(trackcnt) != str(int(cntit)+2): # cntit = int(cntit) + 1 comic = {} comicchoice = [] cntit = int(cntit) #retrieve the first xml tag (<tag>data</tag>) #that the parser finds with name tagName: # to return the parent name of the <name> node : dom.getElementsByTagName('name')[0].parentNode.nodeName # where [0] denotes the number of the name field(s) # where nodeName denotes the parentNode : ComicName = results, publisher = publisher, issues = issue try: names = len(dom.getElementsByTagName('name')) n = 0 comic['ComicPublisher'] = 'Unknown' #set this to a default value here so that it will carry through properly while (n < names): if dom.getElementsByTagName('name')[n].parentNode.nodeName == 'results': try: comic['ComicName'] = dom.getElementsByTagName('name')[n].firstChild.wholeText comic['ComicName'] = comic['ComicName'].rstrip() except: logger.error('There was a problem retrieving the given data from ComicVine. Ensure that www.comicvine.com is accessible AND that you have provided your OWN ComicVine API key.') return elif dom.getElementsByTagName('name')[n].parentNode.nodeName == 'publisher': try: comic['ComicPublisher'] = dom.getElementsByTagName('name')[n].firstChild.wholeText except: comic['ComicPublisher'] = "Unknown" n += 1 except: logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible') return try: comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText except: comic['ComicYear'] = '0000' #safety check, cause you known, dufus'... if any([comic['ComicYear'][-1:] == '-', comic['ComicYear'][-1:] == '?']): comic['ComicYear'] = comic['ComicYear'][:-1] try: comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText except: #this should never be an exception. If it is, it's probably due to CV timing out - so let's sleep for abit then retry. logger.warn('Unable to retrieve URL for volume. This is usually due to a timeout to CV, or going over the API. Retrying again in 10s.') time.sleep(10) safechk +=1 GetComicInfo(comicid, dom, safechk) desdeck = 0 #the description field actually holds the Volume# - so let's grab it desc_soup = None try: descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText desc_soup = Soup(descchunk, "html.parser") desclinks = desc_soup.findAll('a') comic_desc = drophtml(descchunk) desdeck +=1 except: comic_desc = 'None' #sometimes the deck has volume labels try: deckchunk = dom.getElementsByTagName('deck')[0].firstChild.wholeText comic_deck = deckchunk desdeck +=1 except: comic_deck = 'None' #comic['ComicDescription'] = comic_desc try: comic['Aliases'] = dom.getElementsByTagName('aliases')[0].firstChild.wholeText comic['Aliases'] = re.sub('\n', '##', comic['Aliases']).strip() if comic['Aliases'][-2:] == '##': comic['Aliases'] = comic['Aliases'][:-2] #logger.fdebug('Aliases: ' + str(aliases)) except: comic['Aliases'] = 'None' comic['ComicVersion'] = 'None' #noversion' #figure out if it's a print / digital edition. comic['Type'] = 'None' if comic_deck != 'None': if any(['print' in comic_deck.lower(), 'digital' in comic_deck.lower(), 'paperback' in comic_deck.lower(), 'one shot' in re.sub('-', '', comic_deck.lower()).strip(), 'hardcover' in comic_deck.lower()]): if all(['print' in comic_deck.lower(), 'reprint' not in comic_deck.lower()]): comic['Type'] = 'Print' elif 'digital' in comic_deck.lower(): comic['Type'] = 'Digital' elif 'paperback' in comic_deck.lower(): comic['Type'] = 'TPB' elif 'hardcover' in comic_deck.lower(): comic['Type'] = 'HC' elif 'oneshot' in re.sub('-', '', comic_deck.lower()).strip(): comic['Type'] = 'One-Shot' else: comic['Type'] = 'Print' if comic_desc != 'None' and comic['Type'] == 'None': if 'print' in comic_desc[:60].lower() and all(['for the printed edition' not in comic_desc.lower(), 'print edition can be found' not in comic_desc.lower(), 'reprints' not in comic_desc.lower()]): comic['Type'] = 'Print' elif 'digital' in comic_desc[:60].lower() and 'digital edition can be found' not in comic_desc.lower(): comic['Type'] = 'Digital' elif all(['paperback' in comic_desc[:60].lower(), 'paperback can be found' not in comic_desc.lower()]) or 'collects' in comic_desc[:60].lower(): comic['Type'] = 'TPB' elif 'hardcover' in comic_desc[:60].lower() and 'hardcover can be found' not in comic_desc.lower(): comic['Type'] = 'HC' elif any(['one-shot' in comic_desc[:60].lower(), 'one shot' in comic_desc[:60].lower()]) and any(['can be found' not in comic_desc.lower(), 'following the' not in comic_desc.lower(), 'after the' not in comic_desc.lower()]): i = 0 comic['Type'] = 'One-Shot' avoidwords = ['preceding', 'after the', 'following the'] while i < 2: if i == 0: cbd = 'one-shot' elif i == 1: cbd = 'one shot' tmp1 = comic_desc[:60].lower().find(cbd) if tmp1 != -1: for x in avoidwords: tmp2 = comic_desc[:tmp1].lower().find(x) if tmp2 != -1: logger.fdebug('FAKE NEWS: caught incorrect reference to one-shot. Forcing to Print') comic['Type'] = 'Print' i = 3 break i+=1 else: comic['Type'] = 'Print' if all([comic_desc != 'None', 'trade paperback' in comic_desc[:30].lower(), 'collecting' in comic_desc[:40].lower()]): #ie. Trade paperback collecting Marvel Team-Up #9-11, 48-51, 72, 110 & 145. first_collect = comic_desc.lower().find('collecting') #logger.info('first_collect: %s' % first_collect) #logger.info('comic_desc: %s' % comic_desc) #logger.info('desclinks: %s' % desclinks) issue_list = [] micdrop = [] if desc_soup is not None: #if it's point form bullets, ignore it cause it's not the current volume stuff. test_it = desc_soup.find('ul') if test_it: for x in test_it.findAll('li'): if any(['Next' in x.findNext(text=True), 'Previous' in x.findNext(text=True)]): mic_check = x.find('a') micdrop.append(mic_check['data-ref-id']) for fc in desclinks: try: fc_id = fc['data-ref-id'] except: continue if fc_id in micdrop: continue fc_name = fc.findNext(text=True) if fc_id.startswith('4000'): fc_cid = None fc_isid = fc_id iss_start = fc_name.find('#') issuerun = fc_name[iss_start:].strip() fc_name = fc_name[:iss_start].strip() elif fc_id.startswith('4050'): fc_cid = fc_id fc_isid = None issuerun = fc.next_sibling if issuerun is not None: lines = re.sub("[^0-9]", ' ', issuerun).strip().split(' ') if len(lines) > 0: for x in sorted(lines, reverse=True): srchline = issuerun.rfind(x) if srchline != -1: try: if issuerun[srchline+len(x)] == ',' or issuerun[srchline+len(x)] == '.' or issuerun[srchline+len(x)] == ' ': issuerun = issuerun[:srchline+len(x)] break except Exception as e: #logger.warn('[ERROR] %s' % e) continue else: iss_start = fc_name.find('#') issuerun = fc_name[iss_start:].strip() fc_name = fc_name[:iss_start].strip() if issuerun.strip().endswith('.') or issuerun.strip().endswith(','): #logger.fdebug('Changed issuerun from %s to %s' % (issuerun, issuerun[:-1])) issuerun = issuerun.strip()[:-1] if issuerun.endswith(' and '): issuerun = issuerun[:-4].strip() elif issuerun.endswith(' and'): issuerun = issuerun[:-3].strip() else: continue # except: # pass issue_list.append({'series': fc_name, 'comicid': fc_cid, 'issueid': fc_isid, 'issues': issuerun}) #first_collect = cis logger.info('Collected issues in volume: %s' % issue_list) if len(issue_list) == 0: comic['Issue_List'] = 'None' else: comic['Issue_List'] = issue_list else: comic['Issue_List'] = 'None' while (desdeck > 0): if desdeck == 1: if comic_desc == 'None': comicDes = comic_deck[:30] else: #extract the first 60 characters comicDes = comic_desc[:60].replace('New 52', '') elif desdeck == 2: #extract the characters from the deck comicDes = comic_deck[:30].replace('New 52', '') else: break i = 0 while (i < 2): if 'volume' in comicDes.lower(): #found volume - let's grab it. v_find = comicDes.lower().find('volume') #arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #) #increased to 10 to allow for text numbering (+5 max) #sometimes it's volume 5 and ocassionally it's fifth volume. if comicDes[v_find+7:comicDes.find(' ', v_find+7)].isdigit(): comic['ComicVersion'] = re.sub("[^0-9]", "", comicDes[v_find+7:comicDes.find(' ', v_find+7)]).strip() break elif i == 0: vfind = comicDes[v_find:v_find +15] #if it's volume 5 format basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('volume X format - ' + str(i) + ': ' + vfind) else: vfind = comicDes[:v_find] # if it's fifth volume format basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('X volume format - ' + str(i) + ': ' + vfind) volconv = '' for nums in basenums: if nums in vfind.lower(): sconv = basenums[nums] vfind = re.sub(nums, sconv, vfind.lower()) break #logger.info('volconv: ' + str(volconv)) #now we attempt to find the character position after the word 'volume' if i == 0: volthis = vfind.lower().find('volume') volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;) elif i == 1: volthis = vfind.lower().find('volume') vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;) if '(' in vfind: #bracket detected in versioning' vfindit = re.findall('[^()]+', vfind) vfind = vfindit[0] vf = re.findall('[^<>]+', vfind) try: ledigit = re.sub("[^0-9]", "", vf[0]) if ledigit != '': comic['ComicVersion'] = ledigit logger.fdebug("Volume information found! Adding to series record : volume " + comic['ComicVersion']) break except: pass i += 1 else: i += 1 if comic['ComicVersion'] == 'None': logger.fdebug('comic[ComicVersion]:' + str(comic['ComicVersion'])) desdeck -= 1 else: break if vari == "yes": comic['ComicIssues'] = str(cntit) else: comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText comic['ComicImageALT'] = dom.getElementsByTagName('small_url')[0].firstChild.wholeText comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText #logger.info('comic: %s' % comic) return comic def GetIssuesInfo(comicid, dom, arcid=None): subtracks = dom.getElementsByTagName('issue') if not mylar.CONFIG.CV_ONLY: cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText logger.fdebug("issues I've counted: " + str(len(subtracks))) logger.fdebug("issues CV says it has: " + str(int(cntiss))) if int(len(subtracks)) != int(cntiss): logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks))) cntiss = len(subtracks) # assume count of issues is wrong, go with ACTUAL physical api count cntiss = int(cntiss) n = cntiss -1 else: n = int(len(subtracks)) tempissue = {} issuech = [] firstdate = '2099-00-00' for subtrack in subtracks: if not mylar.CONFIG.CV_ONLY: if (dom.getElementsByTagName('name')[n].firstChild) is not None: issue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText else: issue['Issue_Name'] = 'None' issue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText issue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText issuech.append({ 'Issue_ID': issue['Issue_ID'], 'Issue_Number': issue['Issue_Number'], 'Issue_Name': issue['Issue_Name'] }) else: try: totnames = len(subtrack.getElementsByTagName('name')) tot = 0 while (tot < totnames): if subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'volume': tempissue['ComicName'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText elif subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'issue': try: tempissue['Issue_Name'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText except: tempissue['Issue_Name'] = None tot += 1 except: tempissue['ComicName'] = 'None' try: totids = len(subtrack.getElementsByTagName('id')) idt = 0 while (idt < totids): if subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'volume': tempissue['Comic_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText elif subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'issue': tempissue['Issue_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText idt += 1 except: tempissue['Issue_Name'] = 'None' try: tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText except: tempissue['CoverDate'] = '0000-00-00' try: tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText except: tempissue['StoreDate'] = '0000-00-00' try: digital_desc = subtrack.getElementsByTagName('description')[0].firstChild.wholeText except: tempissue['DigitalDate'] = '0000-00-00' else: tempissue['DigitalDate'] = '0000-00-00' if all(['digital' in digital_desc.lower()[-90:], 'print' in digital_desc.lower()[-90:]]): #get the digital date of issue here... mff = mylar.filechecker.FileChecker() vlddate = mff.checkthedate(digital_desc[-90:], fulldate=True) #logger.fdebug('vlddate: %s' % vlddate) if vlddate: tempissue['DigitalDate'] = vlddate try: tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText except: logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.') try: tempissue['ComicImage'] = subtrack.getElementsByTagName('small_url')[0].firstChild.wholeText except: tempissue['ComicImage'] = 'None' try: tempissue['ComicImageALT'] = subtrack.getElementsByTagName('medium_url')[0].firstChild.wholeText except: tempissue['ComicImageALT'] = 'None' if arcid is None: issuech.append({ 'Comic_ID': comicid, 'Issue_ID': tempissue['Issue_ID'], 'Issue_Number': tempissue['Issue_Number'], 'Issue_Date': tempissue['CoverDate'], 'Store_Date': tempissue['StoreDate'], 'Digital_Date': tempissue['DigitalDate'], 'Issue_Name': tempissue['Issue_Name'], 'Image': tempissue['ComicImage'], 'ImageALT': tempissue['ComicImageALT'] }) else: issuech.append({ 'ArcID': arcid, 'ComicName': tempissue['ComicName'], 'ComicID': tempissue['Comic_ID'], 'IssueID': tempissue['Issue_ID'], 'Issue_Number': tempissue['Issue_Number'], 'Issue_Date': tempissue['CoverDate'], 'Store_Date': tempissue['StoreDate'], 'Digital_Date': tempissue['DigitalDate'], 'Issue_Name': tempissue['Issue_Name'] }) if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00': firstdate = tempissue['CoverDate'] n-= 1 #logger.fdebug('issue_info: %s' % issuech) #issue['firstdate'] = firstdate return issuech, firstdate def Getissue(issueid, dom, type): #if the Series Year doesn't exist, get the first issue and take the date from that if type == 'firstissue': try: first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText except: first_year = '0000' return first_year the_year = first_year[:4] the_month = first_year[5:7] the_date = the_year + '-' + the_month return the_year else: try: image = dom.getElementsByTagName('super_url')[0].firstChild.wholeText except: image = None try: image_alt = dom.getElementsByTagName('small_url')[0].firstChild.wholeText except: image_alt = None return {'image': image, 'image_alt': image_alt} def GetSeriesYears(dom): #used by the 'add a story arc' option to individually populate the Series Year for each series within the given arc. #series year is required for alot of functionality. series = dom.getElementsByTagName('volume') tempseries = {} serieslist = [] for dm in series: try: totids = len(dm.getElementsByTagName('id')) idc = 0 while (idc < totids): if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'volume': tempseries['ComicID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText idc+=1 except: logger.warn('There was a problem retrieving a comicid for a series within the arc. This will have to manually corrected most likely.') tempseries['ComicID'] = 'None' tempseries['Series'] = 'None' tempseries['Publisher'] = 'None' try: totnames = len(dm.getElementsByTagName('name')) namesc = 0 while (namesc < totnames): if dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'volume': tempseries['Series'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText elif dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'publisher': tempseries['Publisher'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText namesc+=1 except: logger.warn('There was a problem retrieving a Series Name or Publisher for a series within the arc. This will have to manually corrected.') try: tempseries['SeriesYear'] = dm.getElementsByTagName('start_year')[0].firstChild.wholeText except: logger.warn('There was a problem retrieving the start year for a particular series within the story arc.') tempseries['SeriesYear'] = '0000' #cause you know, dufus'... if tempseries['SeriesYear'][-1:] == '-': tempseries['SeriesYear'] = tempseries['SeriesYear'][:-1] desdeck = 0 #the description field actually holds the Volume# - so let's grab it desc_soup = None try: descchunk = dm.getElementsByTagName('description')[0].firstChild.wholeText desc_soup = Soup(descchunk, "html.parser") desclinks = desc_soup.findAll('a') comic_desc = drophtml(descchunk) desdeck +=1 except: comic_desc = 'None' #sometimes the deck has volume labels try: deckchunk = dm.getElementsByTagName('deck')[0].firstChild.wholeText comic_deck = deckchunk desdeck +=1 except: comic_deck = 'None' #comic['ComicDescription'] = comic_desc try: tempseries['Aliases'] = dm.getElementsByTagName('aliases')[0].firstChild.wholeText tempseries['Aliases'] = re.sub('\n', '##', tempseries['Aliases']).strip() if tempseries['Aliases'][-2:] == '##': tempseries['Aliases'] = tempseries['Aliases'][:-2] #logger.fdebug('Aliases: ' + str(aliases)) except: tempseries['Aliases'] = 'None' tempseries['Volume'] = 'None' #noversion' #figure out if it's a print / digital edition. tempseries['Type'] = 'None' if comic_deck != 'None': if any(['print' in comic_deck.lower(), 'digital' in comic_deck.lower(), 'paperback' in comic_deck.lower(), 'one shot' in re.sub('-', '', comic_deck.lower()).strip(), 'hardcover' in comic_deck.lower()]): if 'print' in comic_deck.lower(): tempseries['Type'] = 'Print' elif 'digital' in comic_deck.lower(): tempseries['Type'] = 'Digital' elif 'paperback' in comic_deck.lower(): tempseries['Type'] = 'TPB' elif 'hardcover' in comic_deck.lower(): tempseries['Type'] = 'HC' elif 'oneshot' in re.sub('-', '', comic_deck.lower()).strip(): tempseries['Type'] = 'One-Shot' if comic_desc != 'None' and tempseries['Type'] == 'None': if 'print' in comic_desc[:60].lower() and 'print edition can be found' not in comic_desc.lower(): tempseries['Type'] = 'Print' elif 'digital' in comic_desc[:60].lower() and 'digital edition can be found' not in comic_desc.lower(): tempseries['Type'] = 'Digital' elif all(['paperback' in comic_desc[:60].lower(), 'paperback can be found' not in comic_desc.lower()]) or 'collects' in comic_desc[:60].lower(): tempseries['Type'] = 'TPB' elif 'hardcover' in comic_desc[:60].lower() and 'hardcover can be found' not in comic_desc.lower(): tempseries['Type'] = 'HC' elif any(['one-shot' in comic_desc[:60].lower(), 'one shot' in comic_desc[:60].lower()]) and any(['can be found' not in comic_desc.lower(), 'following the' not in comic_desc.lower()]): i = 0 tempseries['Type'] = 'One-Shot' avoidwords = ['preceding', 'after the special', 'following the'] while i < 2: if i == 0: cbd = 'one-shot' elif i == 1: cbd = 'one shot' tmp1 = comic_desc[:60].lower().find(cbd) if tmp1 != -1: for x in avoidwords: tmp2 = comic_desc[:tmp1].lower().find(x) if tmp2 != -1: logger.fdebug('FAKE NEWS: caught incorrect reference to one-shot. Forcing to Print') tempseries['Type'] = 'Print' i = 3 break i+=1 else: tempseries['Type'] = 'Print' if all([comic_desc != 'None', 'trade paperback' in comic_desc[:30].lower(), 'collecting' in comic_desc[:40].lower()]): #ie. Trade paperback collecting Marvel Team-Up #9-11, 48-51, 72, 110 & 145. first_collect = comic_desc.lower().find('collecting') #logger.info('first_collect: %s' % first_collect) #logger.info('comic_desc: %s' % comic_desc) #logger.info('desclinks: %s' % desclinks) issue_list = [] micdrop = [] if desc_soup is not None: #if it's point form bullets, ignore it cause it's not the current volume stuff. test_it = desc_soup.find('ul') if test_it: for x in test_it.findAll('li'): if any(['Next' in x.findNext(text=True), 'Previous' in x.findNext(text=True)]): mic_check = x.find('a') micdrop.append(mic_check['data-ref-id']) for fc in desclinks: #logger.info('fc: %s' % fc) fc_id = fc['data-ref-id'] #logger.info('fc_id: %s' % fc_id) if fc_id in micdrop: continue fc_name = fc.findNext(text=True) if fc_id.startswith('4000'): fc_cid = None fc_isid = fc_id iss_start = fc_name.find('#') issuerun = fc_name[iss_start:].strip() fc_name = fc_name[:iss_start].strip() elif fc_id.startswith('4050'): fc_cid = fc_id fc_isid = None issuerun = fc.next_sibling if issuerun is not None: lines = re.sub("[^0-9]", ' ', issuerun).strip().split(' ') if len(lines) > 0: for x in sorted(lines, reverse=True): srchline = issuerun.rfind(x) if srchline != -1: try: if issuerun[srchline+len(x)] == ',' or issuerun[srchline+len(x)] == '.' or issuerun[srchline+len(x)] == ' ': issuerun = issuerun[:srchline+len(x)] break except Exception as e: logger.warn('[ERROR] %s' % e) continue else: iss_start = fc_name.find('#') issuerun = fc_name[iss_start:].strip() fc_name = fc_name[:iss_start].strip() if issuerun.endswith('.') or issuerun.endswith(','): #logger.fdebug('Changed issuerun from %s to %s' % (issuerun, issuerun[:-1])) issuerun = issuerun[:-1] if issuerun.endswith(' and '): issuerun = issuerun[:-4].strip() elif issuerun.endswith(' and'): issuerun = issuerun[:-3].strip() else: continue # except: # pass issue_list.append({'series': fc_name, 'comicid': fc_cid, 'issueid': fc_isid, 'issues': issuerun}) #first_collect = cis logger.info('Collected issues in volume: %s' % issue_list) tempseries['Issue_List'] = issue_list else: tempseries['Issue_List'] = 'None' while (desdeck > 0): if desdeck == 1: if comic_desc == 'None': comicDes = comic_deck[:30] else: #extract the first 60 characters comicDes = comic_desc[:60].replace('New 52', '') elif desdeck == 2: #extract the characters from the deck comicDes = comic_deck[:30].replace('New 52', '') else: break i = 0 while (i < 2): if 'volume' in comicDes.lower(): #found volume - let's grab it. v_find = comicDes.lower().find('volume') #arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #) #increased to 10 to allow for text numbering (+5 max) #sometimes it's volume 5 and ocassionally it's fifth volume. if i == 0: vfind = comicDes[v_find:v_find +15] #if it's volume 5 format basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('volume X format - %s: %s' % (i, vfind)) else: vfind = comicDes[:v_find] # if it's fifth volume format basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'} logger.fdebug('X volume format - %s: %s' % (i, vfind)) volconv = '' for nums in basenums: if nums in vfind.lower(): sconv = basenums[nums] vfind = re.sub(nums, sconv, vfind.lower()) break #logger.info('volconv: ' + str(volconv)) #now we attempt to find the character position after the word 'volume' if i == 0: volthis = vfind.lower().find('volume') volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;) elif i == 1: volthis = vfind.lower().find('volume') vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;) if '(' in vfind: #bracket detected in versioning' vfindit = re.findall('[^()]+', vfind) vfind = vfindit[0] vf = re.findall('[^<>]+', vfind) try: ledigit = re.sub("[^0-9]", "", vf[0]) if ledigit != '': tempseries['Volume'] = ledigit logger.fdebug("Volume information found! Adding to series record : volume %s" % tempseries['Volume']) break except: pass i += 1 else: i += 1 if tempseries['Volume'] == 'None': logger.fdebug('tempseries[Volume]: %s' % tempseries['Volume']) desdeck -= 1 else: break serieslist.append({"ComicID": tempseries['ComicID'], "ComicName": tempseries['Series'], "SeriesYear": tempseries['SeriesYear'], "Publisher": tempseries['Publisher'], "Volume": tempseries['Volume'], "Aliases": tempseries['Aliases'], "Type": tempseries['Type']}) return serieslist def UpdateDates(dom): issues = dom.getElementsByTagName('issue') tempissue = {} issuelist = [] for dm in issues: tempissue['ComicID'] = 'None' tempissue['IssueID'] = 'None' try: totids = len(dm.getElementsByTagName('id')) idc = 0 while (idc < totids): if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'volume': tempissue['ComicID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'issue': tempissue['IssueID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText idc+=1 except: logger.warn('There was a problem retrieving a comicid/issueid for the given issue. This will have to manually corrected most likely.') tempissue['SeriesTitle'] = 'None' tempissue['IssueTitle'] = 'None' try: totnames = len(dm.getElementsByTagName('name')) namesc = 0 while (namesc < totnames): if dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'issue': tempissue['IssueTitle'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText elif dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'volume': tempissue['SeriesTitle'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText namesc+=1 except: logger.warn('There was a problem retrieving the Series Title / Issue Title for a series within the arc. This will have to manually corrected.') try: tempissue['CoverDate'] = dm.getElementsByTagName('cover_date')[0].firstChild.wholeText except: tempissue['CoverDate'] = '0000-00-00' try: tempissue['StoreDate'] = dm.getElementsByTagName('store_date')[0].firstChild.wholeText except: tempissue['StoreDate'] = '0000-00-00' try: tempissue['IssueNumber'] = dm.getElementsByTagName('issue_number')[0].firstChild.wholeText except: logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.') tempissue['IssueNumber'] = 'None' try: tempissue['date_last_updated'] = dm.getElementsByTagName('date_last_updated')[0].firstChild.wholeText except: tempissue['date_last_updated'] = '0000-00-00' issuelist.append({'ComicID': tempissue['ComicID'], 'IssueID': tempissue['IssueID'], 'SeriesTitle': tempissue['SeriesTitle'], 'IssueTitle': tempissue['IssueTitle'], 'CoverDate': tempissue['CoverDate'], 'StoreDate': tempissue['StoreDate'], 'IssueNumber': tempissue['IssueNumber'], 'Date_Last_Updated': tempissue['date_last_updated']}) return issuelist def GetImportList(results): importlist = results.getElementsByTagName('issue') serieslist = [] importids = {} tempseries = {} for implist in importlist: try: totids = len(implist.getElementsByTagName('id')) idt = 0 while (idt < totids): if implist.getElementsByTagName('id')[idt].parentNode.nodeName == 'volume': tempseries['ComicID'] = implist.getElementsByTagName('id')[idt].firstChild.wholeText elif implist.getElementsByTagName('id')[idt].parentNode.nodeName == 'issue': tempseries['IssueID'] = implist.getElementsByTagName('id')[idt].firstChild.wholeText idt += 1 except: tempseries['ComicID'] = None try: totnames = len(implist.getElementsByTagName('name')) tot = 0 while (tot < totnames): if implist.getElementsByTagName('name')[tot].parentNode.nodeName == 'volume': tempseries['ComicName'] = implist.getElementsByTagName('name')[tot].firstChild.wholeText elif implist.getElementsByTagName('name')[tot].parentNode.nodeName == 'issue': try: tempseries['Issue_Name'] = implist.getElementsByTagName('name')[tot].firstChild.wholeText except: tempseries['Issue_Name'] = None tot += 1 except: tempseries['ComicName'] = 'None' try: tempseries['Issue_Number'] = implist.getElementsByTagName('issue_number')[0].firstChild.wholeText except: logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.') logger.info('tempseries:' + str(tempseries)) serieslist.append({"ComicID": tempseries['ComicID'], "IssueID": tempseries['IssueID'], "ComicName": tempseries['ComicName'], "Issue_Name": tempseries['Issue_Name'], "Issue_Number": tempseries['Issue_Number']}) return serieslist def drophtml(html): soup = Soup(html, "html.parser") text_parts = soup.findAll(text=True) #print ''.join(text_parts) return ''.join(text_parts)