mylar/mylar/cv.py

514 lines
22 KiB
Python
Executable File

# This file is part of Mylar.
#
# Mylar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mylar. If not, see <http://www.gnu.org/licenses/>.
import sys
import os
import re
import time
import logger
import string
import urllib2
import lib.feedparser
import mylar
import platform
from mylar.helpers import cvapi_check
from bs4 import BeautifulSoup as Soup
import httplib
def patch_http_response_read(func):
def inner(*args):
try:
return func(*args)
except httplib.IncompleteRead, e:
return e.partial
return inner
httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read)
if platform.python_version() == '2.7.6':
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
def pulldetails(comicid, type, issueid=None, offset=1, arclist=None, comicidlist=None):
#import easy to use xml parser called minidom:
from xml.dom.minidom import parseString
if mylar.COMICVINE_API == 'None' or mylar.COMICVINE_API is None or mylar.COMICVINE_API == mylar.DEFAULT_CVAPI:
logger.warn('You have not specified your own ComicVine API key - alot of things will be limited. Get your own @ http://api.comicvine.com.')
comicapi = mylar.DEFAULT_CVAPI
else:
comicapi = mylar.COMICVINE_API
if type == 'comic':
if not comicid.startswith('4050-'): comicid = '4050-' + comicid
PULLURL = mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck,aliases'
elif type == 'issue':
if mylar.CV_ONLY:
cv_type = 'issues'
if arclist is None:
searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date'
else:
searchset = 'filter=id:' + (arclist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume'
else:
cv_type = 'volume/' + str(comicid)
searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date'
PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset)
elif type == 'firstissue':
#this is used ONLY for CV_ONLY
PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date'
elif type == 'storyarc':
PULLURL = mylar.CVURL + 'story_arcs/?api_key=' + str(comicapi) + '&format=xml&filter=name:' + str(issueid) + '&field_list=cover_date'
elif type == 'comicyears':
PULLURL = mylar.CVURL + 'volumes/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(comicidlist) + '&field_list=name,id,start_year,publisher&offset=' + str(offset)
#logger.info('PULLURL: ' + PULLURL)
#CV API Check here.
if mylar.CVAPI_COUNT == 0 or mylar.CVAPI_COUNT >= mylar.CVAPI_MAX:
cvapi_check()
#download the file:
file = urllib2.urlopen(PULLURL)
#increment CV API counter.
mylar.CVAPI_COUNT += 1
#convert to string:
data = file.read()
#close file because we dont need it anymore:
file.close()
#parse the xml you downloaded
dom = parseString(data)
return dom
def getComic(comicid, type, issueid=None, arc=None, arcid=None, arclist=None, comicidlist=None):
if type == 'issue':
offset = 1
issue = {}
ndic = []
issuechoice = []
comicResults = []
firstdate = '2099-00-00'
#let's find out how many results we get from the query...
if comicid is None:
#if comicid is None, it's coming from the story arc search results.
id = arcid
islist = arclist
else:
id = comicid
islist = None
searched = pulldetails(id, 'issue', None, 0, islist)
if searched is None: return False
totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText
logger.fdebug("there are " + str(totalResults) + " search results...")
if not totalResults:
return False
countResults = 0
while (countResults < int(totalResults)):
logger.fdebug("querying " + str(countResults))
if countResults > 0:
#new api - have to change to page # instead of offset count
offsetcount = countResults
searched = pulldetails(id, 'issue', None, offsetcount, islist)
issuechoice, tmpdate = GetIssuesInfo(id, searched, arcid)
if tmpdate < firstdate:
firstdate = tmpdate
ndic = ndic + issuechoice
#search results are limited to 100 and by pagination now...let's account for this.
countResults = countResults + 100
issue['issuechoice'] = ndic
issue['firstdate'] = firstdate
return issue
elif type == 'comic':
dom = pulldetails(comicid, 'comic', None, 1)
return GetComicInfo(comicid, dom)
elif type == 'firstissue':
dom = pulldetails(comicid, 'firstissue', issueid, 1)
return GetFirstIssue(issueid, dom)
elif type == 'storyarc':
dom = pulldetails(arc, 'storyarc', None, 1)
return GetComicInfo(issueid, dom)
elif type == 'comicyears':
#used by the story arc searcher when adding a given arc to poll each ComicID in order to populate the Series Year.
#this grabs each issue based on issueid, and then subsets the comicid for each to be used later.
#set the offset to 0, since we're doing a filter.
dom = pulldetails(arcid, 'comicyears', offset=0, comicidlist=comicidlist)
return GetSeriesYears(dom)
def GetComicInfo(comicid, dom, safechk=None):
if safechk is None:
#safetycheck when checking comicvine. If it times out, increment the chk on retry attempts up until 5 tries then abort.
safechk = 1
elif safechk > 4:
logger.error('Unable to add / refresh the series due to inablity to retrieve data from ComicVine. You might want to try abit later and/or make sure ComicVine is up.')
return
#comicvine isn't as up-to-date with issue counts..
#so this can get really buggered, really fast.
tracks = dom.getElementsByTagName('issue')
try:
cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
except:
cntit = len(tracks)
trackcnt = len(tracks)
logger.fdebug("number of issues I counted: " + str(trackcnt))
logger.fdebug("number of issues CV says it has: " + str(cntit))
# if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason
if int(trackcnt) != int(cntit):
cntit = trackcnt
vari = "yes"
else: vari = "no"
logger.fdebug("vari is set to: " + str(vari))
#if str(trackcnt) != str(int(cntit)+2):
# cntit = int(cntit) + 1
comic = {}
comicchoice = []
cntit = int(cntit)
#retrieve the first xml tag (<tag>data</tag>)
#that the parser finds with name tagName:
# to return the parent name of the <name> node : dom.getElementsByTagName('name')[0].parentNode.nodeName
# where [0] denotes the number of the name field(s)
# where nodeName denotes the parentNode : ComicName = results, publisher = publisher, issues = issue
try:
names = len(dom.getElementsByTagName('name'))
n = 0
while (n < names):
if dom.getElementsByTagName('name')[n].parentNode.nodeName == 'results':
try:
comic['ComicName'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
comic['ComicName'] = comic['ComicName'].rstrip()
except:
logger.error('There was a problem retrieving the given data from ComicVine. Ensure that www.comicvine.com is accessible AND that you have provided your OWN ComicVine API key.')
return
elif dom.getElementsByTagName('name')[n].parentNode.nodeName == 'publisher':
try:
comic['ComicPublisher'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
except:
comic['ComicPublisher'] = "Unknown"
n += 1
except:
logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible')
return
try:
comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText
except:
comic['ComicYear'] = '0000'
try:
comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText
except:
#this should never be an exception. If it is, it's probably due to CV timing out - so let's sleep for abit then retry.
logger.warn('Unable to retrieve URL for volume. This is usually due to a timeout to CV, or going over the API. Retrying again in 10s.')
time.sleep(10)
safechk +=1
GetComicInfo(comicid, dom, safechk)
desdeck = 0
#the description field actually holds the Volume# - so let's grab it
try:
descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText
comic_desc = drophtml(descchunk)
desdeck +=1
except:
comic_desc = 'None'
#sometimes the deck has volume labels
try:
deckchunk = dom.getElementsByTagName('deck')[0].firstChild.wholeText
comic_deck = deckchunk
desdeck +=1
except:
comic_deck = 'None'
#comic['ComicDescription'] = comic_desc
try:
comic['Aliases'] = dom.getElementsByTagName('aliases')[0].firstChild.wholeText
#logger.fdebug('Aliases: ' + str(aliases))
except:
comic['Aliases'] = 'None'
comic['ComicVersion'] = 'noversion'
#logger.info('comic_desc:' + comic_desc)
#logger.info('comic_deck:' + comic_deck)
#logger.info('desdeck: ' + str(desdeck))
while (desdeck > 0):
if desdeck == 1:
if comic_desc == 'None':
comicDes = comic_deck[:30]
else:
#extract the first 60 characters
comicDes = comic_desc[:60].replace('New 52', '')
elif desdeck == 2:
#extract the characters from the deck
comicDes = comic_deck[:30].replace('New 52', '')
else:
break
i = 0
while (i < 2):
if 'volume' in comicDes.lower():
#found volume - let's grab it.
v_find = comicDes.lower().find('volume')
#arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #)
#increased to 10 to allow for text numbering (+5 max)
#sometimes it's volume 5 and ocassionally it's fifth volume.
if i == 0:
vfind = comicDes[v_find:v_find +15] #if it's volume 5 format
basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
logger.fdebug('volume X format - ' + str(i) + ': ' + vfind)
else:
vfind = comicDes[:v_find] # if it's fifth volume format
basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
logger.fdebug('X volume format - ' + str(i) + ': ' + vfind)
volconv = ''
for nums in basenums:
if nums in vfind.lower():
sconv = basenums[nums]
vfind = re.sub(nums, sconv, vfind.lower())
break
#logger.info('volconv: ' + str(volconv))
#now we attempt to find the character position after the word 'volume'
if i == 0:
volthis = vfind.lower().find('volume')
volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit
vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;)
elif i == 1:
volthis = vfind.lower().find('volume')
vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;)
if '(' in vfind:
#bracket detected in versioning'
vfindit = re.findall('[^()]+', vfind)
vfind = vfindit[0]
vf = re.findall('[^<>]+', vfind)
ledigit = re.sub("[^0-9]", "", vf[0])
if ledigit != '':
comic['ComicVersion'] = ledigit
logger.fdebug("Volume information found! Adding to series record : volume " + comic['ComicVersion'])
break
i += 1
else:
i += 1
if comic['ComicVersion'] == 'noversion':
logger.fdebug('comic[ComicVersion]:' + str(comic['ComicVersion']))
desdeck -= 1
else:
break
if vari == "yes":
comic['ComicIssues'] = str(cntit)
else:
comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText
comic['ComicImageALT'] = dom.getElementsByTagName('small_url')[0].firstChild.wholeText
comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText
# print ("fistIss:" + str(comic['FirstIssueID']))
# comicchoice.append({
# 'ComicName': comic['ComicName'],
# 'ComicYear': comic['ComicYear'],
# 'Comicid': comicid,
# 'ComicURL': comic['ComicURL'],
# 'ComicIssues': comic['ComicIssues'],
# 'ComicImage': comic['ComicImage'],
# 'ComicVolume': ParseVol,
# 'ComicPublisher': comic['ComicPublisher']
# })
# comic['comicchoice'] = comicchoice
return comic
def GetIssuesInfo(comicid, dom, arcid=None):
subtracks = dom.getElementsByTagName('issue')
if not mylar.CV_ONLY:
cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
logger.fdebug("issues I've counted: " + str(len(subtracks)))
logger.fdebug("issues CV says it has: " + str(int(cntiss)))
if int(len(subtracks)) != int(cntiss):
logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks)))
cntiss = len(subtracks) # assume count of issues is wrong, go with ACTUAL physical api count
cntiss = int(cntiss)
n = cntiss -1
else:
n = int(len(subtracks))
tempissue = {}
issuech = []
firstdate = '2099-00-00'
for subtrack in subtracks:
if not mylar.CV_ONLY:
if (dom.getElementsByTagName('name')[n].firstChild) is not None:
issue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
else:
issue['Issue_Name'] = 'None'
issue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText
issue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText
issuech.append({
'Issue_ID': issue['Issue_ID'],
'Issue_Number': issue['Issue_Number'],
'Issue_Name': issue['Issue_Name']
})
else:
try:
totnames = len(subtrack.getElementsByTagName('name'))
tot = 0
while (tot < totnames):
if subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'volume':
tempissue['ComicName'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText
elif subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'issue':
try:
tempissue['Issue_Name'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText
except:
tempissue['Issue_Name'] = None
tot += 1
except:
tempissue['ComicName'] = 'None'
try:
totids = len(subtrack.getElementsByTagName('id'))
idt = 0
while (idt < totids):
if subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'volume':
tempissue['Comic_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText
elif subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'issue':
tempissue['Issue_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText
idt += 1
except:
tempissue['Issue_Name'] = 'None'
try:
tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText
except:
tempissue['CoverDate'] = '0000-00-00'
try:
tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText
except:
tempissue['StoreDate'] = '0000-00-00'
try:
tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText
except:
logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')
if arcid is None:
issuech.append({
'Comic_ID': comicid,
'Issue_ID': tempissue['Issue_ID'],
'Issue_Number': tempissue['Issue_Number'],
'Issue_Date': tempissue['CoverDate'],
'Store_Date': tempissue['StoreDate'],
'Issue_Name': tempissue['Issue_Name']
})
else:
issuech.append({
'ArcID': arcid,
'ComicName': tempissue['ComicName'],
'ComicID': tempissue['Comic_ID'],
'IssueID': tempissue['Issue_ID'],
'Issue_Number': tempissue['Issue_Number'],
'Issue_Date': tempissue['CoverDate'],
'Store_Date': tempissue['StoreDate'],
'Issue_Name': tempissue['Issue_Name']
})
if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00':
firstdate = tempissue['CoverDate']
n-= 1
#issue['firstdate'] = firstdate
return issuech, firstdate
def GetFirstIssue(issueid, dom):
#if the Series Year doesn't exist, get the first issue and take the date from that
try:
first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText
except:
first_year = '0000'
return first_year
the_year = first_year[:4]
the_month = first_year[5:7]
the_date = the_year + '-' + the_month
return the_year
def GetSeriesYears(dom):
#used by the 'add a story arc' option to individually populate the Series Year for each series within the given arc.
#series year is required for alot of functionality.
series = dom.getElementsByTagName('volume')
tempseries = {}
serieslist = []
for dm in series:
try:
totids = len(dm.getElementsByTagName('id'))
idc = 0
while (idc < totids):
if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'volume':
tempseries['ComicID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText
idc+=1
except:
logger.warn('There was a problem retrieving a comicid for a series within the arc. This will have to manually corrected most likely.')
tempseries['ComicID'] = 'None'
tempseries['Series'] = 'None'
tempseries['Publisher'] = 'None'
try:
totnames = len(dm.getElementsByTagName('name'))
namesc = 0
while (namesc < totnames):
if dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'volume':
tempseries['Series'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText
elif dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'publisher':
tempseries['Publisher'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText
namesc+=1
except:
logger.warn('There was a problem retrieving a Series Name or Publisher for a series within the arc. This will have to manually corrected.')
try:
tempseries['SeriesYear'] = dm.getElementsByTagName('start_year')[0].firstChild.wholeText
except:
logger.warn('There was a problem retrieving the start year for a particular series within the story arc.')
tempseries['SeriesYear'] = '0000'
serieslist.append({"ComicID": tempseries['ComicID'],
"ComicName": tempseries['Series'],
"SeriesYear": tempseries['SeriesYear'],
"Publisher": tempseries['Publisher']})
return serieslist
def drophtml(html):
from bs4 import BeautifulSoup
soup = BeautifulSoup(html)
text_parts = soup.findAll(text=True)
#print ''.join(text_parts)
return ''.join(text_parts)