# mirror of https://github.com/evilhero/mylar — 199 lines, 7.0 KiB, Python, executable file
|
|
from bs4 import BeautifulSoup, UnicodeDammit
|
|
import urllib2
|
|
import re
|
|
import helpers
|
|
import logger
|
|
import datetime
|
|
import sys
|
|
from decimal import Decimal
|
|
from HTMLParser import HTMLParseError
|
|
from time import strptime
|
|
|
|
def cbdb(comicnm, ComicYear):
    """Search mobile.comicbookdb.com for a series by title, match a result
    against the given start year, and return its issue/annual details.

    comicnm   -- series title to search for (spaces allowed)
    ComicYear -- expected series start year; a result is accepted when its
                 year equals ComicYear or ComicYear + 1 (cover-date drift)

    Returns the dict produced by IssueDetails() for the matched series.
    NOTE(review): if nothing matches, resultID is still the initial [] and
    IssueDetails() is called with it anyway — preserved from the original,
    but the resulting URL is bogus; verify against callers.
    """
    # CBDB's search form expects '+' in place of spaces.
    comicnm = re.sub(' ', '+', comicnm)
    searchurl = "http://mobile.comicbookdb.com/search.php?form_search=" + str(comicnm) + "&form_searchtype=Title&x=0&y=0"
    response = urllib2.urlopen(searchurl)
    soup = BeautifulSoup(response)

    abc = soup.findAll('a', href=True)
    lenabc = len(abc)
    i = 0
    resultName = []
    resultID = []
    resultYear = []
    resultIssues = []
    resultURL = []
    matched = "no"

    while (i < lenabc):
        titlet = abc[i]  # iterate through the href's, pulling out only results.
        print("titlet: " + str(titlet))
        if "title.php" in str(titlet):
            print("found title")
            # Result link text looks like "Series Name (YYYY)".
            tempName = titlet.findNext(text=True)
            print("tempName: " + tempName)
            resultName = tempName[:tempName.find("(")]
            print("ComicName: " + resultName)

            resultYear = tempName[tempName.find("(") + 1:tempName.find(")")]
            if not resultYear.isdigit():
                # No "(year)" suffix -- not a series result; skip it.
                i += 1
                continue
            print("ComicYear: " + resultYear)

            ID_som = titlet['href']
            resultURL = ID_som
            print("CBDB URL: " + resultURL)

            # The series ID trails the '?ID=' marker in the href.
            IDst = ID_som.find('?ID=')
            resultID = ID_som[(IDst + 4):]
            print("CBDB ID: " + resultID)

            print("resultname: " + resultName)
            # Compare names with punctuation and whitespace stripped, case-folded.
            CleanComicName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
            CleanComicName = re.sub(' ', '', CleanComicName).lower()
            CleanResultName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName)
            CleanResultName = re.sub(' ', '', CleanResultName).lower()
            print("CleanComicName: " + CleanComicName)
            print("CleanResultName: " + CleanResultName)
            if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName or len(CleanComicName) == len(CleanResultName):
                print("i:" + str(i) + "...matched by name to Mylar!")
                print("ComicYear: " + str(ComicYear) + ".. to ResultYear: " + str(resultYear))
                if resultYear.isdigit():
                    # Accept an off-by-one year (cover date vs. release date).
                    if int(resultYear) == int(ComicYear) or int(resultYear) == int(ComicYear) + 1:
                        resultID = str(resultID)
                        print("Matchumundo!")
                        matched = "yes"
                    # BUGFIX: the original had 'else: continue' here, which
                    # jumped back to the while-test without executing the
                    # 'i += 1' below, looping forever on a year mismatch.
                    # Falling through advances the index correctly.
        if matched == "yes":
            break
        i += 1

    return IssueDetails(resultID)
|
|
|
|
|
|
def IssueDetails(cbdb_id):
    """Scrape comicbookdb.com's title page for the given series ID and
    collect its Annual issues.

    cbdb_id -- CBDB series ID (the '?ID=' value from a title.php link)

    Returns a dict:
        {'annualslist': [{'AnnualIssue': <str>, 'AnnualDate': <digits>}, ...],
         'totalissues': <number of annuals found>,
         'GCDComicID':  cbdb_id}
    """
    annuals = {}
    annualslist = []
    gcount = 0
    pagethis = 'http://comicbookdb.com/title.php?ID=' + str(cbdb_id)

    response = urllib2.urlopen(pagethis)
    soup = BeautifulSoup(response)

    resultp = soup.findAll("table")
    total = len(resultp)  # number of tables on the page

    # First table holds the series header (publisher, dates, issue count).
    startit = resultp[0].find("table", {"width": "884"})

    i = 0
    pubchk = 0
    boop = startit.findAll('strong')
    for t in boop:
        if pubchk == 0:
            # Publisher hasn't been seen yet -- look for its link first.
            if ("publisher.php?" in startit('a')[i]['href']):
                print(startit('a')[i]['href'])
                publisher = str(startit('a')[i].contents)
                print("publisher: " + publisher)
                pubchk = "1"
        elif 'Publication Date: ' in t:
            pdi = boop[i].nextSibling
            print("publication date: " + pdi)
        elif 'Number of issues cataloged: ' in t:
            noi = boop[i].nextSibling
            print("number of issues: " + noi)
        i += 1
        if i > len(boop):
            break

    # Locate the issue-listing table: the first table containing "page_link"
    # anchors.  ti starts at 1 because resultp[0] is the ENTIRE soup structure.
    # BUGFIX: initialise tableno so a page with no issue table no longer
    # raises NameError at len(tableno) below.
    tableno = []
    ti = 1
    while (ti < total):
        if resultp[ti].find("a", {"class": "page_link"}):
            tableno = resultp[ti].findAll('tr')  # all the tr's of that table
            break
        ti += 1

    noresults = len(tableno)
    print("there are " + str(noresults) + " issues total (cover variations, et all).")

    i = 1  # start at 1 so we don't grab the table headers ;)
    issue = []
    storyarc = []
    pubdate = []

    while (i < noresults):
        resultit = tableno[i]  # one tr == one issue row
        issuet = resultit.find("a", {"class": "page_link"})  # the issue # portion
        try:
            issue = issuet.findNext(text=True)
        except:
            # blank spacer row - skipping
            i += 1
            continue
        # Only Annuals are collected here; skip regular issues.
        if 'annual' not in issue.lower():
            i += 1
            continue

        lent = resultit('a', href=True)  # all the a href's within this tr
        lengtht = len(lent)               # number of anchors in this tr
        # We don't know which anchor carries the story arc, so scan for it;
        # the td following the story arc is the Publication Date.
        n = 0
        while (n < lengtht):
            storyt = lent[n]
            if 'storyarc.php' in storyt:
                storyarc = storyt.findNext(text=True)
                break
            n += 1

        pubd = resultit('td')       # all the <td>'s within this tr
        publen = len(pubd)
        pubs = pubd[publen - 1]     # last <td> always holds the publication date
        pdaters = pubs.findNext(text=True)
        pubdate = re.sub("[^0-9]", "", pdaters)  # keep digits only
        print("Issue : " + str(issue) + " (" + str(pubdate) + ")")

        annualslist.append({
            'AnnualIssue': str(issue),
            'AnnualDate': pubdate
        })
        gcount += 1

        i += 1

    annuals['annualslist'] = annualslist

    print("Issues:" + str(annuals['annualslist']))
    print("There are " + str(gcount) + " issues.")

    annuals['totalissues'] = gcount
    annuals['GCDComicID'] = cbdb_id
    return annuals
|
|
|
|
if __name__ == '__main__':
    # CLI entry point: cbdb.py <comic name> <start year>
    search_title = sys.argv[1]
    search_year = sys.argv[2]
    cbdb(search_title, search_year)