# mylar/mylar/comicbookdb.py


from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import re
import helpers
import logger
import datetime
import sys
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime

def cbdb(comicnm, ComicYear):
    """Search comicbookdb.com for a series and return its annuals.

    Queries the mobile comicbookdb.com title search for ``comicnm``, walks
    the ``title.php`` links on the result page and picks the first entry
    whose cleaned name matches and whose year equals ``ComicYear`` or
    ``ComicYear + 1``.

    Args:
        comicnm: Series name to search for (spaces are sent as ``+``).
        ComicYear: Series start year; anything ``int()`` accepts.

    Returns:
        The dict produced by ``IssueDetails`` for the matched title. When
        no search result matches, an empty result with the same keys
        (``annualslist`` empty, ``totalissues`` 0, ``GCDComicID`` None) is
        returned rather than looking up an unrelated title.

    Raises:
        urllib2.URLError: if the search page cannot be fetched.
        ValueError: if ``ComicYear`` is not convertible to ``int``.
    """
    # Same character class as before, written as a raw string.
    punctuation = r"[\,\.\:\;'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]"

    comicnm = re.sub(' ', '+', comicnm)
    search_url = "http://mobile.comicbookdb.com/search.php?form_search=" + str(comicnm) + "&form_searchtype=Title&x=0&y=0"
    response = urllib2.urlopen(search_url)
    soup = BeautifulSoup(response)

    # The search term never changes, so normalise it once up front.
    CleanComicName = re.sub(punctuation, '', comicnm)
    CleanComicName = re.sub(' ', '', CleanComicName).lower()

    matched_id = None
    for position, link in enumerate(soup.findAll('a', href=True)):
        print("titlet: " + str(link))
        href = link['href']
        # Only title links that actually carry an ID are usable results.
        if "title.php" not in href or '?ID=' not in href:
            continue
        print("found title")

        title_text = link.findNext(text=True)
        if title_text is None:
            continue
        print("tempName: " + title_text)

        # Result text looks like "Name (Year)".
        paren = title_text.find("(")
        result_name = title_text[:paren]
        print("ComicName: " + result_name)
        result_year = title_text[paren + 1:title_text.find(")")]
        if not result_year.isdigit():
            continue
        print("ComicYear: " + result_year)

        print("CBDB URL: " + href)
        result_id = href[href.find('?ID=') + 4:]
        print("CBDB ID: " + result_id)

        print("resultname: " + result_name)
        CleanResultName = re.sub(punctuation, '', result_name)
        CleanResultName = re.sub(' ', '', CleanResultName).lower()
        print("CleanComicName: " + CleanComicName)
        print("CleanResultName: " + CleanResultName)

        # [3:] lets a result with a three-letter prefix (e.g. "The") match.
        # NOTE(review): the length-equality clause accepts any result whose
        # cleaned name merely has the same length as the search term; kept
        # for backward compatibility, but it looks over-permissive - confirm.
        name_matches = (CleanResultName == CleanComicName
                        or CleanResultName[3:] == CleanComicName
                        or len(CleanComicName) == len(CleanResultName))
        if not name_matches:
            continue

        print("i:" + str(position) + "...matched by name to Mylar!")
        print("ComicYear: " + str(ComicYear) + ".. to ResultYear: " + str(result_year))
        wanted_year = int(ComicYear)
        if int(result_year) in (wanted_year, wanted_year + 1):
            print("Matchumundo!")
            matched_id = str(result_id)
            break

    if matched_id is None:
        # Nothing matched: do not fall through with the ID of whichever
        # title link happened to be scanned last.
        return {'annualslist': [], 'totalissues': 0, 'GCDComicID': None}
    return IssueDetails(matched_id)


def IssueDetails(cbdb_id):
    """Scrape a comicbookdb.com title page and collect its annual issues.

    Fetches ``title.php?ID=<cbdb_id>``, locates the first table (after the
    outermost one) that contains a ``page_link`` anchor, and records every
    row of it whose issue text contains "annual" (case-insensitive).

    Args:
        cbdb_id: comicbookdb.com title ID; converted with ``str()`` for
            the URL.

    Returns:
        dict with keys:
            'annualslist': list of {'AnnualIssue': str, 'AnnualDate': str},
                where the date is the last cell's text reduced to digits;
            'totalissues': number of entries in 'annualslist';
            'GCDComicID':  ``cbdb_id`` exactly as passed in.

    Raises:
        urllib2.URLError: if the title page cannot be fetched.
    """
    pagethis = 'http://comicbookdb.com/title.php?ID=' + str(cbdb_id)

    response = urllib2.urlopen(pagethis)
    soup = BeautifulSoup(response)
    resultp = soup.findAll("table")

    # The header details are only printed; never let them abort the scrape.
    header = resultp[0].find("table", {"width": "884"}) if resultp else None
    if header is not None:
        _print_title_header(header)

    # Skip index 0 (the outermost table wraps the whole page) and take the
    # first table that holds issue links. Stays empty when none is found.
    tableno = []
    for table in resultp[1:]:
        if table.find("a", {"class": "page_link"}) is not None:
            tableno = table.findAll('tr')
            break
    print("there are " + str(len(tableno)) + " issues total (cover variations, et all).")

    annualslist = []
    # Start at the second row so the table headers are not treated as an issue.
    for row in tableno[1:]:
        issue_link = row.find("a", {"class": "page_link"})
        if issue_link is None:
            continue  # spacer row without an issue link
        issue = issue_link.findNext(text=True)
        if issue is None or 'annual' not in issue.lower():
            continue

        cells = row('td')
        if not cells:
            continue
        # The publication date is taken from the last <td> of the row.
        date_text = cells[-1].findNext(text=True)
        pubdate = re.sub("[^0-9]", "", date_text or "")
        print("Issue : " + str(issue) + "  (" + str(pubdate) + ")")

        annualslist.append({
            'AnnualIssue':             str(issue),
            'AnnualDate':              pubdate
            })

    annuals = {
        'annualslist': annualslist,
        'totalissues': len(annualslist),
        'GCDComicID': cbdb_id,
    }

    print("Issues:" + str(annuals['annualslist']))
    print("There are " + str(annuals['totalissues']) + " issues.")
    return annuals


def _print_title_header(header):
    """Print publisher, publication date and issue count from the title header table."""
    anchors = header('a')
    publisher_found = False
    for index, label in enumerate(header.findAll('strong')):
        if not publisher_found:
            # NOTE(review): the anchor is looked up by the <strong> tag's
            # index, as the original code did - confirm that pairing is
            # intended. Out-of-range or href-less anchors are skipped.
            href = anchors[index].get('href', '') if index < len(anchors) else ''
            if "publisher.php?" in href:
                print(href)
                print("publisher: " + str(anchors[index].contents))
                publisher_found = True
        elif 'Publication Date: ' in label:
            # %s formatting copes with a missing or non-text sibling.
            print("publication date: %s" % (label.nextSibling,))
        elif 'Number of issues cataloged: ' in label:
            print("number of issues: %s" % (label.nextSibling,))

if __name__ == '__main__':
    # Command-line use: first argument is the series name, second the year.
    cli_name, cli_year = sys.argv[1], sys.argv[2]
    cbdb(cli_name, cli_year)