mylar/mylar/cv.py

#  This file is part of Mylar.
#
#  Mylar is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
#  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
#  License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with Mylar.  If not, see <http://www.gnu.org/licenses/>.


import sys
import os
import re
import logger
import string
import urllib
import lib.feedparser
import mylar
from bs4 import BeautifulSoup as Soup

def pulldetails(comicid,type,issueid=None,offset=1):
    import urllib2

    #import easy to use xml parser called minidom:
    from xml.dom.minidom import parseString

    comicapi='583939a3df0a25fc4e8b7a29934a13078002dc27'
    if type == 'comic':
        if not comicid.startswith('4050-'): comicid = '4050-' + comicid
        PULLURL= mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck'
    elif type == 'issue':
        if mylar.CV_ONLY:
            cv_type = 'issues'
            searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date'
        else:
            cv_type = 'volume/' + str(comicid)
            searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date'
        PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset)
    elif type == 'firstissue':
        #this is used ONLY for CV_ONLY
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date'
    elif type == 'storyarc':
       PULLURL =  mylar.CVURL + 'story_arc/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date'

    #download the file:
    file = urllib2.urlopen(PULLURL)
    #convert to string:
    data = file.read()
    #close file because we dont need it anymore:
    file.close()
    #parse the xml you downloaded
    dom = parseString(data)

    return dom


def getComic(comicid,type,issueid=None):
    if type == 'issue':
        offset = 1
        issue = {}
        ndic = []
        issuechoice = []
        comicResults = []
        firstdate = '2099-00-00'
        #let's find out how many results we get from the query...
        searched = pulldetails(comicid,'issue',None,0)
        if searched is None: return False
        totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText
        logger.fdebug("there are " + str(totalResults) + " search results...")
        if not totalResults:
            return False
        countResults = 0
        while (countResults < int(totalResults)):
            logger.fdebug("querying " + str(countResults))
            if countResults > 0:
                #new api - have to change to page # instead of offset count
                offsetcount = countResults
                searched = pulldetails(comicid,'issue',None,offsetcount)
            issuechoice,tmpdate = GetIssuesInfo(comicid,searched)
            if tmpdate < firstdate:
                firstdate = tmpdate
            ndic = ndic + issuechoice
            #search results are limited to 100 and by pagination now...let's account for this.
            countResults = countResults + 100

        issue['issuechoice'] = ndic
        issue['firstdate'] = firstdate

        return issue

    elif type == 'comic':
        dom = pulldetails(comicid,'comic',None,1)
        return GetComicInfo(comicid,dom)
    elif type == 'firstissue':
        dom = pulldetails(comicid,'firstissue',issueid,1)
        return GetFirstIssue(issueid,dom)

def GetComicInfo(comicid,dom):

    #comicvine isn't as up-to-date with issue counts..
    #so this can get really buggered, really fast.
    tracks = dom.getElementsByTagName('issue')
    try:
        cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
    except:
        cntit = len(tracks)
    trackcnt = len(tracks)
    logger.fdebug("number of issues I counted: " + str(trackcnt))
    logger.fdebug("number of issues CV says it has: " + str(cntit))
    # if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason
    if int(trackcnt) != int(cntit):
        cntit = trackcnt
        vari = "yes"
    else: vari = "no"
    logger.fdebug("vari is set to: " + str(vari))
    #if str(trackcnt) != str(int(cntit)+2):
    #    cntit = int(cntit) + 1
    comic = {}
    comicchoice = []
    cntit = int(cntit)
    #retrieve the first xml tag (<tag>data</tag>)
    #that the parser finds with name tagName:
    comic['ComicName'] = dom.getElementsByTagName('name')[trackcnt+1].firstChild.wholeText
    comic['ComicName'] = comic['ComicName'].rstrip()
    try:
        comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText
    except:
        comic['ComicYear'] = '0000'
    comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText

    desdeck = 0
    #the description field actually holds the Volume# - so let's grab it
    try:
        descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText
        comic_desc = drophtml(descchunk)
        desdeck +=1
    except:
        comic_desc = 'None'

    #sometimes the deck has volume labels
    try:
        deckchunk = dom.getElementsByTagName('deck')[0].firstChild.wholeText
        comic_deck = deckchunk
        desdeck +=1
    except:
        comic_deck = 'None'

    comic['ComicVersion'] = 'noversion'
    #logger.info('comic_desc:' + comic_desc)
    #logger.info('comic_deck:' + comic_deck)
    #logger.info('desdeck: ' + str(desdeck))
    while (desdeck > 0):
        if desdeck == 1:
            if comic_desc == 'None':
                comicDes = comic_deck[:30]
            else:
                #extract the first 60 characters
                comicDes = comic_desc[:60].replace('New 52', '')
        elif desdeck == 2:
            #extract the characters from the deck
            comicDes = comic_deck[:30].replace('New 52', '')
        else:
            break

        i = 0
        while (i < 2):
            if 'volume' in comicDes.lower():
                #found volume - let's grab it.
                v_find = comicDes.lower().find('volume')
                #arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #)
                #increased to 10 to allow for text numbering (+5 max)
                #sometimes it's volume 5 and ocassionally it's fifth volume.
                if i == 0:
                    vfind = comicDes[v_find:v_find+15]   #if it's volume 5 format
                    basenums = {'zero':'0','one':'1','two':'2','three':'3','four':'4','five':'5','six':'6','seven':'7','eight':'8','nine':'9','ten':'10'}
                    #logger.fdebug(str(i) + ': ' + str(vfind))
                else:
                    vfind = comicDes[:v_find]   # if it's fifth volume format
                    basenums = {'zero':'0','first':'1','second':'2','third':'3','fourth':'4','fifth':'5','sixth':'6','seventh':'7','eighth':'8','nineth':'9','tenth':'10'}
                    #logger.fdebug(str(i) + ': ' + str(vfind))
                volconv = ''
                for nums in basenums:
                    if nums in vfind.lower():
                        sconv = basenums[nums]
                        vfind = re.sub(nums, sconv, vfind.lower())
                        break
                #logger.fdebug('volconv: ' + str(volconv))

                #now we attempt to find the character position after the word 'volume'
                if i == 0:
                    volthis = vfind.lower().find('volume')
                    volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit
                    vfind = vfind[volthis:volthis+4] #grab the next 4 characters ;)
                elif i == 1:
                    volthis = vfind.lower().find('volume')
                    vfind = vfind[volthis-4:volthis] #grab the next 4 characters ;)

                if '(' in vfind:
                    #bracket detected in versioning'
                    vfindit = re.findall('[^()]+', vfind)
                    vfind = vfindit[0]
                vf = re.findall('[^<>]+', vfind)
                ledigit = re.sub("[^0-9]", "", vf[0])
                if ledigit != '':
                    comic['ComicVersion'] = ledigit
                    logger.info("Volume information found! Adding to series record : volume " + comic['ComicVersion'])
                    break
                i+=1
            else:
                i+=1

        if comic['ComicVersion'] == 'noversion':
            logger.info('comic[ComicVersion]:' + str(comic['ComicVersion']))
            desdeck -=1
        else:
            break

    if vari == "yes":
        comic['ComicIssues'] = str(cntit)
    else:
        comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
    comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText

    try:
        comic['ComicPublisher'] = dom.getElementsByTagName('name')[trackcnt+2].firstChild.wholeText
    except:
        comic['ComicPublisher'] = "Unknown"

    comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText

#    print ("fistIss:" + str(comic['FirstIssueID']))
#    comicchoice.append({
#        'ComicName':              comic['ComicName'],
#        'ComicYear':              comic['ComicYear'],
#        'Comicid':                comicid,
#        'ComicURL':               comic['ComicURL'],
#        'ComicIssues':            comic['ComicIssues'],
#        'ComicImage':             comic['ComicImage'],
#        'ComicVolume':            ParseVol,
#        'ComicPublisher':         comic['ComicPublisher']
#        })

#    comic['comicchoice'] = comicchoice
    return comic

def GetIssuesInfo(comicid,dom):
    subtracks = dom.getElementsByTagName('issue')
    if not mylar.CV_ONLY:
        cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
        logger.fdebug("issues I've counted: " + str(len(subtracks)))
        logger.fdebug("issues CV says it has: " + str(int(cntiss)))

        if int(len(subtracks)) != int(cntiss):
            logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks)))
            cntiss = len(subtracks) # assume count of issues is wrong, go with ACTUAL physical api count
        cntiss = int(cntiss)
        n = cntiss-1
    else:
        n = int(len(subtracks))
    tempissue = {}
    issuech = []
    firstdate = '2099-00-00'
    for subtrack in subtracks:
        if not mylar.CV_ONLY:
            if (dom.getElementsByTagName('name')[n].firstChild) is not None:
                issue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
            else:
                issue['Issue_Name'] = 'None'

            issue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText
            issue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText

            issuech.append({
                'Issue_ID':                issue['Issue_ID'],
                'Issue_Number':            issue['Issue_Number'],
                'Issue_Name':              issue['Issue_Name']
                })
        else:
            try:
                tempissue['Issue_Name'] = subtrack.getElementsByTagName('name')[0].firstChild.wholeText
            except:
                tempissue['Issue_Name'] = 'None'
            tempissue['Issue_ID'] = subtrack.getElementsByTagName('id')[0].firstChild.wholeText
            try:
                tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText
            except:
                tempissue['CoverDate'] = '0000-00-00'
            try:
                tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText
            except:
                tempissue['StoreDate'] = '0000-00-00'
            tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText
            issuech.append({
                'Comic_ID':                comicid,
                'Issue_ID':                tempissue['Issue_ID'],
                'Issue_Number':            tempissue['Issue_Number'],
                'Issue_Date':              tempissue['CoverDate'],
                'Store_Date':              tempissue['StoreDate'],
                'Issue_Name':              tempissue['Issue_Name']
                })

            if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00':
                firstdate = tempissue['CoverDate']
        n-=1

    #issue['firstdate'] = firstdate
    return issuech, firstdate

def GetFirstIssue(issueid,dom):
    #if the Series Year doesn't exist, get the first issue and take the date from that
    try:
        first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText
    except:
        first_year = '0000'
        return first_year

    the_year = first_year[:4]
    the_month = first_year[5:7]
    the_date = the_year + '-' + the_month

    return the_year


def drophtml(html):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)

    text_parts = soup.findAll(text=True)
    #print ''.join(text_parts)
    return ''.join(text_parts)