mylar/mylar/cv.py

1094 lines
51 KiB
Python
Executable File

# This file is part of Mylar.
#
# Mylar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mylar is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
# License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mylar. If not, see <http://www.gnu.org/licenses/>.
import sys
import os
import re
import time
import logger
import string
import urllib2
import lib.feedparser
import mylar
import platform
from bs4 import BeautifulSoup as Soup
from xml.parsers.expat import ExpatError
import httplib
import requests
def patch_http_response_read(func):
    """Wrap an HTTPResponse.read-style callable so that a truncated response
    returns the bytes received so far instead of raising IncompleteRead.

    ComicVine occasionally closes connections mid-transfer; swallowing
    IncompleteRead and returning the partial payload lets the XML parser
    attempt to work with whatever arrived.
    """
    def inner(*args):
        try:
            return func(*args)
        # NOTE: modernized from the legacy 'except X, e' comma syntax (removed
        # in Python 3); the file already uses 'as e' elsewhere (see pulldetails).
        except httplib.IncompleteRead as e:
            # e.partial holds the bytes read before the connection dropped.
            return e.partial
    return inner
# Monkey-patch the global HTTPResponse.read so every HTTP fetch in this
# process tolerates truncated (IncompleteRead) responses from ComicVine.
httplib.HTTPResponse.read = patch_http_response_read(httplib.HTTPResponse.read)

# Force HTTP/1.0 on exactly Python 2.7.6 — presumably a workaround for an
# httplib chunked-transfer defect in that specific release; TODO confirm the
# underlying issue before removing.
if platform.python_version() == '2.7.6':
    httplib.HTTPConnection._http_vsn = 10
    httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
def pulldetails(comicid, type, issueid=None, offset=1, arclist=None, comicidlist=None):
    """Issue one ComicVine API request and return the parsed XML DOM.

    comicid     : CV volume id (the '4050-' prefix is added if missing) for
                  volume-scoped queries.
    type        : selects the endpoint / field list - one of 'comic', 'issue',
                  'image', 'firstissue', 'storyarc', 'comicyears', 'import',
                  'update_dates'.
    issueid     : CV issue id (or arc name for 'storyarc') for issue-scoped queries.
    offset      : pagination offset forwarded to the CV API.
    arclist     : pipe-delimited issue-id filter used when pulling arc issues.
    comicidlist : pipe-delimited id filter used by the bulk endpoints.

    Returns the minidom document, or None on any failure (missing API key,
    request error, rate-limit ban, or unparseable response).
    """
    #import easy to use xml parser called minidom:
    from xml.dom.minidom import parseString

    if mylar.CONFIG.COMICVINE_API == 'None' or mylar.CONFIG.COMICVINE_API is None:
        logger.warn('You have not specified your own ComicVine API key - it\'s a requirement. Get your own @ http://api.comicvine.com.')
        return
    else:
        comicapi = mylar.CONFIG.COMICVINE_API

    # Build the endpoint URL for the requested query type.
    if type == 'comic':
        if not comicid.startswith('4050-'): comicid = '4050-' + comicid
        PULLURL = mylar.CVURL + 'volume/' + str(comicid) + '/?api_key=' + str(comicapi) + '&format=xml&field_list=name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,first_issue,deck,aliases'
    elif type == 'issue':
        if mylar.CONFIG.CV_ONLY:
            cv_type = 'issues'
            if arclist is None:
                searchset = 'filter=volume:' + str(comicid) + '&field_list=cover_date,description,id,image,issue_number,name,date_last_updated,store_date'
            else:
                searchset = 'filter=id:' + (arclist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume'
        else:
            cv_type = 'volume/' + str(comicid)
            searchset = 'name,count_of_issues,issues,start_year,site_detail_url,image,publisher,description,store_date'
        PULLURL = mylar.CVURL + str(cv_type) + '/?api_key=' + str(comicapi) + '&format=xml&' + str(searchset) + '&offset=' + str(offset)
    elif any([type == 'image', type == 'firstissue']):
        #this is used ONLY for CV_ONLY
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(issueid) + '&field_list=cover_date,image'
    elif type == 'storyarc':
        PULLURL = mylar.CVURL + 'story_arcs/?api_key=' + str(comicapi) + '&format=xml&filter=name:' + str(issueid) + '&field_list=cover_date'
    elif type == 'comicyears':
        PULLURL = mylar.CVURL + 'volumes/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + str(comicidlist) + '&field_list=name,id,start_year,publisher,description,deck,aliases&offset=' + str(offset)
    elif type == 'import':
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + (comicidlist) + '&field_list=cover_date,id,issue_number,name,date_last_updated,store_date,volume' + '&offset=' + str(offset)
    elif type == 'update_dates':
        PULLURL = mylar.CVURL + 'issues/?api_key=' + str(comicapi) + '&format=xml&filter=id:' + (comicidlist)+ '&field_list=date_last_updated, id, issue_number, store_date, cover_date, name, volume ' + '&offset=' + str(offset)

    #logger.info('CV.PULLURL: ' + PULLURL)

    #new CV API restriction - one api request / second.
    if mylar.CONFIG.CVAPI_RATE is None or mylar.CONFIG.CVAPI_RATE < 2:
        time.sleep(2)
    else:
        time.sleep(mylar.CONFIG.CVAPI_RATE)

    #download the file:
    #set payload to None for now...
    payload = None

    try:
        r = requests.get(PULLURL, params=payload, verify=mylar.CONFIG.CV_VERIFY, headers=mylar.CV_HEADERS)
    # NOTE: modernized from 'except Exception, e' - the Py3-incompatible comma
    # form; the ExpatError handler below already used the 'as e' style.
    except Exception as e:
        logger.warn('Error fetching data from ComicVine: %s' % (e))
        return

    #logger.fdebug('cv status code : ' + str(r.status_code))

    try:
        dom = parseString(r.content)
    except ExpatError:
        # Non-XML payload: either CV's rate-limit ban page or a transient outage.
        if u'<title>Abnormal Traffic Detected' in r.content:
            logger.error('ComicVine has banned this server\'s IP address because it exceeded the API rate limit.')
        else:
            logger.warn('[WARNING] ComicVine is not responding correctly at the moment. This is usually due to some problems on their end. If you re-try things again in a few moments, things might work')
        return
    except Exception as e:
        logger.warn('[ERROR] Error returned from CV: %s' % e)
        return
    else:
        return dom
def getComic(comicid, type, issueid=None, arc=None, arcid=None, arclist=None, comicidlist=None):
    """Dispatch a ComicVine query by `type` and post-process the response.

    Depending on `type`, delegates to pulldetails() and then to the matching
    parser (GetIssuesInfo, GetComicInfo, Getissue, GetSeriesYears,
    GetImportList, UpdateDates). Returns the parser's result, or False when
    an 'issue' query yields nothing.
    """
    if type == 'issue':
        offset = 1
        issue = {}
        ndic = []
        issuechoice = []
        comicResults = []
        firstdate = '2099-00-00'  # sentinel later than any real cover date
        #let's find out how many results we get from the query...
        if comicid is None:
            #if comicid is None, it's coming from the story arc search results.
            id = arcid
            #since the arclist holds the issueids, and the pertinent reading order - we need to strip out the reading order so this works.
            aclist = ''
            if arclist.startswith('M'):
                # 'M'-prefixed arclist is already a plain issue-id list.
                islist = arclist[1:]
            else:
                # each '|' entry is 'issueid,readingorder' - keep only the id.
                for ac in arclist.split('|'):
                    aclist += ac[:ac.find(',')] + '|'
                if aclist.endswith('|'):
                    aclist = aclist[:-1]
                islist = aclist
        else:
            id = comicid
            islist = None
        searched = pulldetails(id, 'issue', None, 0, islist)
        if searched is None:
            return False
        totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText
        logger.fdebug("there are " + str(totalResults) + " search results...")
        if not totalResults:
            return False
        countResults = 0
        # CV pages results 100 at a time; loop until all pages are consumed.
        while (countResults < int(totalResults)):
            logger.fdebug("querying range from " + str(countResults) + " to " + str(countResults + 100))
            if countResults > 0:
                #new api - have to change to page # instead of offset count
                offsetcount = countResults
                searched = pulldetails(id, 'issue', None, offsetcount, islist)
            issuechoice, tmpdate = GetIssuesInfo(id, searched, arcid)
            # track the earliest cover date seen across all pages.
            if tmpdate < firstdate:
                firstdate = tmpdate
            ndic = ndic + issuechoice
            #search results are limited to 100 and by pagination now...let's account for this.
            countResults = countResults + 100
        issue['issuechoice'] = ndic
        issue['firstdate'] = firstdate
        return issue

    elif type == 'comic':
        dom = pulldetails(comicid, 'comic', None, 1)
        return GetComicInfo(comicid, dom)
    elif any([type == 'image', type == 'firstissue']):
        dom = pulldetails(comicid, type, issueid, 1)
        return Getissue(issueid, dom, type)
    elif type == 'storyarc':
        dom = pulldetails(arc, 'storyarc', None, 1)
        return GetComicInfo(issueid, dom)
    elif type == 'comicyears':
        #used by the story arc searcher when adding a given arc to poll each ComicID in order to populate the Series Year & volume (hopefully).
        #this grabs each issue based on issueid, and then subsets the comicid for each to be used later.
        #set the offset to 0, since we're doing a filter.
        dom = pulldetails(arcid, 'comicyears', offset=0, comicidlist=comicidlist)
        return GetSeriesYears(dom)
    elif type == 'import':
        #used by the importer when doing a scan with metatagging enabled. If metatagging comes back true, then there's an IssueID present
        #within the tagging (with CT). This compiles all of the IssueID's during a scan (in 100's), and returns the corresponding CV data
        #related to the given IssueID's - namely ComicID, Name, Volume (more at some point, but those are the important ones).
        offset = 1
        id_count = 0
        import_list = []
        logger.fdebug('comicidlist:' + str(comicidlist))
        while id_count < len(comicidlist):
            #break it up by 100 per api hit
            #do the first 100 regardless
            in_cnt = 0
            if id_count + 100 <= len(comicidlist):
                endcnt = id_count + 100
            else:
                endcnt = len(comicidlist)
            # join this batch of ids into a pipe-delimited filter string.
            for i in range(id_count, endcnt):
                if in_cnt == 0:
                    tmpidlist = str(comicidlist[i])
                else:
                    tmpidlist += '|' + str(comicidlist[i])
                in_cnt +=1
            logger.fdebug('tmpidlist: ' + str(tmpidlist))
            searched = pulldetails(None, 'import', offset=0, comicidlist=tmpidlist)
            if searched is None:
                break
            else:
                tGIL = GetImportList(searched)
                import_list += tGIL
            id_count +=100
        return import_list
    elif type == 'update_dates':
        dom = pulldetails(None, 'update_dates', offset=1, comicidlist=comicidlist)
        return UpdateDates(dom)
def GetComicInfo(comicid, dom, safechk=None):
    """Parse a ComicVine volume DOM into a `comic` dict.

    comicid : CV comic id (used only for the retry recursion).
    dom     : minidom document from pulldetails(..., 'comic', ...).
    safechk : internal retry counter - None on first call, aborts past 4.

    Returns a dict with keys such as ComicName, ComicYear, ComicURL,
    ComicVersion, Type, Issue_List, ComicIssues, ComicImage(ALT),
    FirstIssueID - or None when CV data could not be retrieved.

    FIX: the retry path for a missing site_detail_url previously called
    itself recursively but discarded the result and fell through with
    comic['ComicURL'] unset; the recursive result is now returned.
    """
    if safechk is None:
        #safetycheck when checking comicvine. If it times out, increment the chk on retry attempts up until 5 tries then abort.
        safechk = 1
    elif safechk > 4:
        logger.error('Unable to add / refresh the series due to inablity to retrieve data from ComicVine. You might want to try abit later and/or make sure ComicVine is up.')
        return

    #comicvine isn't as up-to-date with issue counts..
    #so this can get really buggered, really fast.
    tracks = dom.getElementsByTagName('issue')
    try:
        cntit = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
    except:
        cntit = len(tracks)
    trackcnt = len(tracks)
    logger.fdebug("number of issues I counted: " + str(trackcnt))
    logger.fdebug("number of issues CV says it has: " + str(cntit))
    # if the two don't match, use trackcnt as count_of_issues might be not upto-date for some reason
    if int(trackcnt) != int(cntit):
        cntit = trackcnt
        vari = "yes"
    else: vari = "no"
    logger.fdebug("vari is set to: " + str(vari))
    #if str(trackcnt) != str(int(cntit)+2):
    #    cntit = int(cntit) + 1

    comic = {}
    comicchoice = []
    cntit = int(cntit)

    #retrieve the first xml tag (<tag>data</tag>)
    #that the parser finds with name tagName:
    # to return the parent name of the <name> node : dom.getElementsByTagName('name')[0].parentNode.nodeName
    # where [0] denotes the number of the name field(s)
    # where nodeName denotes the parentNode : ComicName = results, publisher = publisher, issues = issue
    try:
        names = len(dom.getElementsByTagName('name'))
        n = 0
        comic['ComicPublisher'] = 'Unknown' #set this to a default value here so that it will carry through properly
        while (n < names):
            if dom.getElementsByTagName('name')[n].parentNode.nodeName == 'results':
                try:
                    comic['ComicName'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
                    comic['ComicName'] = comic['ComicName'].rstrip()
                except:
                    logger.error('There was a problem retrieving the given data from ComicVine. Ensure that www.comicvine.com is accessible AND that you have provided your OWN ComicVine API key.')
                    return
            elif dom.getElementsByTagName('name')[n].parentNode.nodeName == 'publisher':
                try:
                    comic['ComicPublisher'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
                except:
                    comic['ComicPublisher'] = "Unknown"
            n += 1
    except:
        logger.warn('Something went wrong retrieving from ComicVine. Ensure your API is up-to-date and that comicvine is accessible')
        return

    try:
        comic['ComicYear'] = dom.getElementsByTagName('start_year')[0].firstChild.wholeText
    except:
        comic['ComicYear'] = '0000'

    #safety check, cause you known, dufus'...
    if any([comic['ComicYear'][-1:] == '-', comic['ComicYear'][-1:] == '?']):
        comic['ComicYear'] = comic['ComicYear'][:-1]

    try:
        comic['ComicURL'] = dom.getElementsByTagName('site_detail_url')[trackcnt].firstChild.wholeText
    except:
        #this should never be an exception. If it is, it's probably due to CV timing out - so let's sleep for abit then retry.
        logger.warn('Unable to retrieve URL for volume. This is usually due to a timeout to CV, or going over the API. Retrying again in 10s.')
        time.sleep(10)
        safechk += 1
        # BUGFIX: return the retried result instead of discarding it and
        # continuing with an incomplete comic dict.
        return GetComicInfo(comicid, dom, safechk)

    desdeck = 0
    #the description field actually holds the Volume# - so let's grab it
    desc_soup = None
    try:
        descchunk = dom.getElementsByTagName('description')[0].firstChild.wholeText
        desc_soup = Soup(descchunk, "html.parser")
        desclinks = desc_soup.findAll('a')
        comic_desc = drophtml(descchunk)
        desdeck += 1
    except:
        comic_desc = 'None'

    #sometimes the deck has volume labels
    try:
        deckchunk = dom.getElementsByTagName('deck')[0].firstChild.wholeText
        comic_deck = deckchunk
        desdeck += 1
    except:
        comic_deck = 'None'

    #comic['ComicDescription'] = comic_desc

    try:
        comic['Aliases'] = dom.getElementsByTagName('aliases')[0].firstChild.wholeText
        comic['Aliases'] = re.sub('\n', '##', comic['Aliases']).strip()
        if comic['Aliases'][-2:] == '##':
            comic['Aliases'] = comic['Aliases'][:-2]
        #logger.fdebug('Aliases: ' + str(aliases))
    except:
        comic['Aliases'] = 'None'

    comic['ComicVersion'] = 'None' #noversion'

    #figure out if it's a print / digital edition.
    comic['Type'] = 'None'
    if comic_deck != 'None':
        if any(['print' in comic_deck.lower(), 'digital' in comic_deck.lower(), 'paperback' in comic_deck.lower(), 'one shot' in re.sub('-', '', comic_deck.lower()).strip(), 'hardcover' in comic_deck.lower()]):
            if all(['print' in comic_deck.lower(), 'reprint' not in comic_deck.lower()]):
                comic['Type'] = 'Print'
            elif 'digital' in comic_deck.lower():
                comic['Type'] = 'Digital'
            elif 'paperback' in comic_deck.lower():
                comic['Type'] = 'TPB'
            elif 'hardcover' in comic_deck.lower():
                comic['Type'] = 'HC'
            elif 'oneshot' in re.sub('-', '', comic_deck.lower()).strip():
                comic['Type'] = 'One-Shot'
            else:
                comic['Type'] = 'Print'

    if comic_desc != 'None' and comic['Type'] == 'None':
        # the deck gave no hints - scan the start of the description instead.
        if 'print' in comic_desc[:60].lower() and all(['for the printed edition' not in comic_desc.lower(), 'print edition can be found' not in comic_desc.lower(), 'reprints' not in comic_desc.lower()]):
            comic['Type'] = 'Print'
        elif 'digital' in comic_desc[:60].lower() and 'digital edition can be found' not in comic_desc.lower():
            comic['Type'] = 'Digital'
        elif all(['paperback' in comic_desc[:60].lower(), 'paperback can be found' not in comic_desc.lower()]) or 'collects' in comic_desc[:60].lower():
            comic['Type'] = 'TPB'
        elif 'hardcover' in comic_desc[:60].lower() and 'hardcover can be found' not in comic_desc.lower():
            comic['Type'] = 'HC'
        elif any(['one-shot' in comic_desc[:60].lower(), 'one shot' in comic_desc[:60].lower()]) and any(['can be found' not in comic_desc.lower(), 'following the' not in comic_desc.lower(), 'after the' not in comic_desc.lower()]):
            # make sure 'one-shot' isn't merely a reference to another book.
            i = 0
            comic['Type'] = 'One-Shot'
            avoidwords = ['preceding', 'after the', 'following the']
            while i < 2:
                if i == 0:
                    cbd = 'one-shot'
                elif i == 1:
                    cbd = 'one shot'
                tmp1 = comic_desc[:60].lower().find(cbd)
                if tmp1 != -1:
                    for x in avoidwords:
                        tmp2 = comic_desc[:tmp1].lower().find(x)
                        if tmp2 != -1:
                            logger.fdebug('FAKE NEWS: caught incorrect reference to one-shot. Forcing to Print')
                            comic['Type'] = 'Print'
                            i = 3
                            break
                i += 1
        else:
            comic['Type'] = 'Print'

    if all([comic_desc != 'None', 'trade paperback' in comic_desc[:30].lower(), 'collecting' in comic_desc[:40].lower()]):
        #ie. Trade paperback collecting Marvel Team-Up #9-11, 48-51, 72, 110 & 145.
        first_collect = comic_desc.lower().find('collecting')
        #logger.info('first_collect: %s' % first_collect)
        #logger.info('comic_desc: %s' % comic_desc)
        #logger.info('desclinks: %s' % desclinks)
        issue_list = []
        micdrop = []
        if desc_soup is not None:
            #if it's point form bullets, ignore it cause it's not the current volume stuff.
            test_it = desc_soup.find('ul')
            if test_it:
                for x in test_it.findAll('li'):
                    if any(['Next' in x.findNext(text=True), 'Previous' in x.findNext(text=True)]):
                        mic_check = x.find('a')
                        micdrop.append(mic_check['data-ref-id'])

        for fc in desclinks:
            try:
                fc_id = fc['data-ref-id']
            except:
                continue
            if fc_id in micdrop:
                continue
            fc_name = fc.findNext(text=True)
            if fc_id.startswith('4000'):
                # 4000-prefixed ids are issues; name carries '#<range>'.
                fc_cid = None
                fc_isid = fc_id
                iss_start = fc_name.find('#')
                issuerun = fc_name[iss_start:].strip()
                fc_name = fc_name[:iss_start].strip()
            elif fc_id.startswith('4050'):
                # 4050-prefixed ids are volumes; issue range trails the link.
                fc_cid = fc_id
                fc_isid = None
                issuerun = fc.next_sibling
                if issuerun is not None:
                    lines = re.sub("[^0-9]", ' ', issuerun).strip().split(' ')
                    if len(lines) > 0:
                        # trim the run at the last number actually in it.
                        for x in sorted(lines, reverse=True):
                            srchline = issuerun.rfind(x)
                            if srchline != -1:
                                try:
                                    if issuerun[srchline+len(x)] == ',' or issuerun[srchline+len(x)] == '.' or issuerun[srchline+len(x)] == ' ':
                                        issuerun = issuerun[:srchline+len(x)]
                                        break
                                except Exception as e:
                                    #logger.warn('[ERROR] %s' % e)
                                    continue
                else:
                    iss_start = fc_name.find('#')
                    issuerun = fc_name[iss_start:].strip()
                    fc_name = fc_name[:iss_start].strip()

                if issuerun.strip().endswith('.') or issuerun.strip().endswith(','):
                    #logger.fdebug('Changed issuerun from %s to %s' % (issuerun, issuerun[:-1]))
                    issuerun = issuerun.strip()[:-1]
                if issuerun.endswith(' and '):
                    issuerun = issuerun[:-4].strip()
                elif issuerun.endswith(' and'):
                    issuerun = issuerun[:-3].strip()
            else:
                continue
            # except:
            #    pass
            issue_list.append({'series': fc_name,
                               'comicid': fc_cid,
                               'issueid': fc_isid,
                               'issues': issuerun})
            #first_collect = cis

        logger.info('Collected issues in volume: %s' % issue_list)
        if len(issue_list) == 0:
            comic['Issue_List'] = 'None'
        else:
            comic['Issue_List'] = issue_list
    else:
        comic['Issue_List'] = 'None'

    # hunt for a volume number in description first, then the deck.
    while (desdeck > 0):
        if desdeck == 1:
            if comic_desc == 'None':
                comicDes = comic_deck[:30]
            else:
                #extract the first 60 characters
                comicDes = comic_desc[:60].replace('New 52', '')
        elif desdeck == 2:
            #extract the characters from the deck
            comicDes = comic_deck[:30].replace('New 52', '')
        else:
            break

        i = 0
        while (i < 2):
            if 'volume' in comicDes.lower():
                #found volume - let's grab it.
                v_find = comicDes.lower().find('volume')
                #arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #)
                #increased to 10 to allow for text numbering (+5 max)
                #sometimes it's volume 5 and ocassionally it's fifth volume.
                if comicDes[v_find+7:comicDes.find(' ', v_find+7)].isdigit():
                    comic['ComicVersion'] = re.sub("[^0-9]", "", comicDes[v_find+7:comicDes.find(' ', v_find+7)]).strip()
                    break
                elif i == 0:
                    vfind = comicDes[v_find:v_find +15] #if it's volume 5 format
                    basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
                    logger.fdebug('volume X format - ' + str(i) + ': ' + vfind)
                else:
                    vfind = comicDes[:v_find] # if it's fifth volume format
                    basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
                    logger.fdebug('X volume format - ' + str(i) + ': ' + vfind)
                volconv = ''
                # convert a spelled-out ordinal/number to its digit form.
                for nums in basenums:
                    if nums in vfind.lower():
                        sconv = basenums[nums]
                        vfind = re.sub(nums, sconv, vfind.lower())
                        break
                #logger.info('volconv: ' + str(volconv))

                #now we attempt to find the character position after the word 'volume'
                if i == 0:
                    volthis = vfind.lower().find('volume')
                    volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit
                    vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;)
                elif i == 1:
                    volthis = vfind.lower().find('volume')
                    vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;)

                if '(' in vfind:
                    #bracket detected in versioning'
                    vfindit = re.findall('[^()]+', vfind)
                    vfind = vfindit[0]
                vf = re.findall('[^<>]+', vfind)
                try:
                    ledigit = re.sub("[^0-9]", "", vf[0])
                    if ledigit != '':
                        comic['ComicVersion'] = ledigit
                        logger.fdebug("Volume information found! Adding to series record : volume " + comic['ComicVersion'])
                        break
                except:
                    pass
                i += 1
            else:
                i += 1

        if comic['ComicVersion'] == 'None':
            logger.fdebug('comic[ComicVersion]:' + str(comic['ComicVersion']))
            desdeck -= 1
        else:
            break

    if vari == "yes":
        comic['ComicIssues'] = str(cntit)
    else:
        comic['ComicIssues'] = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText

    comic['ComicImage'] = dom.getElementsByTagName('super_url')[0].firstChild.wholeText
    comic['ComicImageALT'] = dom.getElementsByTagName('small_url')[0].firstChild.wholeText

    comic['FirstIssueID'] = dom.getElementsByTagName('id')[0].firstChild.wholeText

    #logger.info('comic: %s' % comic)
    return comic
def GetIssuesInfo(comicid, dom, arcid=None):
    """Parse the <issue> nodes of a ComicVine response into a list of dicts.

    comicid : CV comic id stamped onto each entry (CV_ONLY mode, no arc).
    dom     : minidom document from pulldetails(..., 'issue', ...).
    arcid   : when set, entries are shaped for story-arc handling instead.

    Returns (issuech, firstdate): the per-issue dict list and the earliest
    non-zero cover date seen ('2099-00-00' sentinel if none).
    """
    subtracks = dom.getElementsByTagName('issue')
    if not mylar.CONFIG.CV_ONLY:
        # legacy (non-CV_ONLY) mode indexes the whole document by n, newest first.
        cntiss = dom.getElementsByTagName('count_of_issues')[0].firstChild.wholeText
        logger.fdebug("issues I've counted: " + str(len(subtracks)))
        logger.fdebug("issues CV says it has: " + str(int(cntiss)))

        if int(len(subtracks)) != int(cntiss):
            logger.fdebug("CV's count is wrong, I counted different...going with my count for physicals" + str(len(subtracks)))
            cntiss = len(subtracks) # assume count of issues is wrong, go with ACTUAL physical api count
        cntiss = int(cntiss)
        n = cntiss -1
    else:
        n = int(len(subtracks))
    tempissue = {}
    issuech = []
    firstdate = '2099-00-00'  # sentinel later than any real cover date
    for subtrack in subtracks:
        if not mylar.CONFIG.CV_ONLY:
            # NOTE(review): this legacy branch writes to an `issue` dict that is
            # never initialized in this function - it would raise NameError if
            # CV_ONLY were disabled; confirm whether this path is still reachable.
            if (dom.getElementsByTagName('name')[n].firstChild) is not None:
                issue['Issue_Name'] = dom.getElementsByTagName('name')[n].firstChild.wholeText
            else:
                issue['Issue_Name'] = 'None'

            issue['Issue_ID'] = dom.getElementsByTagName('id')[n].firstChild.wholeText
            issue['Issue_Number'] = dom.getElementsByTagName('issue_number')[n].firstChild.wholeText

            issuech.append({
                'Issue_ID': issue['Issue_ID'],
                'Issue_Number': issue['Issue_Number'],
                'Issue_Name': issue['Issue_Name']
                })
        else:
            # CV_ONLY mode: pull everything from this <issue> node itself.
            try:
                totnames = len(subtrack.getElementsByTagName('name'))
                tot = 0
                # <name> appears under both <volume> (series) and <issue> (title).
                while (tot < totnames):
                    if subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'volume':
                        tempissue['ComicName'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText
                    elif subtrack.getElementsByTagName('name')[tot].parentNode.nodeName == 'issue':
                        try:
                            tempissue['Issue_Name'] = subtrack.getElementsByTagName('name')[tot].firstChild.wholeText
                        except:
                            tempissue['Issue_Name'] = None
                    tot += 1
            except:
                tempissue['ComicName'] = 'None'

            try:
                totids = len(subtrack.getElementsByTagName('id'))
                idt = 0
                # <id> likewise appears under both <volume> and <issue>.
                while (idt < totids):
                    if subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'volume':
                        tempissue['Comic_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText
                    elif subtrack.getElementsByTagName('id')[idt].parentNode.nodeName == 'issue':
                        tempissue['Issue_ID'] = subtrack.getElementsByTagName('id')[idt].firstChild.wholeText
                    idt += 1
            except:
                tempissue['Issue_Name'] = 'None'

            try:
                tempissue['CoverDate'] = subtrack.getElementsByTagName('cover_date')[0].firstChild.wholeText
            except:
                tempissue['CoverDate'] = '0000-00-00'

            try:
                tempissue['StoreDate'] = subtrack.getElementsByTagName('store_date')[0].firstChild.wholeText
            except:
                tempissue['StoreDate'] = '0000-00-00'

            try:
                digital_desc = subtrack.getElementsByTagName('description')[0].firstChild.wholeText
            except:
                tempissue['DigitalDate'] = '0000-00-00'
            else:
                tempissue['DigitalDate'] = '0000-00-00'
                # the digital release date, if any, lives in the description tail.
                if all(['digital' in digital_desc.lower()[-90:], 'print' in digital_desc.lower()[-90:]]):
                    #get the digital date of issue here...
                    mff = mylar.filechecker.FileChecker()
                    vlddate = mff.checkthedate(digital_desc[-90:], fulldate=True)
                    #logger.fdebug('vlddate: %s' % vlddate)
                    if vlddate:
                        tempissue['DigitalDate'] = vlddate

            try:
                tempissue['Issue_Number'] = subtrack.getElementsByTagName('issue_number')[0].firstChild.wholeText
            except:
                logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')

            try:
                tempissue['ComicImage'] = subtrack.getElementsByTagName('small_url')[0].firstChild.wholeText
            except:
                tempissue['ComicImage'] = 'None'

            try:
                tempissue['ComicImageALT'] = subtrack.getElementsByTagName('medium_url')[0].firstChild.wholeText
            except:
                tempissue['ComicImageALT'] = 'None'

            if arcid is None:
                issuech.append({
                    'Comic_ID': comicid,
                    'Issue_ID': tempissue['Issue_ID'],
                    'Issue_Number': tempissue['Issue_Number'],
                    'Issue_Date': tempissue['CoverDate'],
                    'Store_Date': tempissue['StoreDate'],
                    'Digital_Date': tempissue['DigitalDate'],
                    'Issue_Name': tempissue['Issue_Name'],
                    'Image': tempissue['ComicImage'],
                    'ImageALT': tempissue['ComicImageALT']
                    })
            else:
                issuech.append({
                    'ArcID': arcid,
                    'ComicName': tempissue['ComicName'],
                    'ComicID': tempissue['Comic_ID'],
                    'IssueID': tempissue['Issue_ID'],
                    'Issue_Number': tempissue['Issue_Number'],
                    'Issue_Date': tempissue['CoverDate'],
                    'Store_Date': tempissue['StoreDate'],
                    'Digital_Date': tempissue['DigitalDate'],
                    'Issue_Name': tempissue['Issue_Name']
                    })

        # track the earliest real cover date across the page.
        if tempissue['CoverDate'] < firstdate and tempissue['CoverDate'] != '0000-00-00':
            firstdate = tempissue['CoverDate']
        n-= 1

    #logger.fdebug('issue_info: %s' % issuech)
    #issue['firstdate'] = firstdate
    return issuech, firstdate
def Getissue(issueid, dom, type):
    """Extract either the first-issue year or the cover image URLs from an
    issue DOM.

    issueid : unused here - kept for call-signature compatibility.
    dom     : minidom document from pulldetails(..., 'image'/'firstissue', ...).
    type    : 'firstissue' returns the 4-digit cover year ('0000' when the
              cover_date is missing); anything else returns a dict with
              'image' / 'image_alt' URLs (None when absent).
    """
    #if the Series Year doesn't exist, get the first issue and take the date from that
    if type == 'firstissue':
        try:
            first_year = dom.getElementsByTagName('cover_date')[0].firstChild.wholeText
        except:
            first_year = '0000'
            return first_year
        # cover_date is 'YYYY-MM-DD'; only the year is needed.
        # (removed dead locals the_month/the_date - they were never used)
        the_year = first_year[:4]
        return the_year
    else:
        try:
            image = dom.getElementsByTagName('super_url')[0].firstChild.wholeText
        except:
            image = None
        try:
            image_alt = dom.getElementsByTagName('small_url')[0].firstChild.wholeText
        except:
            image_alt = None
        return {'image': image,
                'image_alt': image_alt}
def GetSeriesYears(dom):
#used by the 'add a story arc' option to individually populate the Series Year for each series within the given arc.
#series year is required for alot of functionality.
series = dom.getElementsByTagName('volume')
tempseries = {}
serieslist = []
for dm in series:
try:
totids = len(dm.getElementsByTagName('id'))
idc = 0
while (idc < totids):
if dm.getElementsByTagName('id')[idc].parentNode.nodeName == 'volume':
tempseries['ComicID'] = dm.getElementsByTagName('id')[idc].firstChild.wholeText
idc+=1
except:
logger.warn('There was a problem retrieving a comicid for a series within the arc. This will have to manually corrected most likely.')
tempseries['ComicID'] = 'None'
tempseries['Series'] = 'None'
tempseries['Publisher'] = 'None'
try:
totnames = len(dm.getElementsByTagName('name'))
namesc = 0
while (namesc < totnames):
if dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'volume':
tempseries['Series'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText
elif dm.getElementsByTagName('name')[namesc].parentNode.nodeName == 'publisher':
tempseries['Publisher'] = dm.getElementsByTagName('name')[namesc].firstChild.wholeText
namesc+=1
except:
logger.warn('There was a problem retrieving a Series Name or Publisher for a series within the arc. This will have to manually corrected.')
try:
tempseries['SeriesYear'] = dm.getElementsByTagName('start_year')[0].firstChild.wholeText
except:
logger.warn('There was a problem retrieving the start year for a particular series within the story arc.')
tempseries['SeriesYear'] = '0000'
#cause you know, dufus'...
if tempseries['SeriesYear'][-1:] == '-':
tempseries['SeriesYear'] = tempseries['SeriesYear'][:-1]
desdeck = 0
#the description field actually holds the Volume# - so let's grab it
desc_soup = None
try:
descchunk = dm.getElementsByTagName('description')[0].firstChild.wholeText
desc_soup = Soup(descchunk, "html.parser")
desclinks = desc_soup.findAll('a')
comic_desc = drophtml(descchunk)
desdeck +=1
except:
comic_desc = 'None'
#sometimes the deck has volume labels
try:
deckchunk = dm.getElementsByTagName('deck')[0].firstChild.wholeText
comic_deck = deckchunk
desdeck +=1
except:
comic_deck = 'None'
#comic['ComicDescription'] = comic_desc
try:
tempseries['Aliases'] = dm.getElementsByTagName('aliases')[0].firstChild.wholeText
tempseries['Aliases'] = re.sub('\n', '##', tempseries['Aliases']).strip()
if tempseries['Aliases'][-2:] == '##':
tempseries['Aliases'] = tempseries['Aliases'][:-2]
#logger.fdebug('Aliases: ' + str(aliases))
except:
tempseries['Aliases'] = 'None'
tempseries['Volume'] = 'None' #noversion'
#figure out if it's a print / digital edition.
tempseries['Type'] = 'None'
if comic_deck != 'None':
if any(['print' in comic_deck.lower(), 'digital' in comic_deck.lower(), 'paperback' in comic_deck.lower(), 'one shot' in re.sub('-', '', comic_deck.lower()).strip(), 'hardcover' in comic_deck.lower()]):
if 'print' in comic_deck.lower():
tempseries['Type'] = 'Print'
elif 'digital' in comic_deck.lower():
tempseries['Type'] = 'Digital'
elif 'paperback' in comic_deck.lower():
tempseries['Type'] = 'TPB'
elif 'hardcover' in comic_deck.lower():
tempseries['Type'] = 'HC'
elif 'oneshot' in re.sub('-', '', comic_deck.lower()).strip():
tempseries['Type'] = 'One-Shot'
if comic_desc != 'None' and tempseries['Type'] == 'None':
if 'print' in comic_desc[:60].lower() and 'print edition can be found' not in comic_desc.lower():
tempseries['Type'] = 'Print'
elif 'digital' in comic_desc[:60].lower() and 'digital edition can be found' not in comic_desc.lower():
tempseries['Type'] = 'Digital'
elif all(['paperback' in comic_desc[:60].lower(), 'paperback can be found' not in comic_desc.lower()]) or 'collects' in comic_desc[:60].lower():
tempseries['Type'] = 'TPB'
elif 'hardcover' in comic_desc[:60].lower() and 'hardcover can be found' not in comic_desc.lower():
tempseries['Type'] = 'HC'
elif any(['one-shot' in comic_desc[:60].lower(), 'one shot' in comic_desc[:60].lower()]) and any(['can be found' not in comic_desc.lower(), 'following the' not in comic_desc.lower()]):
i = 0
tempseries['Type'] = 'One-Shot'
avoidwords = ['preceding', 'after the special', 'following the']
while i < 2:
if i == 0:
cbd = 'one-shot'
elif i == 1:
cbd = 'one shot'
tmp1 = comic_desc[:60].lower().find(cbd)
if tmp1 != -1:
for x in avoidwords:
tmp2 = comic_desc[:tmp1].lower().find(x)
if tmp2 != -1:
logger.fdebug('FAKE NEWS: caught incorrect reference to one-shot. Forcing to Print')
tempseries['Type'] = 'Print'
i = 3
break
i+=1
else:
tempseries['Type'] = 'Print'
if all([comic_desc != 'None', 'trade paperback' in comic_desc[:30].lower(), 'collecting' in comic_desc[:40].lower()]):
#ie. Trade paperback collecting Marvel Team-Up #9-11, 48-51, 72, 110 & 145.
first_collect = comic_desc.lower().find('collecting')
#logger.info('first_collect: %s' % first_collect)
#logger.info('comic_desc: %s' % comic_desc)
#logger.info('desclinks: %s' % desclinks)
issue_list = []
micdrop = []
if desc_soup is not None:
#if it's point form bullets, ignore it cause it's not the current volume stuff.
test_it = desc_soup.find('ul')
if test_it:
for x in test_it.findAll('li'):
if any(['Next' in x.findNext(text=True), 'Previous' in x.findNext(text=True)]):
mic_check = x.find('a')
micdrop.append(mic_check['data-ref-id'])
for fc in desclinks:
#logger.info('fc: %s' % fc)
fc_id = fc['data-ref-id']
#logger.info('fc_id: %s' % fc_id)
if fc_id in micdrop:
continue
fc_name = fc.findNext(text=True)
if fc_id.startswith('4000'):
fc_cid = None
fc_isid = fc_id
iss_start = fc_name.find('#')
issuerun = fc_name[iss_start:].strip()
fc_name = fc_name[:iss_start].strip()
elif fc_id.startswith('4050'):
fc_cid = fc_id
fc_isid = None
issuerun = fc.next_sibling
if issuerun is not None:
lines = re.sub("[^0-9]", ' ', issuerun).strip().split(' ')
if len(lines) > 0:
for x in sorted(lines, reverse=True):
srchline = issuerun.rfind(x)
if srchline != -1:
try:
if issuerun[srchline+len(x)] == ',' or issuerun[srchline+len(x)] == '.' or issuerun[srchline+len(x)] == ' ':
issuerun = issuerun[:srchline+len(x)]
break
except Exception as e:
logger.warn('[ERROR] %s' % e)
continue
else:
iss_start = fc_name.find('#')
issuerun = fc_name[iss_start:].strip()
fc_name = fc_name[:iss_start].strip()
if issuerun.endswith('.') or issuerun.endswith(','):
#logger.fdebug('Changed issuerun from %s to %s' % (issuerun, issuerun[:-1]))
issuerun = issuerun[:-1]
if issuerun.endswith(' and '):
issuerun = issuerun[:-4].strip()
elif issuerun.endswith(' and'):
issuerun = issuerun[:-3].strip()
else:
continue
# except:
# pass
issue_list.append({'series': fc_name,
'comicid': fc_cid,
'issueid': fc_isid,
'issues': issuerun})
#first_collect = cis
logger.info('Collected issues in volume: %s' % issue_list)
tempseries['Issue_List'] = issue_list
else:
tempseries['Issue_List'] = 'None'
while (desdeck > 0):
if desdeck == 1:
if comic_desc == 'None':
comicDes = comic_deck[:30]
else:
#extract the first 60 characters
comicDes = comic_desc[:60].replace('New 52', '')
elif desdeck == 2:
#extract the characters from the deck
comicDes = comic_deck[:30].replace('New 52', '')
else:
break
i = 0
while (i < 2):
if 'volume' in comicDes.lower():
#found volume - let's grab it.
v_find = comicDes.lower().find('volume')
#arbitrarily grab the next 10 chars (6 for volume + 1 for space + 3 for the actual vol #)
#increased to 10 to allow for text numbering (+5 max)
#sometimes it's volume 5 and ocassionally it's fifth volume.
if i == 0:
vfind = comicDes[v_find:v_find +15] #if it's volume 5 format
basenums = {'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4', 'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9', 'ten': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
logger.fdebug('volume X format - %s: %s' % (i, vfind))
else:
vfind = comicDes[:v_find] # if it's fifth volume format
basenums = {'zero': '0', 'first': '1', 'second': '2', 'third': '3', 'fourth': '4', 'fifth': '5', 'sixth': '6', 'seventh': '7', 'eighth': '8', 'nineth': '9', 'tenth': '10', 'i': '1', 'ii': '2', 'iii': '3', 'iv': '4', 'v': '5'}
logger.fdebug('X volume format - %s: %s' % (i, vfind))
volconv = ''
for nums in basenums:
if nums in vfind.lower():
sconv = basenums[nums]
vfind = re.sub(nums, sconv, vfind.lower())
break
#logger.info('volconv: ' + str(volconv))
#now we attempt to find the character position after the word 'volume'
if i == 0:
volthis = vfind.lower().find('volume')
volthis = volthis + 6 # add on the actual word to the position so that we can grab the subsequent digit
vfind = vfind[volthis:volthis + 4] # grab the next 4 characters ;)
elif i == 1:
volthis = vfind.lower().find('volume')
vfind = vfind[volthis - 4:volthis] # grab the next 4 characters ;)
if '(' in vfind:
#bracket detected in versioning'
vfindit = re.findall('[^()]+', vfind)
vfind = vfindit[0]
vf = re.findall('[^<>]+', vfind)
try:
ledigit = re.sub("[^0-9]", "", vf[0])
if ledigit != '':
tempseries['Volume'] = ledigit
logger.fdebug("Volume information found! Adding to series record : volume %s" % tempseries['Volume'])
break
except:
pass
i += 1
else:
i += 1
if tempseries['Volume'] == 'None':
logger.fdebug('tempseries[Volume]: %s' % tempseries['Volume'])
desdeck -= 1
else:
break
serieslist.append({"ComicID": tempseries['ComicID'],
"ComicName": tempseries['Series'],
"SeriesYear": tempseries['SeriesYear'],
"Publisher": tempseries['Publisher'],
"Volume": tempseries['Volume'],
"Aliases": tempseries['Aliases'],
"Type": tempseries['Type']})
return serieslist
def UpdateDates(dom):
    """Parse the <issue> nodes of a ComicVine XML response into a list of
    dicts used by Mylar's date-refresh logic.

    dom -- an xml.dom.minidom Document containing <issue> elements.

    Returns a list of dicts with keys ComicID, IssueID, SeriesTitle,
    IssueTitle, CoverDate, StoreDate, IssueNumber, Date_Last_Updated.
    Missing dates default to '0000-00-00'; missing ids/titles to the
    string 'None' (matching the historical behaviour of this module).
    """
    issuelist = []
    for dm in dom.getElementsByTagName('issue'):
        # Fresh dict per issue so values can never bleed between iterations
        # (previously one dict was reused for the whole loop).
        tempissue = {'ComicID': 'None',
                     'IssueID': 'None',
                     'SeriesTitle': 'None',
                     'IssueTitle': 'None'}
        try:
            # An <id> nested under <volume> is the series id; an <id>
            # directly under <issue> is the issue id. Iterate the node
            # list once instead of re-querying the DOM per index.
            for idnode in dm.getElementsByTagName('id'):
                if idnode.parentNode.nodeName == 'volume':
                    tempissue['ComicID'] = idnode.firstChild.wholeText
                elif idnode.parentNode.nodeName == 'issue':
                    tempissue['IssueID'] = idnode.firstChild.wholeText
        except Exception:
            logger.warn('There was a problem retrieving a comicid/issueid for the given issue. This will have to manually corrected most likely.')
        try:
            # <name> parent distinguishes the issue title from the series title.
            for namenode in dm.getElementsByTagName('name'):
                if namenode.parentNode.nodeName == 'issue':
                    tempissue['IssueTitle'] = namenode.firstChild.wholeText
                elif namenode.parentNode.nodeName == 'volume':
                    tempissue['SeriesTitle'] = namenode.firstChild.wholeText
        except Exception:
            logger.warn('There was a problem retrieving the Series Title / Issue Title for a series within the arc. This will have to manually corrected.')
        # IndexError: the tag is absent; AttributeError: tag present but empty.
        try:
            tempissue['CoverDate'] = dm.getElementsByTagName('cover_date')[0].firstChild.wholeText
        except (IndexError, AttributeError):
            tempissue['CoverDate'] = '0000-00-00'
        try:
            tempissue['StoreDate'] = dm.getElementsByTagName('store_date')[0].firstChild.wholeText
        except (IndexError, AttributeError):
            tempissue['StoreDate'] = '0000-00-00'
        try:
            tempissue['IssueNumber'] = dm.getElementsByTagName('issue_number')[0].firstChild.wholeText
        except (IndexError, AttributeError):
            logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')
            tempissue['IssueNumber'] = 'None'
        try:
            date_last_updated = dm.getElementsByTagName('date_last_updated')[0].firstChild.wholeText
        except (IndexError, AttributeError):
            date_last_updated = '0000-00-00'
        issuelist.append({'ComicID': tempissue['ComicID'],
                          'IssueID': tempissue['IssueID'],
                          'SeriesTitle': tempissue['SeriesTitle'],
                          'IssueTitle': tempissue['IssueTitle'],
                          'CoverDate': tempissue['CoverDate'],
                          'StoreDate': tempissue['StoreDate'],
                          'IssueNumber': tempissue['IssueNumber'],
                          'Date_Last_Updated': date_last_updated})
    return issuelist
def GetImportList(results):
    """Convert the <issue> nodes of a ComicVine query result into a list of
    dicts consumed by the import/matching process.

    results -- an xml.dom.minidom Document of a ComicVine issue query.

    Returns a list of dicts with keys ComicID, IssueID, ComicName,
    Issue_Name, Issue_Number.
    """
    serieslist = []
    for implist in results.getElementsByTagName('issue'):
        # Seed every key up-front. The previous version reused one dict
        # across the whole loop, so a missing tag either leaked the prior
        # issue's value into this record or raised a KeyError on the
        # append below (e.g. the first element lacking <issue_number>).
        tempseries = {'ComicID': None,
                      'IssueID': None,
                      'ComicName': 'None',
                      'Issue_Name': None,
                      'Issue_Number': 'None'}
        try:
            # An <id> under <volume> is the series id; one directly under
            # <issue> is the issue id. Iterate the node list once rather
            # than re-querying the DOM per index.
            for idnode in implist.getElementsByTagName('id'):
                if idnode.parentNode.nodeName == 'volume':
                    tempseries['ComicID'] = idnode.firstChild.wholeText
                elif idnode.parentNode.nodeName == 'issue':
                    tempseries['IssueID'] = idnode.firstChild.wholeText
        except Exception:
            tempseries['ComicID'] = None
        try:
            # <name> parent distinguishes the series name from the issue name.
            for namenode in implist.getElementsByTagName('name'):
                if namenode.parentNode.nodeName == 'volume':
                    tempseries['ComicName'] = namenode.firstChild.wholeText
                elif namenode.parentNode.nodeName == 'issue':
                    try:
                        tempseries['Issue_Name'] = namenode.firstChild.wholeText
                    except AttributeError:
                        # <name> present but empty (no text child)
                        tempseries['Issue_Name'] = None
        except Exception:
            tempseries['ComicName'] = 'None'
        try:
            tempseries['Issue_Number'] = implist.getElementsByTagName('issue_number')[0].firstChild.wholeText
        except (IndexError, AttributeError):
            logger.fdebug('No Issue Number available - Trade Paperbacks, Graphic Novels and Compendiums are not supported as of yet.')
        logger.info('tempseries:' + str(tempseries))
        serieslist.append({"ComicID": tempseries['ComicID'],
                           "IssueID": tempseries['IssueID'],
                           "ComicName": tempseries['ComicName'],
                           "Issue_Name": tempseries['Issue_Name'],
                           "Issue_Number": tempseries['Issue_Number']})
    return serieslist
def drophtml(html):
    """Strip all HTML markup from the given string, returning only the
    concatenated text content of the document."""
    parsed = Soup(html, "html.parser")
    return ''.join(parsed.findAll(text=True))