# mirror of https://github.com/evilhero/mylar
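"""Scrape upcoming comic solicitations from comicbookresources.com and stage
them for Mylar's future-release ('futurepull') list."""
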
from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import csv
import fileinput
import sys
import re
import os
import sqlite3
import datetime
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime

import mylar
from mylar import logger

def solicit(month, year):
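    """Scrape the CBR solicitation tag pages for the given month/year plus up
    to four following months, write the hits to future-releases.txt in the
    cache directory, load them into the `future` table of mylar.db, and then
    kick off a futurepull check."""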
    # convert to numerics just to be sure...
    month = int(month)
    year = int(year)

    #print ( "month: " + str(month) )
    #print ( "year: " + str(year) )

    # to gather ALL upcoming issues, loop through the months one at a time
    # until a month returns no results, then break. (Usually no more than
    # 3 months in advance is available.)
    mnloop = 0
    upcoming = []

    while (mnloop < 5):

        pagelinks = "http://www.comicbookresources.com/tag/solicits" + str(month) + str(year)
        pageresponse = urllib2.urlopen(pagelinks)
        soup = BeautifulSoup(pageresponse)
        cntlinks = soup.findAll('h3')
        lenlinks = len(cntlinks)
        logger.info(str(lenlinks) + ' results')

        publish = []
        resultURL = []

        x = 0
        cnt = 0

        while (x < lenlinks):
            headt = cntlinks[x]  # iterate through the h3 tags, pulling out only results.
            if "/?page=article&id=" in str(headt):
                #print ("titlet: " + str(headt))
                headName = headt.findNext(text=True)
                # note: the chained test ('Marvel' and 'DC' and 'Image' not in headName)
                # only ever evaluated 'Image' not in headName; check each publisher explicitly.
                if 'Marvel' not in headName and 'DC' not in headName and 'Image' not in headName and 'Solicitations' in headName:
                    pubstart = headName.find('Solicitations')
                    publish.append(headName[:pubstart].strip())
                    abc = headt.findAll('a', href=True)[0]
                    ID_som = abc['href']  # first instance will have the right link...
                    resultURL.append(ID_som)
                    #print '[ ' + publish[cnt] + '] Link URL: ' + resultURL[cnt]
                    cnt += 1
            x += 1
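
        # publish[] and resultURL[] are parallel lists: publish[n] is the
        # publisher whose solicitation article lives at resultURL[n].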

        #print 'cnt:' + str(cnt)

        if cnt == 0:
            break  # no results means we're done

        loopthis = (cnt - 1)
        # this loops through each 'found' solicit page
        shipdate = str(month) + '-' + str(year)
        while (loopthis >= 0):
            upcoming += populate(resultURL[loopthis], publish[loopthis], shipdate)
            loopthis -= 1

        month += 1   # increment month by 1
        mnloop += 1  # increment loop by 1

        if month > 12:  # failsafe rollover for months
            month = 1
            year += 1

    #print upcoming
    logger.info(str(len(upcoming)) + ' upcoming issues discovered.')

    newfl = mylar.CACHE_DIR + "/future-releases.txt"
    newtxtfile = open(newfl, 'wb')

    cntr = 1
    for row in upcoming:
        if row['Extra'] is None or row['Extra'] == '':
            extrarow = 'N/A'
        else:
            extrarow = row['Extra']
        newtxtfile.write(str(row['Shipdate']) + '\t' + str(row['Publisher']) + '\t' + str(row['Issue']) + '\t' + str(row['Comic']) + '\t' + str(extrarow) + '\tSkipped' + '\t' + str(cntr) + '\n')
        cntr += 1

    newtxtfile.close()
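
    # future-releases.txt now holds one tab-separated record per issue:
    #   Shipdate  Publisher  Issue  Comic  Extra  Status ('Skipped')  FutureID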

    logger.fdebug('attempting to populate future upcoming...')

    mylardb = os.path.join(mylar.DATA_DIR, "mylar.db")

    connection = sqlite3.connect(str(mylardb))
    cursor = connection.cursor()

    cursor.executescript('drop table if exists future;')

    cursor.execute("CREATE TABLE IF NOT EXISTS future (SHIPDATE text, PUBLISHER text, ISSUE text, COMIC VARCHAR(150), EXTRA text, STATUS text, FutureID text, ComicID text);")
    connection.commit()

    csvfile = open(newfl, "rb")
    creader = csv.reader(csvfile, delimiter='\t')

    t = 0  # count of rows actually inserted

    for row in creader:
        try:
            #print ("Row: %s" % row)
            cursor.execute("INSERT INTO future VALUES (?,?,?,?,?,?,?,null);", row)
            t += 1
        except Exception, e:
            logger.fdebug("Error - invalid arguments... skipping")
            pass
    logger.fdebug('successfully added ' + str(t) + ' issues to future upcoming table.')
    csvfile.close()
    connection.commit()
    connection.close()

    mylar.weeklypull.pullitcheck(futurepull="yes")
#.end


def populate(link, publisher, shipdate):
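    """Scrape one solicitation article on CBR and return a list of dicts
    (Shipdate/Publisher/Issue/Comic/Extra), one entry per issue found."""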
    # this is the secondary url call to populate
    pagelink = 'http://www.comicbookresources.com/' + link
    response = urllib2.urlopen(pagelink)
    soup = BeautifulSoup(response)
    abc = soup.findAll('p')
    lenabc = len(abc)
    i = 0
    resultName = []
    resultID = []
    resultURL = []
    matched = "no"
    upcome = []

    while (i < lenabc):
        titlet = abc[i]  # iterate through the p tags, pulling out only results.
        #print ("titlet: " + str(titlet))
        if "/news/preview2.php" in str(titlet):
            tempName = titlet.findNext(text=True)
            # skip collected editions (TPB/HC/GN-TPB) and subscription variants
            if ' TPB' not in tempName and ' HC' not in tempName and 'GN-TPB' not in tempName and 'subscription variant' not in tempName.lower():
                #print publisher + ' found upcoming'
                if '#' in tempName:
                    tempName = tempName.encode('ascii', 'replace')  #.decode('utf-8')
                    if '???' in tempName:
                        tempName = tempName.replace('???', ' ')
                    stissue = tempName.find('#')
                    endissue = tempName.find(' ', stissue)
                    # if the issue has a space between # and number, adjust (guard against '#' being the last char).
                    if stissue + 1 < len(tempName) and tempName[stissue + 1] == ' ':
                        endissue = tempName.find(' ', stissue + 2)
                    if endissue == -1: endissue = len(tempName)
                    issue = tempName[stissue:endissue].lstrip(' ')
                    if ':' in issue: issue = re.sub(':', '', issue).rstrip()
                    exinfo = tempName[endissue:].lstrip(' ')
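
                    # illustrative (hypothetical) parse: tempName = 'BATMAN #23 variant cover'
                    # yields issue = '#23', exinfo = 'variant cover', and later
                    # comic = 'BATMAN' from the text before the '#'.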

                    issue1 = None
                    issue2 = None

                    if '-' in issue:
                        #print ('multiple issues detected. Splitting.')
                        ststart = issue.find('-')
                        issue1 = issue[:ststart]
                        issue2 = '#' + str(issue[ststart + 1:])
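
                    # e.g. a hypothetical issue string '#1-3' splits into
                    # issue1 = '#1' and issue2 = '#3'.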

                    if '&' in exinfo:
                        #print ('multiple issues detected. Splitting.')
                        ststart = exinfo.find('&')
                        issue1 = issue  # this detects fine
                        issue2 = '#' + str(exinfo[ststart + 1:])
                        if '& ' in issue2: issue2 = re.sub("&\\b", "", issue2)
                        exinfo = exinfo.replace(exinfo[ststart + 1:len(issue2)], '').strip()
                        if exinfo == '&': exinfo = 'N/A'

                    comic = tempName[:stissue].strip()
                    # the page text is plain 'for $1'; str 'in' and replace() do not
                    # use regex, so match the literal text rather than an escaped pattern.
                    if 'for $1' in comic:
                        exinfo = 'for $1'
                        comic = comic.replace('for $1:', '').lstrip()

                    if issue1:
                        upcome.append({
                            'Shipdate': shipdate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue1).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue1))
                        #print ('extra info: ' + exinfo)
                    if issue2:
                        upcome.append({
                            'Shipdate': shipdate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue2).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue2))
                        #print ('extra info: ' + exinfo)
                    else:
                        upcome.append({
                            'Shipdate': shipdate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue))
                        #print ('extra info: ' + exinfo)
                else:
                    print ('no issue # to retrieve.')
        i += 1
    return upcome
#end.


if __name__ == '__main__':
    solicit(sys.argv[1], sys.argv[2])
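
# Example invocation (standalone; assumes this file is saved as solicit.py
# and that mylar's config/cache dirs are initialized):
#   python solicit.py 7 2013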