from bs4 import BeautifulSoup
import urllib2
import csv
import sys
import re
import os
import sqlite3

import mylar
from mylar import logger


def solicit(month, year):
    # convert to numerics just to ensure this...
    month = int(month)
    year = int(year)

    # In order to gather ALL upcoming solicitations, loop through the months
    # going ahead one at a time until we get no results, then break.
    # (Usually not more than 3 months in advance is available.)
    mnloop = 0
    upcoming = []

    publishers = {'DC Comics': 'DC Comics',
                  'Marvel': 'Marvel Comics',
                  'Image': 'Image Comics',
                  'IDW': 'IDW Publishing',
                  'Dark Horse': 'Dark Horse Comics'}

    while mnloop < 5:
        # zero-pad the month so both the tag URL and the shipdate are consistent.
        # (previously month_string was only set in the 2014 branch, which made the
        # shipdate below a NameError for other years.)
        if len(str(month)) == 1:
            month_string = '0' + str(month)
        else:
            month_string = str(month)

        # CBR changed its tag format for 2014: YYYYMM instead of MMYYYY.
        if year == 2014:
            datestring = str(year) + month_string
        else:
            datestring = str(month) + str(year)

        pagelinks = "http://www.comicbookresources.com/tag/solicits" + datestring
        logger.info('datestring:' + datestring)
        logger.info('checking:' + pagelinks)

        pageresponse = urllib2.urlopen(pagelinks)
        soup = BeautifulSoup(pageresponse)
        cntlinks = soup.findAll('h3')
        lenlinks = len(cntlinks)
        logger.info(str(lenlinks) + ' results')

        publish = []
        resultURL = []
        x = 0
        cnt = 0

        while x < lenlinks:
            headt = cntlinks[x]   # iterate through the h3 tags pulling out only results.
            if "/?page=article&id=" in str(headt):
                headName = headt.findNext(text=True)
                # Note: the original guard ('Marvel' and 'DC' and 'Image' not in headName)
                # short-circuited to just 'Image' not in headName, which wrongly skipped
                # Image Comics; the publisher match below already restricts the results.
                if 'Solicitations' in headName or 'Solicits' in headName:
                    pubstart = headName.find('Solicitations')
                    if pubstart == -1:
                        pubstart = headName.find('Solicits')
                    for pub in publishers:
                        if pub in headName[:pubstart]:
                            abc = headt.findAll('a', href=True)[0]
                            ID_som = abc['href']   # first instance will have the right link...
                            # append both together so publish/resultURL stay index-aligned.
                            publish.append(publishers[pub])
                            resultURL.append(ID_som)
                            cnt += 1
                            break
            x += 1

        if cnt == 0:
            break   # no results means we're done - end it.

        # this loops through each 'found' solicit page.
        shipdate = month_string + '-' + str(year)
        loopthis = cnt - 1
        while loopthis >= 0:
            upcoming += populate(resultURL[loopthis], publish[loopthis], shipdate)
            loopthis -= 1

        month += 1    # increment month by 1
        mnloop += 1   # increment loop by 1

        if month > 12:   # failsafe rollover for months
            month = 1
            year += 1

    logger.info(str(len(upcoming)) + ' upcoming issues discovered.')

    # write the results out as a tab-delimited cache file.
    newfl = mylar.CACHE_DIR + "/future-releases.txt"
    newtxtfile = open(newfl, 'wb')

    cntr = 1
    for row in upcoming:
        if row['Extra'] is None or row['Extra'] == '':
            extrarow = 'N/A'
        else:
            extrarow = row['Extra']
        newtxtfile.write(str(row['Shipdate']) + '\t' + str(row['Publisher']) + '\t' + str(row['Issue']) + '\t' + str(row['Comic']) + '\t' + str(extrarow) + '\tSkipped' + '\t' + str(cntr) + '\n')
        cntr += 1

    newtxtfile.close()

    logger.fdebug('attempting to populate future upcoming...')

    # rebuild the future table from the cache file just written.
    mylardb = os.path.join(mylar.DATA_DIR, "mylar.db")
    connection = sqlite3.connect(str(mylardb))
    cursor = connection.cursor()

    cursor.executescript('drop table if exists future;')
    cursor.execute("CREATE TABLE IF NOT EXISTS future (SHIPDATE, PUBLISHER text, ISSUE text, COMIC VARCHAR(150), EXTRA text, STATUS text, FutureID text, ComicID text);")
    connection.commit()

    csvfile = open(newfl, "rb")
    creader = csv.reader(csvfile, delimiter='\t')

    t = 0
    for row in creader:
        try:
            cursor.execute("INSERT INTO future VALUES (?,?,?,?,?,?,?,null);", row)
            t += 1
        except Exception as e:
            logger.fdebug("Error - invalid arguments... skipping: " + str(e))

    logger.fdebug('successfully added ' + str(t) + ' issues to future upcoming table.')

    csvfile.close()
    connection.commit()
    connection.close()

    mylar.weeklypull.pullitcheck(futurepull="yes")
    #.end
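
# The title parsing in populate() below is easiest to follow on a concrete
# example. This is an illustrative sketch only - not part of the original
# module, and the helper name _example_split_title is invented here - it just
# reproduces the core slicing populate() performs around the first '#'.
def _example_split_title(tempName):
    stissue = tempName.find('#')
    endissue = tempName.find(' ', stissue)
    if endissue == -1:
        endissue = len(tempName)
    comic = tempName[:stissue].strip()         # everything before '#' is the series name
    issue = tempName[stissue:endissue]         # the '#nn' token itself
    exinfo = tempName[endissue:].lstrip(' ')   # trailing text such as '(OF 12)'
    return comic, issue, exinfo

# e.g. _example_split_title('BATMAN #28 (OF 12)') == ('BATMAN', '#28', '(OF 12)')
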
def populate(link, publisher, shipdate):
    # this is the secondary url call to populate the individual issues.
    pageurl = 'http://www.comicbookresources.com/' + link
    response = urllib2.urlopen(pageurl)
    soup = BeautifulSoup(response)
    abc = soup.findAll('p')
    lenabc = len(abc)
    i = 0
    upcome = []
    get_next = False
    prev_chk = False

    while i < lenabc:
        titlet = abc[i]   # iterate through the p tags pulling out only results.
        if "/prev_img.php?pid" in str(titlet):
            # solicits in 03-2014 have separated <p> tags, so we need to take
            # the subsequent <p>, not the initial one.
            get_next = True
            i += 1
            continue
        elif "/news/preview2.php" in str(titlet):
            prev_chk = True
            get_next = False
        elif get_next:
            prev_chk = True
        else:
            prev_chk = False
            get_next = False

        if prev_chk:
            tempName = titlet.findNext(text=True)
            # skip collected editions, variants and other non-issue solicits.
            if ' TPB' not in tempName and ' HC' not in tempName and 'GN-TPB' not in tempName and 'for $1' not in tempName.lower() and 'subscription variant' not in tempName.lower() and 'poster' not in tempName.lower():
                if '#' in tempName:
                    tempName = tempName.encode('ascii', 'replace')
                    if '???' in tempName:
                        tempName = tempName.replace('???', ' ')
                    stissue = tempName.find('#')
                    endissue = tempName.find(' ', stissue)
                    if stissue + 1 < len(tempName) and tempName[stissue + 1] == ' ':
                        # if the issue has a space between # and number, adjust.
                        endissue = tempName.find(' ', stissue + 2)
                    if endissue == -1:
                        endissue = len(tempName)
                    issue = tempName[stissue:endissue].lstrip(' ')
                    if ':' in issue:
                        issue = re.sub(':', '', issue).rstrip()
                    exinfo = tempName[endissue:].lstrip(' ')

                    issue1 = None
                    issue2 = None

                    if '-' in issue:
                        # multiple issues detected (e.g. '#1-2'). Split them.
                        ststart = issue.find('-')
                        issue1 = issue[:ststart]
                        issue2 = '#' + str(issue[ststart + 1:])

                    if '&' in exinfo:
                        # multiple issues detected (e.g. '#1 & 2'). Split them.
                        ststart = exinfo.find('&')
                        issue1 = issue   # this detects fine
                        issue2 = '#' + str(exinfo[ststart + 1:])
                        if '& ' in issue2:
                            issue2 = re.sub(r"&\b", "", issue2)
                        exinfo = exinfo.replace(exinfo[ststart + 1:len(issue2)], '').strip()
                        if exinfo == '&':
                            exinfo = 'N/A'

                    comic = tempName[:stissue].strip()

                    # note: the original tested 'for \$1', a literal backslash-dollar
                    # that can never occur in plain text; the intent was 'for $1'.
                    if 'for $1' in comic:
                        exinfo = 'for $1'
                        comic = comic.replace('for $1:', '').lstrip()

                    if issue1:
                        upcome.append({
                            'Shipdate': shipdate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue1).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        if issue2:
                            upcome.append({
                                'Shipdate': shipdate,
                                'Publisher': publisher.upper(),
                                'Issue': re.sub('#', '', issue2).lstrip(),
                                'Comic': comic.upper(),
                                'Extra': exinfo.upper()
                            })
                    else:
                        upcome.append({
                            'Shipdate': shipdate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                else:
                    pass   # no issue # to retrieve.
        i += 1

    return upcome
    #end.


if __name__ == '__main__':
    solicit(sys.argv[1], sys.argv[2])
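
# Example invocation (assumes a configured mylar install, i.e. CACHE_DIR and
# DATA_DIR are set and mylar.db exists; the month/year values are illustrative):
#   python solicit.py 3 2014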