mirror of https://github.com/evilhero/mylar
262 lines
9.9 KiB
Python
Executable File
262 lines
9.9 KiB
Python
Executable File
|
|
from bs4 import BeautifulSoup, UnicodeDammit
|
|
import urllib2
|
|
import csv
|
|
import fileinput
|
|
import sys
|
|
import re
|
|
import os
|
|
import sqlite3
|
|
import datetime
|
|
import unicodedata
|
|
from decimal import Decimal
|
|
from HTMLParser import HTMLParseError
|
|
from time import strptime
|
|
|
|
import mylar
|
|
from mylar import logger
|
|
|
|
def solicit(month, year):
|
|
#convert to numerics just to ensure this...
|
|
month = int(month)
|
|
year = int(year)
|
|
|
|
#print ( "month: " + str(month) )
|
|
#print ( "year: " + str(year) )
|
|
|
|
# in order to gather ALL upcoming - let's start to loop through months going ahead one at a time
|
|
# until we get a null then break. (Usually not more than 3 months in advance is available)
|
|
mnloop = 0
|
|
upcoming = []
|
|
|
|
publishers = {'DC Comics':'DC Comics', 'Marvel':'Marvel Comics', 'Image':'Image Comics', 'IDW':'IDW Publishing', 'Dark Horse':'Dark Horse Comics'}
|
|
|
|
while (mnloop < 5):
|
|
if year == 2014:
|
|
if len(str(month)) == 1:
|
|
month_string = '0' + str(month)
|
|
else:
|
|
month_string = str(month)
|
|
datestring = str(year) + str(month_string)
|
|
else:
|
|
datestring = str(month) + str(year)
|
|
pagelinks = "http://www.comicbookresources.com/tag/solicits" + str(datestring)
|
|
logger.info('datestring:' + datestring)
|
|
logger.info('checking:' + pagelinks)
|
|
pageresponse = urllib2.urlopen ( pagelinks )
|
|
soup = BeautifulSoup (pageresponse)
|
|
cntlinks = soup.findAll('h3')
|
|
lenlinks = len(cntlinks)
|
|
logger.info( str(lenlinks) + ' results' )
|
|
|
|
publish = []
|
|
resultURL = []
|
|
|
|
x = 0
|
|
cnt = 0
|
|
|
|
while (x < lenlinks):
|
|
headt = cntlinks[x] #iterate through the hrefs pulling out only results.
|
|
if "/?page=article&id=" in str(headt):
|
|
#print ("titlet: " + str(headt))
|
|
headName = headt.findNext(text=True)
|
|
if ('Marvel' and 'DC' and 'Image' not in headName) and ('Solicitations' in headName or 'Solicits' in headName):
|
|
pubstart = headName.find('Solicitations')
|
|
for pub in publishers:
|
|
if pub in headName[:pubstart]:
|
|
publish.append(publishers[pub])
|
|
#publish.append( headName[:pubstart].strip() )
|
|
abc = headt.findAll('a', href=True)[0]
|
|
ID_som = abc['href'] #first instance will have the right link...
|
|
resultURL.append( ID_som )
|
|
#print '[ ' + publish[cnt] + '] Link URL: ' + resultURL[cnt]
|
|
cnt+=1
|
|
x+=1
|
|
|
|
#print 'cnt:' + str(cnt)
|
|
|
|
if cnt == 0:
|
|
break # no results means, end it
|
|
|
|
loopthis = (cnt-1)
|
|
#this loops through each 'found' solicit page
|
|
shipdate = str(month_string) + '-' + str(year)
|
|
while ( loopthis >= 0 ):
|
|
upcoming += populate(resultURL[loopthis], publish[loopthis], shipdate)
|
|
loopthis -=1
|
|
|
|
month +=1 #increment month by 1
|
|
mnloop +=1 #increment loop by 1
|
|
|
|
if month > 12: #failsafe failover for months
|
|
month = 1
|
|
year+=1
|
|
|
|
#print upcoming
|
|
logger.info( str(len(upcoming)) + ' upcoming issues discovered.' )
|
|
|
|
newfl = mylar.CACHE_DIR + "/future-releases.txt"
|
|
newtxtfile = open(newfl, 'wb')
|
|
|
|
cntr = 1
|
|
for row in upcoming:
|
|
if row['Extra'] is None or row['Extra'] == '':
|
|
extrarow = 'N/A'
|
|
else:
|
|
extrarow = row['Extra']
|
|
newtxtfile.write(str(row['Shipdate']) + '\t' + str(row['Publisher']) + '\t' + str(row['Issue']) + '\t' + str(row['Comic']) + '\t' + str(extrarow) + '\tSkipped' + '\t' + str(cntr) + '\n')
|
|
cntr +=1
|
|
|
|
newtxtfile.close()
|
|
|
|
|
|
logger.fdebug( 'attempting to populate future upcoming...' )
|
|
|
|
mylardb = os.path.join(mylar.DATA_DIR, "mylar.db")
|
|
|
|
connection = sqlite3.connect(str(mylardb))
|
|
cursor = connection.cursor()
|
|
|
|
cursor.executescript('drop table if exists future;')
|
|
|
|
cursor.execute("CREATE TABLE IF NOT EXISTS future (SHIPDATE, PUBLISHER text, ISSUE text, COMIC VARCHAR(150), EXTRA text, STATUS text, FutureID text, ComicID text);")
|
|
connection.commit()
|
|
|
|
csvfile = open(newfl, "rb")
|
|
creader = csv.reader(csvfile, delimiter='\t')
|
|
|
|
t = 1
|
|
|
|
for row in creader:
|
|
try:
|
|
#print ("Row: %s" % row)
|
|
cursor.execute("INSERT INTO future VALUES (?,?,?,?,?,?,?,null);", row)
|
|
except Exception, e:
|
|
logger.fdebug("Error - invald arguments...-skipping")
|
|
pass
|
|
t+=1
|
|
logger.fdebug('successfully added ' + str(t) + ' issues to future upcoming table.')
|
|
csvfile.close()
|
|
connection.commit()
|
|
connection.close()
|
|
|
|
|
|
mylar.weeklypull.pullitcheck(futurepull="yes")
|
|
#.end
|
|
|
|
def populate(link,publisher,shipdate):
|
|
#this is the secondary url call to populate
|
|
input = 'http://www.comicbookresources.com/' + link
|
|
response = urllib2.urlopen ( input )
|
|
soup = BeautifulSoup (response)
|
|
abc = soup.findAll('p')
|
|
lenabc = len(abc)
|
|
i=0
|
|
resultName = []
|
|
resultID = []
|
|
resultURL = []
|
|
matched = "no"
|
|
upcome = []
|
|
get_next = False
|
|
prev_chk = False
|
|
|
|
while (i < lenabc):
|
|
titlet = abc[i] #iterate through the p pulling out only results.
|
|
#print ("titlet: " + str(titlet))
|
|
if "/prev_img.php?pid" in str(titlet):
|
|
#solicits in 03-2014 have seperated <p> tags, so we need to take the subsequent <p>, not the initial.
|
|
get_next = True
|
|
i+=1
|
|
continue
|
|
elif "/news/preview2.php" in str(titlet):
|
|
prev_chk = True
|
|
get_next = False
|
|
elif get_next == True:
|
|
prev_chk = True
|
|
else:
|
|
prev_chk = False
|
|
get_next = False
|
|
if prev_chk == True:
|
|
tempName = titlet.findNext(text=True)
|
|
if ' TPB' not in tempName and ' HC' not in tempName and 'GN-TPB' not in tempName and 'for $1' not in tempName.lower() and 'subscription variant' not in tempName.lower() and 'poster' not in tempName.lower():
|
|
#print publisher + ' found upcoming'
|
|
if '#' in tempName:
|
|
#tempName = tempName.replace(u'.',u"'")
|
|
tempName = tempName.encode('ascii', 'replace') #.decode('utf-8')
|
|
if '???' in tempName:
|
|
tempName = tempName.replace('???', ' ')
|
|
stissue = tempName.find('#')
|
|
endissue = tempName.find(' ', stissue)
|
|
if tempName[stissue+1] == ' ': #if issue has space between # and number, adjust.
|
|
endissue = tempName.find(' ', stissue+2)
|
|
if endissue == -1: endissue = len(tempName)
|
|
issue = tempName[stissue:endissue].lstrip(' ')
|
|
if ':'in issue: issue = re.sub(':', '', issue).rstrip()
|
|
exinfo = tempName[endissue:].lstrip(' ')
|
|
|
|
issue1 = None
|
|
issue2 = None
|
|
|
|
if '-' in issue:
|
|
#print ('multiple issues detected. Splitting.')
|
|
ststart = issue.find('-')
|
|
issue1 = issue[:ststart]
|
|
issue2 = '#' + str(issue[ststart+1:])
|
|
|
|
if '&' in exinfo:
|
|
#print ('multiple issues detected. Splitting.')
|
|
ststart = exinfo.find('&')
|
|
issue1 = issue # this detects fine
|
|
issue2 = '#' + str(exinfo[ststart+1:])
|
|
if '& ' in issue2: issue2 = re.sub("&\\b", "", issue2)
|
|
exinfo = exinfo.replace(exinfo[ststart+1:len(issue2)], '').strip()
|
|
if exinfo == '&': exinfo = 'N/A'
|
|
|
|
comic = tempName[:stissue].strip()
|
|
if 'for \$1' in comic:
|
|
exinfo = 'for $1'
|
|
comic = comic.replace('for \$1\:', '').lstrip()
|
|
|
|
if issue1:
|
|
upcome.append({
|
|
'Shipdate': shipdate,
|
|
'Publisher': publisher.upper(),
|
|
'Issue': re.sub('#', '',issue1).lstrip(),
|
|
'Comic': comic.upper(),
|
|
'Extra': exinfo.upper()
|
|
})
|
|
#print ('Comic: ' + comic)
|
|
#print('issue#: ' + re.sub('#', '', issue1))
|
|
#print ('extra info: ' + exinfo)
|
|
if issue2:
|
|
upcome.append({
|
|
'Shipdate': shipdate,
|
|
'Publisher': publisher.upper(),
|
|
'Issue': re.sub('#', '', issue2).lstrip(),
|
|
'Comic': comic.upper(),
|
|
'Extra': exinfo.upper()
|
|
})
|
|
#print ('Comic: ' + comic)
|
|
#print('issue#: ' + re.sub('#', '', issue2))
|
|
#print ('extra info: ' + exinfo)
|
|
else:
|
|
upcome.append({
|
|
'Shipdate': shipdate,
|
|
'Publisher': publisher.upper(),
|
|
'Issue': re.sub('#', '', issue).lstrip(),
|
|
'Comic': comic.upper(),
|
|
'Extra': exinfo.upper()
|
|
})
|
|
#print ('Comic: ' + comic)
|
|
#print ('issue#: ' + re.sub('#', '', issue))
|
|
#print ('extra info: ' + exinfo)
|
|
else:
|
|
pass
|
|
#print ('no issue # to retrieve.')
|
|
i+=1
|
|
return upcome
|
|
#end.
|
|
|
|
if __name__ == '__main__':
|
|
solicit(sys.argv[1], sys.argv[2])
|