# mirror of https://github.com/evilhero/mylar

from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import csv
import fileinput
import sys
import re
import os
import sqlite3
import datetime
import unicodedata
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime

import mylar
from mylar import logger, helpers

def solicit(month, year):
    #convert to numerics just to ensure this...
    month = int(month)
    year = int(year)

    #print ("month: " + str(month))
    #print ("year: " + str(year))

    # in order to gather ALL upcoming issues, loop through the months going ahead one at a time
    # until we get a null, then break. (Usually no more than 3 months in advance is available.)
    mnloop = 0
    upcoming = []

    publishers = {'DC Comics': 'DC Comics', 'DC\'s': 'DC Comics', 'Marvel': 'Marvel Comics', 'Image': 'Image Comics', 'IDW': 'IDW Publishing', 'Dark Horse': 'Dark Horse'}

    # -- this is no longer needed (testing)
    # while (mnloop < 5):
    #     if year == 2014:
    #         if len(str(month)) == 1:
    #             month_string = '0' + str(month)
    #         else:
    #             month_string = str(month)
    #         datestring = str(year) + str(month_string)
    #     else:
    #         datestring = str(month) + str(year)
    #
    #     pagelinks = "http://www.comicbookresources.com/tag/solicits" + str(datestring)

    # using the solicits+datestring URL occasionally leaves out some entries;
    # we should use http://www.comicbookresources.com/tag/solicitations instead,
    # then apply the logic below, but rather than a datestring, find the month term
    # and go ahead up to +5 months.

    if month > 0:
        month_start = month
        month_end = month + 5
        #if month_end > 12:
        # ms = 8, me = 13: [(12 - 8) + (13 - 12)] = [4 + 1] = 5
        # [(12 - ms) + (me - 12)] = number of months (5)

        monthlist = []
        mongr = month_start

        #we need to build the months we can grab, but the non-numeric way.
        while (mongr <= month_end):
            mon = mongr
            if mon == 13:
                year += 1   # crossed into the next year
            if mon > 12:
                mon -= 12   # wrap month numbers above 12 back to January onward

            if len(str(mon)) == 1:
                mon = '0' + str(mon)
            else:
                mon = str(mon)

            monthlist.append({"month": helpers.fullmonth(str(mon)).lower(),
                              "num_month": mon,
                              "year": str(year)})
            mongr += 1

        logger.info('months: ' + str(monthlist))
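
        # For illustration (assuming helpers.fullmonth maps '11' to 'November', etc.):
        # solicit(11, 2018) builds a six-month window that wraps into the next year:
        #   [{'month': 'november', 'num_month': '11', 'year': '2018'},
        #    {'month': 'december', 'num_month': '12', 'year': '2018'},
        #    {'month': 'january',  'num_month': '01', 'year': '2019'},
        #     ...
        #    {'month': 'april',    'num_month': '04', 'year': '2019'}]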

        pagelinks = "http://www.comicbookresources.com/tag/solicitations"

        #logger.info('datestring:' + datestring)
        #logger.info('checking:' + pagelinks)
        pageresponse = urllib2.urlopen(pagelinks)
        soup = BeautifulSoup(pageresponse)
        cntlinks = soup.findAll('h3')
        lenlinks = len(cntlinks)
        #logger.info(str(lenlinks) + ' results')

        publish = []
        resultURL = []
        resultmonth = []
        resultyear = []

        x = 0
        cnt = 0

        while (x < lenlinks):
            headt = cntlinks[x]   #iterate through the h3 tags, pulling out only results.
            if "/?page=article&id=" in str(headt):
                #print ("titlet: " + str(headt))
                headName = headt.findNext(text=True)
                #print ('headName: ' + headName)
                if 'Image' in headName:
                    logger.fdebug('IMAGE FOUND')
                #skip only headlines that name Marvel, DC and Image all at once (combined round-ups)
                if not all(['Marvel' in headName, 'DC' in headName, 'Image' in headName]) and ('Solicitations' in headName or 'Solicits' in headName):
                    # test for month here (int(month) + 5)
                    if not any(d.get('month', None) == str(headName).lower() for d in monthlist):
                        for mt in monthlist:
                            if mt['month'] in headName.lower():
                                logger.info('matched on month: ' + str(mt['month']))
                                logger.info('matched on year: ' + str(mt['year']))

                                pubstart = headName.find('Solicitations')
                                publishchk = False
                                for pub in publishers:
                                    if pub in headName[:pubstart]:
                                        #print 'publisher:' + str(publishers[pub])
                                        publish.append(publishers[pub])
                                        publishchk = True
                                        break
                                if publishchk == False:
                                    break
                                #publish.append(headName[:pubstart].strip())

                                #append month/year only after a publisher is confirmed, so publish,
                                #resultURL, resultmonth and resultyear stay index-aligned.
                                resultmonth.append(mt['num_month'])
                                resultyear.append(mt['year'])

                                abc = headt.findAll('a', href=True)[0]
                                ID_som = abc['href']   #first instance will have the right link...
                                resultURL.append(ID_som)
                                #print '(' + str(cnt) + ') [' + publish[cnt] + '] Link URL: ' + resultURL[cnt]
                                cnt += 1
                    else:
                        logger.info('incorrect month - not using.')

            x += 1
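
        # For illustration (hypothetical headline): an <h3> such as
        #   "Marvel Comics' Full November 2018 Solicitations"
        # matches the monthlist entry for november/2018 and the publisher key 'Marvel',
        # so 'Marvel Comics' and the article link land in the result lists.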

        if cnt == 0:
            return   # no results means we're done

        loopthis = (cnt - 1)
        #this loops through each 'found' solicit page
        #shipdate = str(month_string) + '-' + str(year) - not needed.
        while (loopthis >= 0):
            #print 'loopthis is : ' + str(loopthis)
            #print 'resultURL is : ' + str(resultURL[loopthis])
            shipdate = str(resultmonth[loopthis]) + '-' + str(resultyear[loopthis])
            upcoming += populate(resultURL[loopthis], publish[loopthis], shipdate)
            loopthis -= 1

    logger.info(str(len(upcoming)) + ' upcoming issues discovered.')

    newfl = mylar.CACHE_DIR + "/future-releases.txt"
    newtxtfile = open(newfl, 'wb')

    cntr = 1
    for row in upcoming:
        if row['Extra'] is None or row['Extra'] == '':
            extrarow = 'N/A'
        else:
            extrarow = row['Extra']
        #tab-delimited columns: Shipdate, Publisher, Issue, Comic, Extra, Status, FutureID
        newtxtfile.write(str(row['Shipdate']) + '\t' + str(row['Publisher']) + '\t' + str(row['Issue']) + '\t' + str(row['Comic']) + '\t' + str(extrarow) + '\tSkipped' + '\t' + str(cntr) + '\n')
        cntr += 1

    newtxtfile.close()
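
    # For illustration, future-releases.txt now holds one tab-separated row per issue,
    # e.g. (hypothetical values): "11-2018\tMARVEL COMICS\t25\tSOME COMIC\tN/A\tSkipped\t1"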

    logger.fdebug('attempting to populate future upcoming...')

    mylardb = os.path.join(mylar.DATA_DIR, "mylar.db")

    connection = sqlite3.connect(str(mylardb))
    cursor = connection.cursor()

    # we should extract the issues that are being watched but have no data available yet ('Watch For' status).
    # once we get the data: store it, wipe the existing table, retrieve the new data, populate it into
    # the table, recheck the series against the current watchlist, and then restore the 'Watch For' data.
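
    # A minimal sketch of that preserve/restore step (not wired in; it assumes the
    # 'future' table from the previous run still exists at this point):
    #   watch_rows = cursor.execute("SELECT * FROM future WHERE STATUS = 'Watch For'").fetchall()
    #   ...drop, recreate and repopulate the table as below, then:
    #   for w in watch_rows:
    #       cursor.execute("UPDATE future SET STATUS = 'Watch For' WHERE COMIC = ? AND ISSUE = ?",
    #                      (w[3], w[2]))   # COMIC is column 3, ISSUE is column 2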

    cursor.executescript('drop table if exists future;')

    cursor.execute("CREATE TABLE IF NOT EXISTS future (SHIPDATE, PUBLISHER text, ISSUE text, COMIC VARCHAR(150), EXTRA text, STATUS text, FutureID text, ComicID text);")
    connection.commit()

    csvfile = open(newfl, "rb")
    creader = csv.reader(csvfile, delimiter='\t')

    t = 0   # count only the rows that insert successfully
    for row in creader:
        try:
            #print ("Row: %s" % row)
            cursor.execute("INSERT INTO future VALUES (?,?,?,?,?,?,?,null);", row)
            t += 1
        except Exception, e:
            logger.fdebug('Error - invalid arguments (' + str(e) + ') - skipping row.')

    logger.fdebug('successfully added ' + str(t) + ' issues to future upcoming table.')
    csvfile.close()
    connection.commit()
    connection.close()

    mylar.weeklypull.pullitcheck(futurepull="yes")
    #.end

def populate(link, publisher, shipdate):
    #this is the secondary url call to populate
    url = 'http://www.comicbookresources.com/' + link
    #print 'checking ' + str(url)
    response = urllib2.urlopen(url)
    soup = BeautifulSoup(response)
    abc = soup.findAll('p')
    lenabc = len(abc)
    i = 0
    resultName = []
    resultID = []
    resultURL = []
    matched = "no"
    upcome = []
    get_next = False
    prev_chk = False

    while (i < lenabc):
        titlet = abc[i]   #iterate through the <p> tags, pulling out only results.
        titlet_next = titlet.findNext(text=True)
        #print ("titlet: " + str(titlet))
        if "/prev_img.php?pid" in str(titlet) and titlet_next is None:
            #solicits in 03-2014 have separated <p> tags, so we need to take the subsequent <p>, not the initial one.
            prev_chk = False
            get_next = True
            i += 1
            continue
        elif titlet_next is not None:
            #logger.fdebug('non-separated <p> tags - taking the next text.')
            get_next = False
            prev_chk = True
        elif "/news/preview2.php" in str(titlet):
            prev_chk = True
            get_next = False
        elif get_next == True:
            prev_chk = True
        else:
            prev_chk = False
            get_next = False

        if prev_chk == True:
            tempName = titlet.findNext(text=True)
            if not any([' TPB' in tempName, 'HC' in tempName, 'GN-TPB' in tempName, 'for $1' in tempName.lower(), 'subscription variant' in tempName.lower(), 'poster' in tempName.lower()]):
                if '#' in tempName[:50]:
                    #tempName = tempName.replace(u'.', u"'")
                    tempName = tempName.encode('ascii', 'replace')   #.decode('utf-8')
                    if '???' in tempName:
                        tempName = tempName.replace('???', ' ')
                    stissue = tempName.find('#')
                    endissue = tempName.find(' ', stissue)
                    if tempName[stissue + 1] == ' ':   #if the issue has a space between # and number, adjust.
                        endissue = tempName.find(' ', stissue + 2)
                    if endissue == -1:
                        endissue = len(tempName)
                    issue = tempName[stissue:endissue].lstrip(' ')
                    if ':' in issue:
                        issue = re.sub(':', '', issue).rstrip()
                    exinfo = tempName[endissue:].lstrip(' ')

                    issue1 = None
                    issue2 = None

                    if '-' in issue:
                        #print ('multiple issues detected. Splitting.')
                        ststart = issue.find('-')
                        issue1 = issue[:ststart]
                        issue2 = '#' + str(issue[ststart + 1:])

                    if '&' in exinfo:
                        #print ('multiple issues detected. Splitting.')
                        ststart = exinfo.find('&')
                        issue1 = issue   # this detects fine
                        issue2 = '#' + str(exinfo[ststart + 1:])
                        if '& ' in issue2:
                            issue2 = re.sub("&\\b", "", issue2)
                        exinfo = exinfo.replace(exinfo[ststart + 1:len(issue2)], '').strip()
                        if exinfo == '&':
                            exinfo = 'N/A'
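
                    # For illustration (hypothetical listings):
                    #   "BATMAN #25-27 ..."          -> issue1 '#25',   issue2 '#27'    (split on '-')
                    #   "ACTION COMICS #1000 & 1001" -> issue1 '#1000', issue2 '# 1001' (split on '&';
                    #   the '#' and stray space are stripped when the dict rows are built below)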

                    comic = tempName[:stissue].strip()

                    if 'for $1' in comic:
                        exinfo = 'for $1'
                        comic = comic.replace('for $1:', '').lstrip()

                    issuedate = shipdate
                    if 'on sale' in str(titlet).lower():
                        onsale_start = str(titlet).lower().find('on sale') + 8
                        onsale_end = str(titlet).lower().find('<br>', onsale_start)
                        thedate = str(titlet)[onsale_start:onsale_end]
                        m = None

                        basemonths = {'january': '1', 'jan': '1', 'february': '2', 'feb': '2', 'march': '3', 'mar': '3', 'april': '4', 'apr': '4', 'may': '5', 'june': '6', 'july': '7', 'august': '8', 'aug': '8', 'september': '9', 'sept': '9', 'october': '10', 'oct': '10', 'november': '11', 'nov': '11', 'december': '12', 'dec': '12'}
                        #check the long month names first so that e.g. 'jan' can't match ahead of 'january'
                        for month in sorted(basemonths, key=len, reverse=True):
                            if month in thedate.lower():
                                m = basemonths[month]
                                monthname = month
                                break

                        if m is not None:
                            theday = len(month) + 1   # account for the space between month & day
                            thedaystart = thedate[theday:(theday + 2)].strip()   # day numeric won't exceed 2 digits
                            if len(str(thedaystart)) == 1:
                                thedaystart = '0' + str(thedaystart)
                            if len(str(m)) == 1:
                                m = '0' + str(m)
                            thedate = shipdate[-4:] + '-' + str(m) + '-' + str(thedaystart)

                            logger.info('[' + comic + '] On sale: ' + str(thedate))
                            exinfo += ' [' + str(thedate) + ']'
                            issuedate = thedate
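
                            # For illustration: an "on sale December 25" line with shipdate '12-2018'
                            # yields thedate '2018-12-25', which then overrides the month-level shipdate.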

                    if issue1:
                        upcome.append({
                            'Shipdate': issuedate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue1).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue1))
                        #print ('extra info: ' + exinfo)
                        if issue2:
                            upcome.append({
                                'Shipdate': issuedate,
                                'Publisher': publisher.upper(),
                                'Issue': re.sub('#', '', issue2).lstrip(),
                                'Comic': comic.upper(),
                                'Extra': exinfo.upper()
                            })
                            #print ('Comic: ' + comic)
                            #print ('issue#: ' + re.sub('#', '', issue2))
                            #print ('extra info: ' + exinfo)
                    else:
                        upcome.append({
                            'Shipdate': issuedate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue))
                        #print ('extra info: ' + exinfo)
                else:
                    pass
                    #print ('no issue # to retrieve.')

        i += 1

    return upcome
    #end.

if __name__ == '__main__':
    solicit(sys.argv[1], sys.argv[2])
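    # e.g. (assuming this module is saved as solicit.py): python solicit.py 11 2018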