# mylar/mylar/solicit.py

from bs4 import BeautifulSoup
import urllib2
import csv
import sys
import re
import os
import sqlite3

import mylar
from mylar import logger, helpers


def solicit(month, year):
    # convert to numerics just to ensure this...
    month = int(month)
    year = int(year)
    #print ("month: " + str(month))
    #print ("year: " + str(year))

    # in order to gather ALL upcoming issues, loop through the months ahead one
    # at a time until we get a null, then break. (Usually no more than 3 months
    # in advance is available.)
    mnloop = 0
    upcoming = []
    publishers = {'DC Comics': 'DC Comics', 'DC\'s': 'DC Comics', 'Marvel': 'Marvel Comics', 'Image': 'Image Comics', 'IDW': 'IDW Publishing', 'Dark Horse': 'Dark Horse'}
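    # maps the publisher-name fragments that appear in CBR headlines to the
    # canonical publisher names used downstream; headings whose publisher isn't
    # listed here get skipped entirely.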
    # -- this is no longer needed (testing)
    #while (mnloop < 5):
    #    if year == 2014:
    #        if len(str(month)) == 1:
    #            month_string = '0' + str(month)
    #        else:
    #            month_string = str(month)
    #        datestring = str(year) + str(month_string)
    #    else:
    #        datestring = str(month) + str(year)
    #    pagelinks = "http://www.comicbookresources.com/tag/solicits" + str(datestring)

    # using the solicits+datestring tag occasionally leaves out some entries, so use
    # http://www.comicbookresources.com/tag/solicitations instead, and rather than
    # matching a datestring, find the month term and look ahead up to +5 months.
    if month > 0:
        month_start = month
        month_end = month + 5
        # months past 12 wrap into the next year:
        # e.g. ms = 8, me = 13: [(12 - 8) + (13 - 12)] = [4 + 1] = 5
        # [(12 - ms) + (me - 12)] = number of months (5)
        monthlist = []
        mongr = month_start
        # we need to build the months we can grab, but the non-numeric way.
        while (mongr <= month_end):
            if mongr > 12:
                # wrap any month past December into the following year
                mon = mongr - 12
                monyear = year + 1
            else:
                mon = mongr
                monyear = year
            if len(str(mon)) == 1:
                mon = '0' + str(mon)
            monthlist.append({"month": helpers.fullmonth(str(mon)).lower(),
                              "num_month": mon,
                              "year": str(monyear)})
            mongr += 1
        logger.info('months: ' + str(monthlist))
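        # e.g. month=11, year=2014 yields november/december 2014 followed by
        # january ('01') through april ('04') of 2015.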
    pagelinks = "http://www.comicbookresources.com/tag/solicitations"
    #logger.info('checking: ' + pagelinks)
    pageresponse = urllib2.urlopen(pagelinks)
    soup = BeautifulSoup(pageresponse, 'html.parser')
    cntlinks = soup.findAll('h3')
    lenlinks = len(cntlinks)
    #logger.info(str(lenlinks) + ' results')

    publish = []
    resultURL = []
    resultmonth = []
    resultyear = []
    x = 0
    cnt = 0
    while (x < lenlinks):
        headt = cntlinks[x]  # iterate through the h3 entries, pulling out only results.
        if "/?page=article&amp;id=" in str(headt):
            #print ("titlet: " + str(headt))
            headName = headt.findNext(text=True)
            #print ('headName: ' + headName)
            if 'Image' in headName:
                logger.fdebug('IMAGE FOUND')
            if not all(['Marvel' in headName, 'DC' in headName, 'Image' in headName]) and ('Solicitations' in headName or 'Solicits' in headName):
                # test for month here (int(month) + 5)
                if not any(d.get('month', None) == str(headName).lower() for d in monthlist):
                    for mt in monthlist:
                        if mt['month'] in headName.lower():
                            logger.info('matched on month: ' + str(mt['month']))
                            logger.info('matched on year: ' + str(mt['year']))
                            pubstart = headName.find('Solicitations')
                            publishchk = False
                            for pub in publishers:
                                if pub in headName[:pubstart]:
                                    #print ('publisher: ' + str(publishers[pub]))
                                    publish.append(publishers[pub])
                                    publishchk = True
                                    break
                            if publishchk == False:
                                break
                            #publish.append(headName[:pubstart].strip())
                            # only record month/year/URL once a publisher has matched,
                            # so the four result lists stay index-aligned.
                            resultmonth.append(mt['num_month'])
                            resultyear.append(mt['year'])
                            abc = headt.findAll('a', href=True)[0]
                            ID_som = abc['href']  # the first instance will have the right link...
                            resultURL.append(ID_som)
                            #print ('(' + str(cnt) + ') [' + publish[cnt] + '] Link URL: ' + resultURL[cnt])
                            cnt += 1
                else:
                    logger.info('incorrect month - not using.')
        x += 1
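    # publish, resultURL, resultmonth and resultyear are parallel lists at this
    # point: index n describes the nth matched solicitation page (cnt entries each).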
    if cnt == 0:
        return  # no results means the solicits aren't posted yet - end it.

    loopthis = (cnt - 1)
    # this loops through each 'found' solicit page
    #shipdate = str(month_string) + '-' + str(year) - not needed.
    while (loopthis >= 0):
        #print ('loopthis is : ' + str(loopthis))
        #print ('resultURL is : ' + str(resultURL[loopthis]))
        shipdate = str(resultmonth[loopthis]) + '-' + str(resultyear[loopthis])
        upcoming += populate(resultURL[loopthis], publish[loopthis], shipdate)
        loopthis -= 1

    logger.info(str(len(upcoming)) + ' upcoming issues discovered.')

    newfl = mylar.CACHE_DIR + "/future-releases.txt"
    newtxtfile = open(newfl, 'wb')
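    # cache layout - one tab-separated record per issue:
    # SHIPDATE, PUBLISHER, ISSUE, COMIC, EXTRA, STATUS (always 'Skipped'), FutureID (ordinal)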
    cntr = 1
    for row in upcoming:
        if row['Extra'] is None or row['Extra'] == '':
            extrarow = 'N/A'
        else:
            extrarow = row['Extra']
        newtxtfile.write(str(row['Shipdate']) + '\t' + str(row['Publisher']) + '\t' + str(row['Issue']) + '\t' + str(row['Comic']) + '\t' + str(extrarow) + '\tSkipped' + '\t' + str(cntr) + '\n')
        cntr += 1
    newtxtfile.close()

    logger.fdebug('attempting to populate future upcoming...')
    mylardb = os.path.join(mylar.DATA_DIR, "mylar.db")
    connection = sqlite3.connect(str(mylardb))
    cursor = connection.cursor()

    # we should extract the issues that are being watched but have no data available yet
    # ('Watch For' status). once we get the data, store it, wipe the existing table,
    # retrieve the new data, populate it into the table, recheck the series against the
    # current watchlist and then restore the 'Watch For' data.
    cursor.executescript('drop table if exists future;')
    cursor.execute("CREATE TABLE IF NOT EXISTS future (SHIPDATE text, PUBLISHER text, ISSUE text, COMIC VARCHAR(150), EXTRA text, STATUS text, FutureID text, ComicID text);")
    connection.commit()

    csvfile = open(newfl, "rb")
    creader = csv.reader(csvfile, delimiter='\t')

    t = 0
    for row in creader:
        try:
            #print ("Row: %s" % row)
            cursor.execute("INSERT INTO future VALUES (?,?,?,?,?,?,?,null);", row)
            t += 1
        except Exception, e:
            logger.fdebug("Error - invalid arguments... skipping.")
            pass
    logger.fdebug('successfully added ' + str(t) + ' issues to future upcoming table.')

    csvfile.close()
    connection.commit()
    connection.close()

    mylar.weeklypull.pullitcheck(futurepull="yes")
    #.end
def populate(link, publisher, shipdate):
    # this is the secondary url call, to populate each solicitation page
    pageurl = 'http://www.comicbookresources.com/' + link
    #print ('checking ' + str(pageurl))
    response = urllib2.urlopen(pageurl)
    soup = BeautifulSoup(response, 'html.parser')
    abc = soup.findAll('p')
    lenabc = len(abc)
    i = 0
    upcome = []
    get_next = False
    prev_chk = False
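    # prev_chk / get_next form a small state machine over the <p> tags: get_next
    # flags that a preview-image <p> was split off from its text, so the *next*
    # <p> holds the title line; prev_chk flags that the current <p> should be
    # parsed for an issue title.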
    while (i < lenabc):
        titlet = abc[i]  # iterate through the <p> tags, pulling out only results.
        titlet_next = titlet.findNext(text=True)
        #print ("titlet: " + str(titlet))
        if "/prev_img.php?pid" in str(titlet) and titlet_next is None:
            # solicits in 03-2014 have separated <p> tags, so we need to take the subsequent <p>, not the initial.
            prev_chk = False
            get_next = True
            i += 1
            continue
        elif titlet_next is not None:
            #logger.fdebug('non-separated <p> tags - taking next text.')
            get_next = False
            prev_chk = True
        elif "/news/preview2.php" in str(titlet):
            prev_chk = True
            get_next = False
        elif get_next == True:
            prev_chk = True
        else:
            prev_chk = False
            get_next = False

        if prev_chk == True:
            tempName = titlet.findNext(text=True)
            if not any([' TPB' in tempName, 'HC' in tempName, 'GN-TPB' in tempName, 'for $1' in tempName.lower(), 'subscription variant' in tempName.lower(), 'poster' in tempName.lower()]):
                if '#' in tempName[:50]:
                    #tempName = tempName.replace(u'.', u"'")
                    tempName = tempName.encode('ascii', 'replace')  #.decode('utf-8')
                    if '???' in tempName:
                        tempName = tempName.replace('???', ' ')
                    stissue = tempName.find('#')
                    endissue = tempName.find(' ', stissue)
                    if stissue + 1 < len(tempName) and tempName[stissue + 1] == ' ':  # if the issue has a space between # and number, adjust.
                        endissue = tempName.find(' ', stissue + 2)
                    if endissue == -1: endissue = len(tempName)
                    issue = tempName[stissue:endissue].lstrip(' ')
                    if ':' in issue: issue = re.sub(':', '', issue).rstrip()
                    exinfo = tempName[endissue:].lstrip(' ')
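                    # e.g. tempName 'BATMAN #27 (variant cover)' splits into
                    # issue '#27' and exinfo '(variant cover)'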
                    issue1 = None
                    issue2 = None
                    if '-' in issue:
                        #print ('multiple issues detected. Splitting.')
                        ststart = issue.find('-')
                        issue1 = issue[:ststart]
                        issue2 = '#' + str(issue[ststart + 1:])
                    if '&' in exinfo:
                        #print ('multiple issues detected. Splitting.')
                        ststart = exinfo.find('&')
                        issue1 = issue  # this detects fine
                        issue2 = '#' + str(exinfo[ststart + 1:])
                        if '& ' in issue2: issue2 = re.sub("&\\b", "", issue2)
                        exinfo = exinfo.replace(exinfo[ststart + 1:len(issue2)], '').strip()
                        if exinfo == '&': exinfo = 'N/A'
                    comic = tempName[:stissue].strip()
                    if 'for $1' in comic:
                        exinfo = 'for $1'
                        comic = comic.replace('for $1:', '').lstrip()

                    issuedate = shipdate
                    if 'on sale' in str(titlet).lower():
                        onsale_start = str(titlet).lower().find('on sale') + 8
                        onsale_end = str(titlet).lower().find('<br>', onsale_start)
                        thedate = str(titlet)[onsale_start:onsale_end]
                        m = None
                        monthname = None
                        basemonths = {'january': '1', 'jan': '1', 'february': '2', 'feb': '2', 'march': '3', 'mar': '3', 'april': '4', 'apr': '4', 'may': '5', 'june': '6', 'jun': '6', 'july': '7', 'jul': '7', 'august': '8', 'aug': '8', 'september': '9', 'sept': '9', 'sep': '9', 'october': '10', 'oct': '10', 'november': '11', 'nov': '11', 'december': '12', 'dec': '12'}
                        # try the longest names first so 'january' wins over 'jan'
                        # and the day offset below stays correct.
                        for month in sorted(basemonths, key=len, reverse=True):
                            if month in thedate.lower():
                                m = basemonths[month]
                                monthname = month
                                break
                        if m is not None:
                            theday = len(monthname) + 1  # account for the space between month & day
                            thedaystart = thedate[theday:(theday + 2)].strip()  # day numeric won't exceed 2 digits
                            if len(str(thedaystart)) == 1:
                                thedaystart = '0' + str(thedaystart)
                            if len(str(m)) == 1:
                                m = '0' + str(m)
                            thedate = shipdate[-4:] + '-' + str(m) + '-' + str(thedaystart)
                            logger.info('[' + comic + '] On sale: ' + str(thedate))
                            exinfo += ' [' + str(thedate) + ']'
                            issuedate = thedate
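                            # e.g. 'On sale December 4' with shipdate '12-2013'
                            # resolves issuedate to '2013-12-04'.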
                    if issue1:
                        upcome.append({
                            'Shipdate': issuedate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue1).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue1))
                        #print ('extra info: ' + exinfo)
                        if issue2:
                            upcome.append({
                                'Shipdate': issuedate,
                                'Publisher': publisher.upper(),
                                'Issue': re.sub('#', '', issue2).lstrip(),
                                'Comic': comic.upper(),
                                'Extra': exinfo.upper()
                            })
                            #print ('Comic: ' + comic)
                            #print ('issue#: ' + re.sub('#', '', issue2))
                            #print ('extra info: ' + exinfo)
                    else:
                        upcome.append({
                            'Shipdate': issuedate,
                            'Publisher': publisher.upper(),
                            'Issue': re.sub('#', '', issue).lstrip(),
                            'Comic': comic.upper(),
                            'Extra': exinfo.upper()
                        })
                        #print ('Comic: ' + comic)
                        #print ('issue#: ' + re.sub('#', '', issue))
                        #print ('extra info: ' + exinfo)
                else:
                    pass
                    #print ('no issue # to retrieve.')
        i += 1
    return upcome
    #end.
if __name__ == '__main__':
    solicit(sys.argv[1], sys.argv[2])
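    # standalone usage (assuming a configured mylar environment with CACHE_DIR
    # and DATA_DIR set): python solicit.py <month> <year>
    # e.g. 'python solicit.py 3 2014' scans March through August 2014 solicits.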