# mylar/mylar/newpull.py
#
# 131 lines, 4.6 KiB, Python
# (repository viewer links: Raw / Normal View / History)

from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import csv
import fileinput
import sys
import re
import os
import sqlite3
import datetime
import unicodedata
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime
import lib.requests as requests
import mylar
from mylar import logger
def newpull():
    """Scrape the PreviewsWorld new-releases page into a local cache file.

    Fetches the release listing, walks every table row to collect the issues
    listed under the tracked publisher headings, and writes the result to
    ``newreleases.txt`` inside ``mylar.CACHE_DIR``.

    The first line of the file is the release-date headline. Each publisher
    name is written once, followed by one ``ID<TAB>name<TAB>price`` line per
    issue collected for that publisher.

    Returns:
        None. If the page cannot be fetched, or no headline block is found,
        a warning is logged and the cache file is left untouched.
    """
    pagelinks = "http://www.previewsworld.com/Home/1/1/71/952"

    try:
        r = requests.get(pagelinks, verify=False)
    except Exception as e:
        # Nothing to parse without a response, so bail out instead of
        # falling through to an unbound ``r``.
        logger.warn('Error fetching data: %s' % e)
        return

    soup = BeautifulSoup(r.content)
    headlines = soup.findAll("div", {"class": "Headline"})
    if not headlines:
        logger.warn('Unable to locate the release date headline - aborting.')
        return
    getthedate = headlines[0]

    # The date will be in the FIRST ahref; fall back to the headline's own
    # text when the headline contains no link.
    try:
        newdates = getthedate('a')[0].findNext(text=True).strip()
    except IndexError:
        newdates = getthedate.findNext(text=True).strip()
    logger.fdebug('New Releases date detected as : ' + re.sub('New Releases For', '', newdates).strip())

    publishers = {'PREVIEWS PUBLICATIONS', 'DARK HORSE COMICS', 'DC COMICS', 'IDW PUBLISHING', 'IMAGE COMICS', 'MARVEL COMICS', 'COMICS & GRAPHIC NOVELS'}
    isspublisher = None  # most recent publisher heading seen while walking rows
    pull_list = []

    # Iterate through the table rows pulling out only results.
    for headt in soup.findAll('tr'):
        found_iss = headt.findAll('td')
        if not found_iss:
            # Rows without any <td> cells carry neither a heading nor an issue.
            continue

        pubcheck = found_iss[0].text.strip()
        for pub in publishers:
            if pub in pubcheck:
                # Make sure it doesn't have a link in it: only an unlinked
                # first cell is treated as a publisher heading.
                if not found_iss[0].findAll('a', href=True):
                    isspublisher = pub
                    break

        if isspublisher is None or isspublisher == 'PREVIEWS PUBLICATIONS':
            continue

        if '/Catalog/' in str(headt):
            findurl_link = headt.findAll('a', href=True)[0]
            urlID = findurl_link.findNext(text=True)
            issue_link = findurl_link['href']
            if '/Catalog/' not in issue_link or "Home/1/1/71" in issue_link:
                # Not a catalog entry, or a menu option - ignore the row.
                continue

            # Name and price are read from the 2nd and 3rd cells.
            if len(found_iss) >= 3:
                pull_list.append({"iss_url": issue_link,
                                  "name": found_iss[1].findNext(text=True),
                                  "price": found_iss[2],
                                  "publisher": isspublisher,
                                  "ID": urlID})

        # NOTE(review): ``in`` on a bs4 Tag tests its direct children, not the
        # row's text, so these sentinels look like they rarely match. Left
        # unchanged pending confirmation of the intended stop condition.
        if "PREVIEWS" in headt or "MAGAZINES" in headt:
            break

    logger.fdebug('Saving new pull-list information into local file for subsequent merge')
    except_file = os.path.join(mylar.CACHE_DIR, 'newreleases.txt')

    if os.path.isfile(except_file):
        logger.fdebug('file exists - removing.')
        os.remove(except_file)
    else:
        logger.fdebug('file does not exist - continuing.')

    oldpub = None
    breakhtml = ("<td>", "<tr>", "</td>", "</tr>")
    with open(str(except_file), 'wb') as f:
        f.write('%s\n' % (newdates))
        for pl in pull_list:
            exceptln = str(pl['ID']) + "\t" + pl['name'].replace(u"\xA0", u" ") + "\t" + str(pl['price'])
            if pl['publisher'] != oldpub:
                # First issue of a new publisher: emit the heading line first.
                exceptln = pl['publisher'] + "\n" + exceptln

            # The price is a whole table cell, so strip the bare cell/row tags.
            for lb in breakhtml:
                exceptln = exceptln.replace(lb, '')
            exceptln = exceptln.strip()

            # Decode only byte strings: calling decode() on text would force
            # an implicit ASCII encode first and fail on non-ASCII titles.
            if isinstance(exceptln, bytes):
                exceptln = exceptln.decode('utf-8', 'ignore')
            f.write('%s\n' % (exceptln.encode('ascii', 'replace').strip()))
            oldpub = pl['publisher']
if __name__ == '__main__':
    # Run the scrape once when this module is executed directly.
    newpull()