mylar/mylar/newpull.py


from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import csv
import fileinput
import sys
import re
import os
import sqlite3
import datetime
import unicodedata
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime
import lib.requests as requests

import mylar
from mylar import logger

# Text type of the running interpreter (``unicode`` on Python 2, ``str`` on 3).
_TEXT_TYPE = type(u'')

# HTML table markup stripped from every line before it is written out; the
# price column is stored as a whole <td> tag, so its markup ends up in the line.
_ROW_MARKUP = ("<td>", "<tr>", "</td>", "</tr>")


def _as_text(value):
    """Return *value* as a text string without implicit ASCII coercion.

    Byte strings are decoded as UTF-8 (undecodable bytes are dropped);
    every other object is converted through the text type's constructor.
    """
    if isinstance(value, bytes):
        return value.decode('utf-8', 'ignore')
    return _TEXT_TYPE(value)


def newpull():
    """Scrape the PreviewsWorld new-releases page into the cached pull-list file.

    Fetches the new-releases page, walks every table row and records each
    catalog entry found after one of the known publisher headings. The
    result is written to ``newreleases.txt`` inside ``mylar.CACHE_DIR``:
    first the release-date headline, then, per publisher, a heading line
    followed by one ``ID<TAB>name<TAB>price`` line for each issue. Output is
    ASCII; characters that cannot be represented are replaced.

    Returns None. When the page cannot be fetched, or carries no date
    headline, a warning is logged and any existing cache file is left
    untouched.
    """
    pagelinks = "http://www.previewsworld.com/Home/1/1/71/952"

    try:
        r = requests.get(pagelinks, verify=False)
    except Exception as e:
        # Without a response there is nothing to parse, so stop here instead
        # of failing later on an unbound variable.
        logger.warn('Error fetching data from %s: %s' % (pagelinks, e))
        return

    soup = BeautifulSoup(r.content)
    headlines = soup.findAll("div", {"class": "Headline"})
    if not headlines:
        logger.warn('Unable to locate the release date headline on %s' % pagelinks)
        return
    getthedate = headlines[0]

    #the date will be in the FIRST ahref
    try:
        getdate_link = getthedate('a')[0]
        newdates = getdate_link.findNext(text=True).strip()
    except IndexError:
        # no link inside the headline: fall back to its first text node
        newdates = getthedate.findNext(text=True).strip()
    logger.fdebug('New Releases date detected as : ' + re.sub('New Releases For', '', newdates).strip())

    publishers = {'PREVIEWS PUBLICATIONS', 'DARK HORSE COMICS', 'DC COMICS', 'IDW PUBLISHING', 'IMAGE COMICS', 'MARVEL COMICS', 'COMICS & GRAPHIC NOVELS'}
    # Publisher of the most recent heading row; it applies to every entry row
    # that follows until another heading is seen.
    isspublisher = None
    pull_list = []

    for headt in soup.findAll('tr'):
        found_iss = headt.findAll('td')
        if not found_iss:
            # a row without <td> cells has nothing to inspect
            continue

        pubcheck = found_iss[0].text.strip()
        for pub in publishers:
            if pub in pubcheck:
                #make sure it doesn't have a link in it.
                if not found_iss[0].findAll('a', href=True):
                    isspublisher = pub
                    break

        if isspublisher is None or isspublisher == 'PREVIEWS PUBLICATIONS':
            continue

        if '/Catalog/' in str(headt):
            row_links = headt.findAll('a', href=True)
            if not row_links:
                continue
            findurl_link = row_links[0]
            urlID = findurl_link.findNext(text=True)
            issue_link = findurl_link['href']
            # only the row's first link counts, and menu links are ignored
            if '/Catalog/' not in issue_link or "Home/1/1/71" in issue_link:
                continue

            # name and price are read from the second and third cells
            if len(found_iss) > 2:
                pull_list.append({"iss_url":   issue_link,
                                  "name":      found_iss[1].findNext(text=True),
                                  "price":     found_iss[2],
                                  "publisher": isspublisher,
                                  "ID":        urlID})

        # NOTE(review): ``in`` on a bs4 Tag appears to test its direct
        # children rather than its text, so these stop conditions may never
        # fire -- confirm the intended check before changing it.
        if "PREVIEWS" in headt or "MAGAZINES" in headt:
            break

    logger.fdebug('Saving new pull-list information into local file for subsequent merge')
    except_file = os.path.join(mylar.CACHE_DIR, 'newreleases.txt')
    if os.path.isfile(except_file):
        logger.fdebug('file exists - removing.')
        os.remove(except_file)
    else:
        logger.fdebug('file does not exist - continuing.')

    oldpub = None
    with open(except_file, 'wb') as f:
        f.write(_as_text(newdates).encode('ascii', 'replace') + b'\n')
        for pl in pull_list:
            exceptln = u"\t".join([_as_text(pl['ID']),
                                   _as_text(pl['name']).replace(u"\xA0", u" "),
                                   _as_text(pl['price'])])
            if pl['publisher'] != oldpub:
                # first entry of a new publisher: emit the heading line first
                exceptln = _as_text(pl['publisher']) + u"\n" + exceptln

            for lb in _ROW_MARKUP:
                exceptln = exceptln.replace(lb, u'')

            # Encode once, at the file boundary; the text is already unicode,
            # so no decode step (and no implicit ASCII coercion) is needed.
            f.write(exceptln.strip().encode('ascii', 'replace').strip() + b'\n')
            oldpub = pl['publisher']

# Running this module directly as a script performs a single newpull() run.
if __name__ == "__main__":
    newpull()
Way too many fixes, some improvements (multiple_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00
			`from bs4 import BeautifulSoup, UnicodeDammit`
			`import urllib2`
			`import csv`
			`import fileinput`
			`import sys`
			`import re`
			`import os`
			`import sqlite3`
			`import datetime`
			`import unicodedata`
			`from decimal import Decimal`
			`from HTMLParser import HTMLParseError`
			`from time import strptime`
FIX: One-off Failed Download handling will now work, IMP: Some better error handling when detecting one-off's during post-processing, FIX:(#1143) When series contained a digit preceded by a dash, would incorrectly assume it as a negative issue number, FIX: Improved being able to detect corresponding annuals on CV when refreshing/importing a series when the annuals are new (ie. no data on CV), FIX: Alt_Pull method for weekly pull list retrieval working again, FIX: Fixed nzbid detection for experimental search, IMP: Mass Import button now available on Import Results screen (will attempt to import all series that are in a 'Not Imported' status), IMP: When searching for arc issues using the 'Search for Missing' option, the call is now threaded so it runs in the background, IMP: Changed IssueYear to IssueDate for Story Arc Issues on the arc details page, FIX:(#1156) Typo that caused error when attempting to view cbz comics in the series detail page, FIX:(#1145) Select All option via top checkbox (on series detail page), FIX: Auto-Want feature via weeklypull will better match to titles that contain 'the' and have hypens in differing character positions, FIX:(#1160) Would incorrectly take the length of a decimal placed issue when searching and never complete the issue number cylcing search, FIX:(#1161) When annuals not enabled, and series in watchlist and series annual in pullist, would error out trying to link series to pullist. 2015-11-18 06:32:40 +00:00			`import lib.requests as requests`
Way too many fixes, some improvements (multiple_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00
			`import mylar`
			`from mylar import logger`

			`def newpull():`
			`pagelinks = "http://www.previewsworld.com/Home/1/1/71/952"`

FIX: One-off Failed Download handling will now work, IMP: Some better error handling when detecting one-off's during post-processing, FIX:(#1143) When series contained a digit preceded by a dash, would incorrectly assume it as a negative issue number, FIX: Improved being able to detect corresponding annuals on CV when refreshing/importing a series when the annuals are new (ie. no data on CV), FIX: Alt_Pull method for weekly pull list retrieval working again, FIX: Fixed nzbid detection for experimental search, IMP: Mass Import button now available on Import Results screen (will attempt to import all series that are in a 'Not Imported' status), IMP: When searching for arc issues using the 'Search for Missing' option, the call is now threaded so it runs in the background, IMP: Changed IssueYear to IssueDate for Story Arc Issues on the arc details page, FIX:(#1156) Typo that caused error when attempting to view cbz comics in the series detail page, FIX:(#1145) Select All option via top checkbox (on series detail page), FIX: Auto-Want feature via weeklypull will better match to titles that contain 'the' and have hypens in differing character positions, FIX:(#1160) Would incorrectly take the length of a decimal placed issue when searching and never complete the issue number cylcing search, FIX:(#1161) When annuals not enabled, and series in watchlist and series annual in pullist, would error out trying to link series to pullist. 2015-11-18 06:32:40 +00:00			`try:`
			`r = requests.get(pagelinks, verify=False)`

			`except Exception, e:`
			`logger.warn('Error fetching data: %s' % (tmpprov, e))`

			`soup = BeautifulSoup(r.content)`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`getthedate = soup.findAll("div", {"class": "Headline"})[0]`
FIX: One-off Failed Download handling will now work, IMP: Some better error handling when detecting one-offs during post-processing, FIX:(#1143) When series contained a digit preceded by a dash, would incorrectly assume it as a negative issue number, FIX: Improved being able to detect corresponding annuals on CV when refreshing/importing a series when the annuals are new (ie. no data on CV), FIX: Alt_Pull method for weekly pull list retrieval working again, FIX: Fixed nzbid detection for experimental search, IMP: Mass Import button now available on Import Results screen (will attempt to import all series that are in a 'Not Imported' status), IMP: When searching for arc issues using the 'Search for Missing' option, the call is now threaded so it runs in the background, IMP: Changed IssueYear to IssueDate for Story Arc Issues on the arc details page, FIX:(#1156) Typo that caused error when attempting to view cbz comics in the series detail page, FIX:(#1145) Select All option via top checkbox (on series detail page), FIX: Auto-Want feature via weeklypull will better match to titles that contain 'the' and have hyphens in differing character positions, FIX:(#1160) Would incorrectly take the length of a decimal placed issue when searching and never complete the issue number cycling search, FIX:(#1161) When annuals not enabled, and series in watchlist and series annual in pull-list, would error out trying to link series to pull-list. 2015-11-18 06:32:40 +00:00
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`#the date will be in the FIRST ahref`
FIX:(#939) Redirect loop error when ALT_PULL is enabled and attempting to load up new weekly pull-list, FIX: When manual post-processing and doing out-of-whack check if the Series Total was equal to the Have Total, would perform an endless loop sequence. 2015-02-02 20:29:21 +00:00			`try:`
			`getdate_link = getthedate('a')[0]`
			`newdates = getdate_link.findNext(text=True).strip()`
			`except IndexError:`
			`newdates = getthedate.findNext(text=True).strip()`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`logger.fdebug('New Releases date detected as : ' + re.sub('New Releases For', '', newdates).strip())`
			`cntlinks = soup.findAll('tr')`
			`lenlinks = len(cntlinks)`

			`publish = []`
			`resultURL = []`
			`resultmonth = []`
			`resultyear = []`

			`x = 0`
			`cnt = 0`
			`endthis = False`
			`pull_list = []`

FIX: One-off Failed Download handling will now work, IMP: Some better error handling when detecting one-off's during post-processing, FIX:(#1143) When series contained a digit preceded by a dash, would incorrectly assume it as a negative issue number, FIX: Improved being able to detect corresponding annuals on CV when refreshing/importing a series when the annuals are new (ie. no data on CV), FIX: Alt_Pull method for weekly pull list retrieval working again, FIX: Fixed nzbid detection for experimental search, IMP: Mass Import button now available on Import Results screen (will attempt to import all series that are in a 'Not Imported' status), IMP: When searching for arc issues using the 'Search for Missing' option, the call is now threaded so it runs in the background, IMP: Changed IssueYear to IssueDate for Story Arc Issues on the arc details page, FIX:(#1156) Typo that caused error when attempting to view cbz comics in the series detail page, FIX:(#1145) Select All option via top checkbox (on series detail page), FIX: Auto-Want feature via weeklypull will better match to titles that contain 'the' and have hypens in differing character positions, FIX:(#1160) Would incorrectly take the length of a decimal placed issue when searching and never complete the issue number cylcing search, FIX:(#1161) When annuals not enabled, and series in watchlist and series annual in pullist, would error out trying to link series to pullist. 2015-11-18 06:32:40 +00:00			`publishers = {'PREVIEWS PUBLICATIONS', 'DARK HORSE COMICS', 'DC COMICS', 'IDW PUBLISHING', 'IMAGE COMICS', 'MARVEL COMICS', 'COMICS & GRAPHIC NOVELS'}`
			`isspublisher = None`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00
			`while (x < lenlinks):`
			`headt = cntlinks[x] #iterate through the hrefs pulling out only results.`
FIX: One-off Failed Download handling will now work, IMP: Some better error handling when detecting one-off's during post-processing, FIX:(#1143) When series contained a digit preceded by a dash, would incorrectly assume it as a negative issue number, FIX: Improved being able to detect corresponding annuals on CV when refreshing/importing a series when the annuals are new (ie. no data on CV), FIX: Alt_Pull method for weekly pull list retrieval working again, FIX: Fixed nzbid detection for experimental search, IMP: Mass Import button now available on Import Results screen (will attempt to import all series that are in a 'Not Imported' status), IMP: When searching for arc issues using the 'Search for Missing' option, the call is now threaded so it runs in the background, IMP: Changed IssueYear to IssueDate for Story Arc Issues on the arc details page, FIX:(#1156) Typo that caused error when attempting to view cbz comics in the series detail page, FIX:(#1145) Select All option via top checkbox (on series detail page), FIX: Auto-Want feature via weeklypull will better match to titles that contain 'the' and have hypens in differing character positions, FIX:(#1160) Would incorrectly take the length of a decimal placed issue when searching and never complete the issue number cylcing search, FIX:(#1161) When annuals not enabled, and series in watchlist and series annual in pullist, would error out trying to link series to pullist. 2015-11-18 06:32:40 +00:00			`found_iss = headt.findAll('td')`
			`pubcheck = found_iss[0].text.strip() #.findNext(text=True)`
			`for pub in publishers:`
			`if pub in pubcheck:`
			`chklink = found_iss[0].findAll('a', href=True) #make sure it doesn't have a link in it.`
			`if not chklink:`
			`isspublisher = pub`
			`break`

			`if isspublisher == 'PREVIEWS PUBLICATIONS' or isspublisher is None:`
			`pass`

			`else:`
			`if '/Catalog/' in str(headt):`
			`findurl_link = headt.findAll('a', href=True)[0]`
			`urlID = findurl_link.findNext(text=True)`
			`issue_link = findurl_link['href']`
			`issue_lk = issue_link.find('/Catalog/')`
			`if issue_lk == -1:`
			`x+=1`
			`continue`
			`elif "Home/1/1/71" in issue_link:`
			`#logger.fdebug('Ignoring - menu option.')`
			`x+=1`
			`continue`

			`if len(found_iss) > 0:`
			`pull_list.append({"iss_url": issue_link,`
			`"name": found_iss[1].findNext(text=True),`
			`"price": found_iss[2],`
			`"publisher": isspublisher,`
			`"ID": urlID})`

			`if "PREVIEWS" in headt:`
			`#logger.fdebug('Ignoring: ' + found_iss[0])`
			`break`
			`if "MAGAZINES" in headt:`
			`#logger.fdebug('End.')`
			`endthis = True`
			`break`

Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`x+=1`

			`logger.fdebug('Saving new pull-list information into local file for subsequent merge')`
FIX:(#891) If series on watchlist was on pull-list listed as a comp (ie.l-4), would assume it was a valid issue and error out, FIX: Fixed an invalid cache location reference point 2014-11-25 16:59:56 +00:00			`except_file = os.path.join(mylar.CACHE_DIR, 'newreleases.txt')`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`try:`
			`csvfile = open(str(except_file), 'rb')`
			`csvfile.close()`
Whitespace cleanup 2015-05-22 08:32:51 +00:00			`except (OSError, IOError):`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`logger.fdebug('file does not exist - continuing.')`
			`else:`
			`logger.fdebug('file exists - removing.')`
			`os.remove(except_file)`

			`oldpub = None`
			`breakhtml = {"<td>", "<tr>", "</td>", "</tr>"}`
			`with open(str(except_file), 'wb') as f:`
			`f.write('%s\n' % (newdates))`
			`for pl in pull_list:`
			`if pl['publisher'] == oldpub:`
FIX: included version of comictagger should now work with both Windows and *nix based OS' again, IMP: Global Copy/Move option available when performing post-processing, IMP: Added a verbose file-checking option (FOLDER_SCAN_LOG_VERBOSE) - when enabled will log as it currently does during manual post-processing/file-checking runs, when disabled it will not spam the log nearly as much resulting in more readable log files, IMP: Added Verbose debug logging both via startup option(-v), as well as toggle button in Log GUI (from headphones), as well as per-page loading of log file(s) in GUI, FIX: When doing manual post-processing on issues that were in story arcs, will now indicate X story-arc issues were post-processed for better visibility, FIX: Fixed an issue with deleting from the nzblog table when story arc issues were post-processed, IMP: Added WEEKFOLDER_LOC to the config.ini to allow for specification of where the weekly download directories will default to (as opposed to off of ComicLocation root), IMP: Better handling of some special character references in series titles when looking for series on the auto-wanted list, IMP: 32P will now auto-disable provider if logon returns invalid credentials, FIX: When using alt_pull on weekly pull list, xA0 unicode character caused error, FIX: If title had invalid character in filename that was replaced with a character that already existed in the title, would not scan in during file-checking, FIX: When searching for a series (weeklypull-list/add a series), if the title contained 'and' or '&' would return really mixed up results, FIX: When Post-Processing, if filename being processed had special characters (ie. 
comma) and was different than nzbname, in some cases would fail to find/move issues, IMP: Utilize internal comictagger to convert from cbr/cbz, IMP: Added more checks when post-processing to ensure files are handled correctly, IMP: Added meta-tag reading when importing series/issues - if previously tagged with CT, will reverse look-up the provided IssueID to reference the correct ComicID, IMP: If scanned directory during import contins cvinfo file, use that and force the ComicID to entire directory when importing a series, IMP: Manual meta-tagging issues will no longer create temporary directories and/or create files in the Comic Location root causing problems for some users, FIX: Annuals weren't properly sorted upon loading of comic details page for some series, IMP: Added some extra checks when validating/creating directories, FIX: Fixed a problem when displaying some covers of .cbz files on the comic details page 2016-01-26 07:49:56 +00:00			`exceptln = str(pl['ID']) + "\t" + pl['name'].replace(u"\xA0", u" ") + "\t" + str(pl['price'])`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`else:`
FIX: included version of comictagger should now work with both Windows and *nix based OS' again, IMP: Global Copy/Move option available when performing post-processing, IMP: Added a verbose file-checking option (FOLDER_SCAN_LOG_VERBOSE) - when enabled will log as it currently does during manual post-processing/file-checking runs, when disabled it will not spam the log nearly as much resulting in more readable log files, IMP: Added Verbose debug logging both via startup option(-v), as well as toggle button in Log GUI (from headphones), as well as per-page loading of log file(s) in GUI, FIX: When doing manual post-processing on issues that were in story arcs, will now indicate X story-arc issues were post-processed for better visibility, FIX: Fixed an issue with deleting from the nzblog table when story arc issues were post-processed, IMP: Added WEEKFOLDER_LOC to the config.ini to allow for specification of where the weekly download directories will default to (as opposed to off of ComicLocation root), IMP: Better handling of some special character references in series titles when looking for series on the auto-wanted list, IMP: 32P will now auto-disable provider if logon returns invalid credentials, FIX: When using alt_pull on weekly pull list, xA0 unicode character caused error, FIX: If title had invalid character in filename that was replaced with a character that already existed in the title, would not scan in during file-checking, FIX: When searching for a series (weeklypull-list/add a series), if the title contained 'and' or '&' would return really mixed up results, FIX: When Post-Processing, if filename being processed had special characters (ie. 
comma) and was different than nzbname, in some cases would fail to find/move issues, IMP: Utilize internal comictagger to convert from cbr/cbz, IMP: Added more checks when post-processing to ensure files are handled correctly, IMP: Added meta-tag reading when importing series/issues - if previously tagged with CT, will reverse look-up the provided IssueID to reference the correct ComicID, IMP: If scanned directory during import contins cvinfo file, use that and force the ComicID to entire directory when importing a series, IMP: Manual meta-tagging issues will no longer create temporary directories and/or create files in the Comic Location root causing problems for some users, FIX: Annuals weren't properly sorted upon loading of comic details page for some series, IMP: Added some extra checks when validating/creating directories, FIX: Fixed a problem when displaying some covers of .cbz files on the comic details page 2016-01-26 07:49:56 +00:00			`exceptln = pl['publisher'] + "\n" + str(pl['ID']) + "\t" + pl['name'].replace(u"\xA0", u" ") + "\t" + str(pl['price'])`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00
			`for lb in breakhtml:`
Whitespace cleanup 2015-05-22 08:32:51 +00:00			`exceptln = re.sub(lb, '', exceptln).strip()`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00
Whitespace cleanup 2015-05-22 08:32:51 +00:00			`exceptline = exceptln.decode('utf-8', 'ignore')`
			`f.write('%s\n' % (exceptline.encode('ascii', 'replace').strip()))`
Way too many fixes, some improvements (multipe_dest_dirs, alt weekly pull) 2014-11-21 17:16:28 +00:00			`oldpub = pl['publisher']`


			`if __name__ == '__main__':`
			`newpull()`