# mylar/mylar/newpull.py


from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import csv
import fileinput
import sys
import re
import os
import sqlite3
import datetime
import unicodedata
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime
import requests

import mylar
from mylar import logger

# Section headings recognised in the first cell of a table row, checked in this order.
_PULL_PUBLISHERS = (
    'PREVIEWS PUBLICATIONS', 'DARK HORSE COMICS', 'DC COMICS', 'IDW PUBLISHING',
    'IMAGE COMICS', 'MARVEL COMICS', 'COMICS & GRAPHIC NOVELS', 'MAGAZINES',
    'MERCHANDISE',
)

# Reaching one of these sections ends the scan; nothing after them is collected.
_PULL_STOP_SECTIONS = ('MAGAZINES', 'MERCHANDISE')


def _collect_pull_entries(rows):
    """Return one dict per catalog row found in *rows* (a list of ``<tr>`` tags).

    Each dict has the keys ``iss_url``, ``name``, ``price``, ``publisher`` and
    ``ID``. Rows are attributed to the most recent publisher heading seen;
    rows before the first heading and rows under 'PREVIEWS PUBLICATIONS' are
    ignored, and scanning stops at the first MAGAZINES / MERCHANDISE heading.
    """
    entries = []
    current_publisher = None

    for row in rows:
        cells = row.findAll('td')
        if not cells:
            # Rows without any <td> (e.g. header rows) carry no data.
            continue

        # A heading row names a publisher in its first cell and has no link in it.
        heading = cells[0].text.strip()
        matched = next((pub for pub in _PULL_PUBLISHERS if pub in heading), None)
        if matched is not None and not cells[0].findAll('a', href=True):
            current_publisher = matched

        if current_publisher is None or current_publisher == 'PREVIEWS PUBLICATIONS':
            continue
        if current_publisher in _PULL_STOP_SECTIONS:
            break

        # NOTE(review): membership on a Tag appears to test its direct children
        # rather than its text, so this probably never fires - kept as-is to
        # preserve behaviour; confirm intent before turning it into a text search.
        if "PREVIEWS" in row:
            break

        if '/Catalog/' not in str(row):
            continue

        links = row.findAll('a', href=True)
        if not links:
            continue
        first_link = links[0]
        issue_link = first_link['href']
        # Only catalog links are issues; the Home/1/1/71 links are menu options.
        if '/Catalog/' not in issue_link or "Home/1/1/71" in issue_link:
            continue

        # The name and price are read from the 2nd and 3rd cells.
        if len(cells) < 3:
            continue

        name = cells[1].findNext(text=True)
        entries.append({"iss_url":   issue_link,
                        "name":      name if name is not None else u'',
                        "price":     cells[2],
                        "publisher": current_publisher,
                        "ID":        first_link.findNext(text=True)})

    return entries


def _write_pull_file(path, header, entries):
    """Write *header* and the tab-separated *entries* to *path* as ASCII text.

    The first line is *header*. Every entry becomes ``ID<TAB>name<TAB>price``,
    preceded by a line holding the publisher name whenever the publisher
    differs from the previous entry's. Non-ASCII characters are replaced with
    ``?`` rather than raising.
    """
    # Only the bare <td>/<tr> open and close tags are removed from a line.
    bare_row_tags = re.compile(r'</?t[dr]>')
    previous_publisher = None

    with open(path, 'wb') as f:
        f.write(header.encode('ascii', 'replace') + b'\n')
        for entry in entries:
            # Build the line as unicode so non-ASCII text cannot trigger an
            # implicit ASCII encode/decode; it is encoded once, at write time.
            line = u"%s\t%s\t%s" % (entry['ID'],
                                    entry['name'].replace(u"\xA0", u" "),
                                    entry['price'])
            if entry['publisher'] != previous_publisher:
                line = u"%s\n%s" % (entry['publisher'], line)

            line = bare_row_tags.sub(u'', line).strip()
            f.write(line.encode('ascii', 'replace').strip() + b'\n')
            previous_publisher = entry['publisher']


def newpull():
    """Fetch the PreviewsWorld new-releases page and cache it as a text file.

    The page is downloaded and its table rows are turned into pull-list
    entries, which are written to ``newreleases.txt`` inside
    ``mylar.CACHE_DIR``, replacing any existing file. If the page cannot be
    fetched, or the release-date headline cannot be found, a warning is logged
    and the existing file is left untouched.

    Returns:
        None.
    """
    pagelinks = "http://www.previewsworld.com/Home/1/1/71/952"

    try:
        r = requests.get(pagelinks, verify=False, timeout=30)
    except Exception as e:
        logger.warn('Error fetching data: %s' % e)
        # Without a response there is nothing to parse.
        return

    soup = BeautifulSoup(r.content)

    headlines = soup.findAll("div", {"class": "Headline"})
    if not headlines:
        logger.warn('Unable to locate the release date headline - aborting pull-list update.')
        return

    # The date will be in the FIRST anchor of the headline, or failing that in
    # the first text node following the headline itself.
    headline = headlines[0]
    anchors = headline('a')
    date_source = anchors[0] if anchors else headline
    date_text = date_source.findNext(text=True)
    if date_text is None:
        logger.warn('Unable to read the release date from the headline - aborting pull-list update.')
        return
    newdates = date_text.strip()
    logger.fdebug('New Releases date detected as : ' + re.sub('New Releases For', '', newdates).strip())

    pull_list = _collect_pull_entries(soup.findAll('tr'))

    logger.fdebug('Saving new pull-list information into local file for subsequent merge')
    except_file = os.path.join(mylar.CACHE_DIR, 'newreleases.txt')
    if os.path.isfile(except_file):
        logger.fdebug('file exists - removing.')
        os.remove(except_file)
    else:
        logger.fdebug('file does not exist - continuing.')

    _write_pull_file(except_file, newdates, pull_list)


# When executed directly as a script, run the pull-list fetch once.
if __name__ == '__main__':
    newpull()