# mylar/mylar/parseit.py

# This file is part of Mylar.
#
# Mylar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mylar is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mylar. If not, see <http://www.gnu.org/licenses/>.
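"""Screen-scraping helpers against the Grand Comics Database (comics.org).

GCDScraper() matches a series by name / year / issue count,
GCDdetails() pulls per-issue numbers and publication dates,
GCDAdd() fetches series summaries for a list of GCD series ids,
ComChk() runs a broader, publisher-aware multi-pass search, and
decode_html() is a UnicodeDammit front-end for oddly-encoded pages.
"""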
from bs4 import BeautifulSoup, UnicodeDammit
import urllib2
import re
import helpers
import logger
import datetime
import sys
from decimal import Decimal
from HTMLParser import HTMLParseError
from time import strptime
def GCDScraper(ComicName, ComicYear, Total, ComicID, quickmatch=None):
NOWyr = datetime.date.today().year
if datetime.date.today().month == 12:
NOWyr = NOWyr + 1
logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
comicnm = ComicName
comicyr = ComicYear
comicis = Total
comicid = ComicID
#print ( "comicname: " + str(comicnm) )
#print ( "comicyear: " + str(comicyr) )
#print ( "comichave: " + str(comicis) )
#print ( "comicid: " + str(comicid) )
comicnm = re.sub(' ', '+', comicnm)
input = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31&series=' + str(comicnm) + '&is_indexed=None'
response = urllib2.urlopen ( input )
soup = BeautifulSoup ( response)
cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))
cnt = int(cnt1 + cnt2)
#print (str(cnt) + " results")
resultName = []
resultID = []
resultYear = []
resultIssues = []
resultURL = None
n_odd = -1
n_even = -1
n = 0
while ( n < cnt ):
if n%2==0:
n_even+=1
resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
else:
n_odd+=1
resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
rtp = resultp('a')[1]
resultName.append(helpers.cleanName(rtp.findNext(text=True)))
#print ( "Comic Name: " + str(resultName[n]) )
fip = resultp('a',href=True)[1]
resultID.append(fip['href'])
#print ( "ID: " + str(resultID[n]) )
subtxt3 = resultp('td')[3]
resultYear.append(subtxt3.findNext(text=True))
resultYear[n] = resultYear[n].replace(' ','')
subtxt4 = resultp('td')[4]
resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
resiss = resultIssues[n].find('issue')
resiss = int(resiss)
        resultIssues[n] = resultIssues[n][:resiss]
resultIssues[n] = resultIssues[n].replace(' ','')
#print ( "Year: " + str(resultYear[n]) )
#print ( "Issues: " + str(resultIssues[n]) )
CleanComicName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', comicnm)
CleanComicName = re.sub(' ', '', CleanComicName).lower()
CleanResultName = re.sub('[\,\.\:\;\'\[\]\(\)\!\@\#\$\%\^\&\*\-\_\+\=\?\/]', '', resultName[n])
CleanResultName = re.sub(' ', '', CleanResultName).lower()
#print ("CleanComicName: " + str(CleanComicName))
#print ("CleanResultName: " + str(CleanResultName))
if CleanResultName == CleanComicName or CleanResultName[3:] == CleanComicName:
#if resultName[n].lower() == helpers.cleanName(str(ComicName)).lower():
#print ("n:" + str(n) + "...matched by name to Mylar!")
            #this has been seen in a few instances already, so trying to adjust.
            #when the series year is 2011, gcd might list 2012 because publication
            #dates overlap between Dec/11 and Jan/12. Accept a match with a
            #one-year grace period, then pull in the first issue to see if the actual
            #pub date coincides with the other date..if so, match it.
if resultYear[n] == ComicYear or resultYear[n] == str(int(ComicYear)+1):
#print ("n:" + str(n) + "...matched by year to Mylar!")
#print ( "Year: " + str(resultYear[n]) )
                #Occasionally there are discrepancies in the issue count between
                #GCD and CV. 99% of the time it's CV not updating to the newest
                #issue as quickly as GCD does. Therefore, allow the counts to be
                #off by 1 in either direction; any more variation could cause
                #incorrect matching. ie. witchblade on GCD says 159 issues, CV states 161.
if int(resultIssues[n]) == int(Total) or int(resultIssues[n]) == int(Total)+1 or (int(resultIssues[n])+1) == int(Total):
#print ("initial issue match..continuing.")
if int(resultIssues[n]) == int(Total)+1:
issvariation = "cv"
elif int(resultIssues[n])+1 == int(Total):
issvariation = "gcd"
else:
issvariation = "no"
#print ("n:" + str(n) + "...matched by issues to Mylar!")
#print ("complete match!...proceeding")
TotalIssues = resultIssues[n]
resultURL = str(resultID[n])
rptxt = resultp('td')[6]
resultPublished = rptxt.findNext(text=True)
#print ("Series Published: " + str(resultPublished))
break
n+=1
# it's possible that comicvine would return a comic name incorrectly, or gcd
# has the wrong title and won't match 100%...
# (ie. The Flash-2011 on comicvine is Flash-2011 on gcd)
    # this section is to account for variations in spelling, punctuation, etc.
basnumbs = {'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':11,'twelve':12}
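    # e.g. a ComicVine title of 'Fantastic Four' gets retried against GCD
    # as 'fantastic 4' by the substitution below.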
if resultURL is None:
#search for number as text, and change to numeric
for numbs in basnumbs:
#print ("numbs:" + str(numbs))
if numbs in ComicName.lower():
numconv = basnumbs[numbs]
#print ("numconv: " + str(numconv))
ComicNm = re.sub(str(numbs), str(numconv), ComicName.lower())
#print ("comicname-reVISED:" + str(ComicNm))
                return GCDScraper(ComicNm, ComicYear, Total, ComicID)
if ComicName.lower().startswith('the '):
ComicName = ComicName[4:]
return GCDScraper(ComicName, ComicYear, Total, ComicID)
if ':' in ComicName:
ComicName = re.sub(':', '', ComicName)
return GCDScraper(ComicName, ComicYear, Total, ComicID)
if '-' in ComicName:
ComicName = re.sub('-', ' ', ComicName)
return GCDScraper(ComicName, ComicYear, Total, ComicID)
if 'and' in ComicName.lower():
ComicName = ComicName.replace('and', '&')
return GCDScraper(ComicName, ComicYear, Total, ComicID)
if not quickmatch: return 'No Match'
#vari_loop = 0
if quickmatch == "yes":
if resultURL is None: return 'No Match'
else: return 'Match'
return GCDdetails(comseries=None, resultURL=resultURL, vari_loop=0, ComicID=ComicID, TotalIssues=TotalIssues, issvariation=issvariation, resultPublished=resultPublished)
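# A minimal usage sketch (hypothetical values - ComicID is the ComicVine id,
# Total the ComicVine issue count):
#
#   match = GCDScraper('Invincible', '2003', 90, '1234', quickmatch="yes")
#   # returns 'Match'/'No Match' when quickmatch is set; otherwise it
#   # returns the gcdinfo dict built by GCDdetails() below.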
def GCDdetails(comseries, resultURL, vari_loop, ComicID, TotalIssues, issvariation, resultPublished):
gcdinfo = {}
gcdchoice = []
gcount = 0
i = 0
# datemonth = {'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10,'eleven':$
# #search for number as text, and change to numeric
# for numbs in basnumbs:
# #print ("numbs:" + str(numbs))
# if numbs in ComicName.lower():
# numconv = basnumbs[numbs]
# #print ("numconv: " + str(numconv))
if vari_loop > 1:
resultPublished = "Unknown"
if vari_loop == 99: vari_loop = 1
while (i <= vari_loop):
if vari_loop > 0:
try:
boong = comseries['comseries'][i]
except IndexError:
break
resultURL = boong['comseriesID']
ComicID = boong['comicid']
TotalIssues+= int(boong['comseriesIssues'])
else:
            pass  # resultURL was already set by the name match in GCDScraper
# if we're here - it means it's a mismatched name.
# let's pull down the publication date as it'll be blank otherwise
inputMIS = 'http://www.comics.org' + str(resultURL)
try:
soup = BeautifulSoup(urllib2.urlopen(inputMIS))
except UnicodeDecodeError:
logger.info("I've detected your system is using: " + sys.stdout.encoding)
logger.info("unable to parse properly due to utf-8 problem, ignoring wrong symbols")
try:
soup = BeautifulSoup(urllib2.urlopen(inputMIS)).decode('utf-8', 'ignore')
except UnicodeDecodeError:
logger.info("not working...aborting. Tell Evilhero.")
return
parsed = soup.find("div", {"id" : "series_data"})
subtxt3 = parsed.find("dd", {"id" : "publication_dates"})
resultPublished = subtxt3.findNext(text=True).rstrip()
#print ("pubdate:" + str(resultPublished))
subtxt9 = parsed.find("dd", {"id" : "series_format"})
resultFormat = subtxt9.findNext(text=True).rstrip()
        # the caveat - if a series is ongoing but has only 1 issue published at a
        # given point in time, resultPublished will return just the date and not
        # the word 'Present', which dictates on the main page whether a series is
        # Continuing / Ended.
if 'ongoing series' in resultFormat.lower() and 'was' not in resultFormat.lower():
resultPublished = resultPublished + " - Present"
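        # e.g. a one-issue ongoing series published 'January 2012' is shown
        # as 'January 2012 - Present'.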
coverst = soup.find("div", {"id" : "series_cover"})
        if coverst is None:
gcdcover = "None"
else:
subcoverst = coverst('img',src=True)[0]
gcdcover = subcoverst['src']
#print ("resultURL:" + str(resultURL))
#print ("comicID:" + str(ComicID))
input2 = 'http://www.comics.org' + str(resultURL) + 'details/'
resp = urllib2.urlopen(input2)
soup = BeautifulSoup(resp)
        #for newer comics, the on-sale date has the complete date...
        #for older comics, the pub. date is to be used
        # datecheck = soup.find(text=' On-sale date ')
        datecheck = soup.find(text=' Pub. Date ')
        if datecheck:
            #print ("pub. date detected....using it")
            datetype = "pub"
        else:
            #print ("no pub. date found....defaulting to on-sale date")
            datetype = "on-sale"
cnt1 = len(soup.findAll("tr", {"class" : "row_even_False"}))
cnt2 = len(soup.findAll("tr", {"class" : "row_even_True"}))
cnt = int(cnt1 + cnt2)
        #print (str(cnt) + " Issues in Total (this may be wrong due to alternate prints, etc.)")
n_odd = -1
n_even = -1
n = 0
PI = "1.00"
altcount = 0
PrevYRMO = "0000-00"
while ( n < cnt ):
if n%2==0:
n_odd+=1
parsed = soup.findAll("tr", {"class" : "row_even_False"})[n_odd]
ntype = "odd"
else:
n_even+=1
ntype = "even"
parsed = soup.findAll("tr", {"class" : "row_even_True"})[n_even]
subtxt3 = parsed.find("a")
ParseIssue = subtxt3.findNext(text=True)
fid = parsed('a',href=True)[0]
resultGID = fid['href']
resultID = resultGID[7:-1]
if ',' in ParseIssue: ParseIssue = re.sub("\,", "", ParseIssue)
variant="no"
if 'Vol' in ParseIssue or '[' in ParseIssue or 'a' in ParseIssue or 'b' in ParseIssue or 'c' in ParseIssue:
m = re.findall('[^\[\]]+', ParseIssue)
# ^^ takes care of []
ParseIssue = re.sub("[^0-9]", " ", m[0])
# ^^ removes everything but the digits from the remaining non-brackets
#logger.fdebug("variant cover detected : " + str(ParseIssue))
variant="yes"
altcount = 1
isslen = ParseIssue.find(' ')
if isslen < 0:
#logger.fdebug("just digits left..using " + str(ParseIssue))
                isslen = 0
isschk = ParseIssue
else:
#logger.fdebug("more than digits left - first space detected at position : " + str(isslen))
#if 'isslen' exists, it means that it's an alternative cover.
#however, if ONLY alternate covers exist of an issue it won't work.
#let's use the FIRST record, and ignore all other covers for the given issue.
isschk = ParseIssue[:isslen]
#logger.fdebug("Parsed Issue#: " + str(isschk))
ParseIssue = re.sub("\s", "", ParseIssue)
#check if decimal or '1/2' exists or not, and store decimal results
halfchk = "no"
            if '.' in isschk:
                #the issue number already carries its decimal (e.g. '1.50'),
                #so there is nothing extra to append later.
                isschk_decval = ""
elif '/' in isschk:
ParseIssue = "0.50"
isslen = 0
halfchk = "yes"
else:
isschk_decval = ".00"
if variant == "yes":
#logger.fdebug("alternate cover detected - skipping/ignoring.")
altcount = 1
            # in order to get the compare right, let's decimalize the string to '.00'.
            if halfchk != "yes":
                ParseIssue = ParseIssue + isschk_decval
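            # e.g. a plain '5' compares as '5.00', while a '1/2' issue compares as '0.50'.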
if not any(d.get('GCDIssue', None) == str(ParseIssue) for d in gcdchoice):
#logger.fdebug("preparing to add issue to db : " + str(ParseIssue))
gcdinfo['ComicIssue'] = ParseIssue
#--- let's use pubdate.
#try publicationd date first
subtxt1 = parsed('td')[1]
ParseDate = subtxt1.findNext(text=True)
basmonths = {'january':'01','february':'02','march':'03','april':'04','may':'05','june':'06','july':'07','august':'08','september':'09','october':'10','november':'11','december':'12'}
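                # e.g. a GCD publication date of 'March 2011' becomes '2011-03'.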
pdlen = len(ParseDate)
pdfind = ParseDate.find(' ',2)
#logger.fdebug("length: " + str(pdlen) + "....first space @ pos " + str(pdfind))
#logger.fdebug("this should be the year: " + str(ParseDate[pdfind+1:pdlen-1]))
if ParseDate[pdfind+1:pdlen-1].isdigit():
#assume valid date.
#search for number as text, and change to numeric
for numbs in basmonths:
if numbs in ParseDate.lower():
pconv = basmonths[numbs]
                            ParseYear = re.sub('\s','',ParseDate[-5:])
ParseDate = str(ParseYear) + "-" + str(pconv)
#logger.fdebug("!success - Publication date: " + str(ParseDate))
break
else:
# #try key date
# subtxt1 = parsed('td')[2]
# ParseDate = subtxt1.findNext(text=True)
# #logger.fdebug("no pub.date detected, attempting to use on-sale date: " + str(ParseDate))
# if (ParseDate) < 7:
# #logger.fdebug("Invalid on-sale date - less than 7 characters. Trying Key date")
# subtxt3 = parsed('td')[0]
# ParseDate = subtxt3.findNext(text=True)
# if ParseDate == ' ':
#increment previous month by one and throw it in until it's populated properly.
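                    # e.g. a previous issue stamped '2011-12' makes this one '2012-01'.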
if PrevYRMO == '0000-00':
ParseDate = '0000-00'
else:
PrevYR = str(PrevYRMO)[:4]
PrevMO = str(PrevYRMO)[5:]
#let's increment the month now (if it's 12th month, up the year and hit Jan.)
if int(PrevMO) == 12:
PrevYR = int(PrevYR) + 1
PrevMO = 1
else:
PrevMO = int(PrevMO) + 1
if int(PrevMO) < 10:
PrevMO = "0" + str(PrevMO)
ParseDate = str(PrevYR) + "-" + str(PrevMO)
ParseDate = ParseDate.replace(' ','')
PrevYRMO = ParseDate
gcdinfo['ComicDate'] = ParseDate
#^^ will retrieve date #
#logger.fdebug("adding: " + str(gcdinfo['ComicIssue']))
if ComicID[:1] == "G":
gcdchoice.append({
'GCDid': ComicID,
'IssueID': resultID,
'GCDIssue': gcdinfo['ComicIssue'],
'GCDDate': gcdinfo['ComicDate']
})
gcount+=1
else:
gcdchoice.append({
'GCDid': ComicID,
'GCDIssue': gcdinfo['ComicIssue'],
'GCDDate': gcdinfo['ComicDate']
})
gcdinfo['gcdchoice'] = gcdchoice
altcount = 0
n+=1
# ---redundant---
# else:
# #--if 2 identical issue numbers legitimately exist, but have different
# #--publication dates, try to distinguish
# logger.fdebug("2 identical issue #'s have been found...determining if it's intentional.")
# #get current issue & publication date.
# logger.fdebug("Issue #:" + str(ParseIssue))
# logger.fdebug("IssueDate: " + str(gcdinfo['ComicDate']))
# #get conflicting issue from tuple
# for d in gcdchoice:
# if str(d['GCDIssue']) == str(gcdinfo['ComicIssue']):
# logger.fdebug("Issue # already in tuple - checking IssueDate:" + str(d['GCDDate']) )
# if str(d['GCDDate']) == str(gcdinfo['ComicDate']):
# logger.fdebug("Issue #'s and dates match...skipping.")
# else:
# logger.fdebug("Issue#'s match but different publication dates, not skipping.")
#pass
#logger.fdebug("Duplicate issue detected in DB - ignoring subsequent issue # " + str(gcdinfo['ComicIssue']))
i+=1
gcdinfo['gcdvariation'] = issvariation
if ComicID[:1] == "G":
gcdinfo['totalissues'] = gcount
else:
gcdinfo['totalissues'] = TotalIssues
gcdinfo['ComicImage'] = gcdcover
gcdinfo['resultPublished'] = resultPublished
return gcdinfo
## -- end (GCD) -- ##
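# The gcdinfo dict returned above carries (per the assignments in GCDdetails):
#   gcdinfo['gcdchoice']    - list of {'GCDid','GCDIssue','GCDDate'} entries
#                             (plus 'IssueID' when ComicID starts with 'G')
#   gcdinfo['gcdvariation'] - 'cv' / 'gcd' / 'no' issue-count variance flag
#   gcdinfo['totalissues'], gcdinfo['ComicImage'], gcdinfo['resultPublished']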
def GCDAdd(gcdcomicid):
serieschoice = []
series = {}
logger.fdebug("I'm trying to find these GCD comicid's:" + str(gcdcomicid))
for gcdid in gcdcomicid:
logger.fdebug("looking at gcdid:" + str(gcdid))
input2 = 'http://www.comics.org/series/' + str(gcdid)
logger.fdebug("---url: " + str(input2))
resp = urllib2.urlopen ( input2 )
soup = BeautifulSoup ( resp )
logger.fdebug("SeriesName section...")
parsen = soup.find("span", {"id" : "series_name"})
#logger.fdebug("series name (UNPARSED): " + str(parsen))
subpar = parsen('a')[0]
resultName = subpar.findNext(text=True)
logger.fdebug("ComicName: " + str(resultName))
#covers-start
logger.fdebug("Covers section...")
coverst = soup.find("div", {"id" : "series_cover"})
        if coverst is None:
gcdcover = "None"
logger.fdebug("unable to find any covers - setting to None")
else:
subcoverst = coverst('img',src=True)[0]
#logger.fdebug("cover (UNPARSED) : " + str(subcoverst))
gcdcover = subcoverst['src']
logger.fdebug("Cover: " + str(gcdcover))
#covers end
#publisher start
logger.fdebug("Publisher section...")
try:
pubst = soup.find("div", {"class" : "item_data"})
catchit = pubst('a')[0]
except (IndexError, TypeError):
pubst = soup.findAll("div", {"class" : "left"})[1]
catchit = pubst.find("a")
publisher = catchit.findNext(text=True)
logger.fdebug("Publisher: " + str(publisher))
#publisher end
parsed = soup.find("div", {"id" : "series_data"})
#logger.fdebug("series_data: " + str(parsed))
#print ("parse:" + str(parsed))
subtxt3 = parsed.find("dd", {"id" : "publication_dates"})
#logger.fdebug("publication_dates: " + str(subtxt3))
pubdate = subtxt3.findNext(text=True).rstrip()
logger.fdebug("pubdate:" + str(pubdate))
subtxt4 = parsed.find("dd", {"id" : "issues_published"})
noiss = subtxt4.findNext(text=True)
lenwho = len(noiss)
lent = noiss.find(' ',2)
lenf = noiss.find('(')
stringit = noiss[lenf:lenwho]
stringout = noiss[:lent]
noissues = stringout.rstrip(' \t\r\n\0')
numbering = stringit.rstrip(' \t\r\n\0')
logger.fdebug("noissues:" + str(noissues))
logger.fdebug("numbering:" + str(numbering))
serieschoice.append({
"ComicID": gcdid,
"ComicName": resultName,
"ComicYear" : pubdate,
"ComicIssues" : noissues,
"ComicPublisher" : publisher,
"ComicCover" : gcdcover
})
series['serieschoice'] = serieschoice
return series
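# GCDAdd() returns {'serieschoice': [...]}; each entry pairs a GCD series id
# with the name, publication dates, issue count, publisher and cover scraped
# above - e.g. GCDAdd(['12345']) for a single (hypothetical) series id.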
def ComChk(ComicName, ComicYear, ComicPublisher, Total, ComicID):
comchkchoice = []
comchoice = {}
NOWyr = datetime.date.today().year
if datetime.date.today().month == 12:
NOWyr = NOWyr + 1
logger.fdebug("We're in December, incremented search Year to increase search results: " + str(NOWyr))
comicnm = ComicName
comicyr = ComicYear
comicis = Total
comicid = ComicID
comicpub = ComicPublisher
print ("...comchk parser initialization...")
print ( "comicname: " + str(comicnm) )
print ( "comicyear: " + str(comicyr) )
print ( "comichave: " + str(comicis) )
print ( "comicpub: " + str(comicpub) )
print ( "comicid: " + str(comicid) )
# do 3 runs at the comics.org search to get the best results
comicrun = []
# &pub_name=DC
# have to remove the spaces from Publisher or else will not work (ie. DC Comics vs DC will not match)
# take the 1st word ;)
#comicpub = comicpub.split()[0]
# if it's not one of the BIG publisher's it might fail - so let's increase the odds.
pubbiggies = [ 'DC',
'Marvel',
'Image',
'IDW' ]
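    # e.g. a ComicVine publisher of 'DC Comics' contains 'DC', so its first
    # word ('DC') is kept in the search URL via conv_pub below.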
uhuh = "no"
for pb in pubbiggies:
if pb in comicpub:
#keep publisher in url if a biggie.
uhuh = "yes"
print (" publisher match : " + str(comicpub))
conv_pub = comicpub.split()[0]
print (" converted publisher to : " + str(conv_pub))
#1st run setup - leave it all as it is.
comicrun.append(comicnm)
cruncnt = 0
#2nd run setup - remove the last character and do a broad search (keep year or else will blow up)
if len(str(comicnm).split()) > 2:
comicrun.append(' '.join(comicnm.split(' ')[:-1]))
cruncnt+=1
    # to increase the likelihood of matches and to get a broader scope...
# lets remove extra characters
if re.sub('[\.\,\:]', '', comicnm) != comicnm:
comicrun.append(re.sub('[\.\,\:]', '', comicnm))
cruncnt+=1
totalcount = 0
cr = 0
print ("cruncnt is " + str(cruncnt))
while (cr <= cruncnt):
print ("cr is " + str(cr))
comicnm = comicrun[cr]
#leaving spaces in will screw up the search...let's take care of it
comicnm = re.sub(' ', '+', comicnm)
print ("comicnm: " + str(comicnm))
if uhuh == "yes":
publink = "&pub_name=" + str(conv_pub)
if uhuh == "no":
publink = "&pub_name="
input = 'http://www.comics.org/search/advanced/process/?target=series&method=icontains&logic=False&keywords=&order1=series&order2=date&order3=&start_date=' + str(comicyr) + '-01-01&end_date=' + str(NOWyr) + '-12-31' + '&title=&feature=&job_number=&pages=&script=&pencils=&inks=&colors=&letters=&story_editing=&genre=&characters=&synopsis=&reprint_notes=&story_reprinted=None&notes=' + str(publink) + '&pub_notes=&brand=&brand_notes=&indicia_publisher=&is_surrogate=None&ind_pub_notes=&series=' + str(comicnm) + '&series_year_began=&series_notes=&tracking_notes=&issue_count=&is_comics=None&format=&color=&dimensions=&paper_stock=&binding=&publishing_format=&issues=&volume=&issue_title=&variant_name=&issue_date=&indicia_frequency=&price=&issue_pages=&issue_editing=&isbn=&barcode=&issue_notes=&issue_reprinted=None&is_indexed=None'
response = urllib2.urlopen ( input )
soup = BeautifulSoup ( response)
cnt1 = len(soup.findAll("tr", {"class" : "listing_even"}))
cnt2 = len(soup.findAll("tr", {"class" : "listing_odd"}))
cnt = int(cnt1 + cnt2)
# print ("cnt1: " + str(cnt1))
# print ("cnt2: " + str(cnt2))
# print (str(cnt) + " results")
resultName = []
resultID = []
resultYear = []
resultIssues = []
resultPublisher = []
resultURL = None
n_odd = -1
n_even = -1
n = 0
while ( n < cnt ):
if n%2==0:
n_even+=1
resultp = soup.findAll("tr", {"class" : "listing_even"})[n_even]
else:
n_odd+=1
resultp = soup.findAll("tr", {"class" : "listing_odd"})[n_odd]
rtp = resultp('a')[1]
resultName.append(helpers.cleanName(rtp.findNext(text=True)))
# print ( "Comic Name: " + str(resultName[n]) )
pub = resultp('a')[0]
resultPublisher.append(pub.findNext(text=True))
# print ( "Publisher: " + str(resultPublisher[n]) )
fip = resultp('a',href=True)[1]
resultID.append(fip['href'])
# print ( "ID: " + str(resultID[n]) )
subtxt3 = resultp('td')[3]
resultYear.append(subtxt3.findNext(text=True))
resultYear[n] = resultYear[n].replace(' ','')
subtxt4 = resultp('td')[4]
resultIssues.append(helpers.cleanName(subtxt4.findNext(text=True)))
resiss = resultIssues[n].find('issue')
resiss = int(resiss)
            resultIssues[n] = resultIssues[n][:resiss]
resultIssues[n] = resultIssues[n].replace(' ','')
# print ( "Year: " + str(resultYear[n]) )
# print ( "Issues: " + str(resultIssues[n]) )
# print ("comchkchoice: " + str(comchkchoice))
if not any(d.get('GCDID', None) == str(resultID[n]) for d in comchkchoice):
#print ( str(resultID[n]) + " not in DB...adding.")
comchkchoice.append({
"ComicID": str(comicid),
"ComicName": str(resultName[n]),
"GCDID": str(resultID[n]).split('/')[2],
"ComicYear" : str(resultYear[n]),
"ComicPublisher" : str(resultPublisher[n]),
"ComicURL" : "http://www.comics.org" + str(resultID[n]),
"ComicIssues" : str(resultIssues[n])
})
#else:
#print ( str(resultID[n]) + " already in DB...skipping" )
n+=1
cr+=1
totalcount= totalcount + cnt
comchoice['comchkchoice'] = comchkchoice
return comchoice, totalcount
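# ComChk() returns ({'comchkchoice': [...]}, totalcount); the dedupe on
# 'GCDID' above keeps one entry per GCD series across all search passes.
# A hypothetical call: choices, totalcnt = ComChk('Batman', '2011', 'DC Comics', 12, '1234')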
def decode_html(html_string):
    # bs4's UnicodeDammit uses is_html / unicode_markup / tried_encodings
    converted = UnicodeDammit(html_string, is_html=True)
    if not converted.unicode_markup:
        raise UnicodeError(
            "Failed to detect encoding, tried [%s]" %
            ', '.join(str(enc) for enc in converted.tried_encodings))
    # print converted.original_encoding
    return converted.unicode_markup
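# A minimal usage sketch for decode_html (hypothetical series URL):
#
#   html = urllib2.urlopen('http://www.comics.org/series/12345/').read()
#   soup = BeautifulSoup(decode_html(html))
#
# UnicodeDammit sniffs the page encoding up front, which sidesteps the
# UnicodeDecodeError fallback dance used in GCDdetails() above.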