FIX: Fixed search query to be more accurate on multi-term queries

This commit is contained in:
evilhero 2017-11-15 17:49:57 -05:00
parent 95b39ca1ed
commit dccbdcdba8
2 changed files with 21 additions and 34 deletions

View File

@ -44,20 +44,17 @@ if platform.python_version() == '2.7.6':
httplib.HTTPConnection._http_vsn = 10 httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0' httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'
def pullsearch(comicapi, comicquery, offset, type, annuals=False): def pullsearch(comicapi, comicquery, offset, type):
u_comicquery = urllib.quote(comicquery.encode('utf-8').strip())
u_comicquery = u_comicquery.replace(" ", "%20")
u_comicquery = u_comicquery.replace('-', '%2D')
#logger.info('comicquery: %s' % comicquery)
if annuals is True:
PULLURL = mylar.CVURL + 'search?api_key=' + str(comicapi) + '&resources=' + str(type) + '&query=' + u_comicquery + '&field_list=id,name,start_year,first_issue,site_detail_url,count_of_issues,image,publisher,deck,description,last_issue&format=xml&limit=100&page=' + str(offset)
else: cnt = 1
# 02/22/2014 use the volume filter label to get the right results. for x in comicquery:
# add the 's' to the end of type to pluralize the caption (it's needed) if cnt == 1:
if type == 'story_arc': filterline = '%s' % x
u_comicquery = re.sub("%20AND%20", "%20", u_comicquery) else:
PULLURL = mylar.CVURL + str(type) + 's?api_key=' + str(comicapi) + '&filter=name:' + u_comicquery + '&field_list=id,name,start_year,site_detail_url,count_of_issues,image,publisher,deck,description,first_issue,last_issue&format=xml&offset=' + str(offset) # 2012/22/02 - CVAPI flipped back to offset instead of page filterline+= ',name:%s' % x
cnt+=1
PULLURL = mylar.CVURL + str(type) + 's?api_key=' + str(comicapi) + '&filter=name:' + filterline + '&field_list=id,name,start_year,site_detail_url,count_of_issues,image,publisher,deck,description,first_issue,last_issue&format=xml&offset=' + str(offset) # 2012/22/02 - CVAPI flipped back to offset instead of page
#all these imports are standard on most modern python implementations #all these imports are standard on most modern python implementations
#logger.info('MB.PULLURL:' + PULLURL) #logger.info('MB.PULLURL:' + PULLURL)
@ -88,18 +85,14 @@ def findComic(name, mode, issue, limityear=None, type=None):
comiclist = [] comiclist = []
arcinfolist = [] arcinfolist = []
#if type == 'story_arc': commons = [' and ', ' the ']
# chars = set('!?*&') for x in commons:
#else: if x in name.lower():
# chars = set('!?*&-') name = re.sub(x, ' ', name.lower()).strip()
#if any((c in chars) for c in name) or 'annual' in name:
# name = '"' +name +'"' pattern = re.compile(ur'\w+', re.UNICODE)
annuals = False name = pattern.findall(name)
if 'annual' in name:
name = '"' + name +'"'
annuals = True
#print ("limityear: " + str(limityear))
if limityear is None: limityear = 'None' if limityear is None: limityear = 'None'
comicquery = name comicquery = name
@ -114,7 +107,7 @@ def findComic(name, mode, issue, limityear=None, type=None):
type = 'volume' type = 'volume'
#let's find out how many results we get from the query... #let's find out how many results we get from the query...
searched = pullsearch(comicapi, comicquery, 0, type, annuals) searched = pullsearch(comicapi, comicquery, 0, type)
if searched is None: if searched is None:
return False return False
totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText totalResults = searched.getElementsByTagName('number_of_total_results')[0].firstChild.wholeText
@ -128,15 +121,9 @@ def findComic(name, mode, issue, limityear=None, type=None):
while (countResults < int(totalResults)): while (countResults < int(totalResults)):
#logger.fdebug("querying " + str(countResults)) #logger.fdebug("querying " + str(countResults))
if countResults > 0: if countResults > 0:
#2012/22/02 - CV API flipped back to offset usage instead of page offsetcount = countResults
if annuals is True:
# search uses page for offset
offsetcount = (countResults /100) + 1
else:
# filter uses offset
offsetcount = countResults
searched = pullsearch(comicapi, comicquery, offsetcount, type, annuals) searched = pullsearch(comicapi, comicquery, offsetcount, type)
comicResults = searched.getElementsByTagName(type) comicResults = searched.getElementsByTagName(type)
body = '' body = ''
n = 0 n = 0

View File

@ -1404,7 +1404,7 @@ def future_check():
if not theissdate.startswith('20'): if not theissdate.startswith('20'):
theissdate = ser['IssueDate'][:4] theissdate = ser['IssueDate'][:4]
logger.info('looking for new data for ' + ser['ComicName'] + '[#' + str(ser['IssueNumber']) + '] (' + str(theissdate) + ')') logger.info('looking for new data for ' + ser['ComicName'] + '[#' + str(ser['IssueNumber']) + '] (' + str(theissdate) + ')')
searchresults, explicit = mb.findComic(ser['ComicName'], mode='pullseries', issue=ser['IssueNumber'], limityear=theissdate, explicit='all') searchresults = mb.findComic(ser['ComicName'], mode='pullseries', issue=ser['IssueNumber'], limityear=theissdate)
if len(searchresults) > 0: if len(searchresults) > 0:
if len(searchresults) > 1: if len(searchresults) > 1:
logger.info('More than one result returned - this may have to be a manual add, but I\'m going to try to figure it out myself first.') logger.info('More than one result returned - this may have to be a manual add, but I\'m going to try to figure it out myself first.')