FIX:(#897) Improvements for Experimental search and better rss handling/searching for it, IMP: restricting filetypes to cbr/cbz, as well as file-sizes, should now be working for Experimental / CBT / KAT, FIX:(#885) referencing issue_firstword errors should now be fixed and working again, IMP: Attempt at better handling of filenames that have issue titles within them (searching only)

This commit is contained in:
evilhero 2014-12-01 12:23:49 -05:00
parent eeae5e77ea
commit 1ad1d7e927
3 changed files with 364 additions and 115 deletions

View File

@ -31,6 +31,9 @@ def Startit(searchName, searchIssue, searchYear, ComicVersion, IssDateFix):
searchIsOne = "0"+searchIssue
searchIsTwo = "00"+searchIssue
if mylar.PREFERRED_QUALITY == 1: joinSearch = joinSearch + " .cbr"
elif mylar.PREFERRED_QUALITY == 2: joinSearch = joinSearch + " .cbz"
if "-" in searchName:
searchName = searchName.replace("-", '((\\s)?[-:])?(\\s)?')
@ -105,11 +108,12 @@ def Startit(searchName, searchIssue, searchYear, ComicVersion, IssDateFix):
logger.fdebug("titlesplit: " + str(title.split("\"")))
splitTitle = title.split("\"")
noYear = 'False'
_digits = re.compile('\d')
for subs in splitTitle:
logger.fdebug('sub:' + subs)
regExCount = 0
if len(subs) >= len(cName) and not any(d in subs.lower() for d in except_list):
if len(subs) >= len(cName) and not any(d in subs.lower() for d in except_list) and bool(_digits.search(subs)) is True:
#Looping through dictionary to run each regEx - length + regex is determined by regexList up top.
# while regExCount < len(regexList):
# regExTest = re.findall(regexList[regExCount], subs, flags=re.IGNORECASE)
@ -120,6 +124,14 @@ def Startit(searchName, searchIssue, searchYear, ComicVersion, IssDateFix):
# 'title': subs,
# 'link': str(link)
# })
# this will still match on crap like 'For SomeSomayes' especially if the series length < 'For SomeSomayes'
if subs.startswith('for').lower():
if cName.startswith('for').lower():
pass
else:
#this is the crap we ignore. Continue
logger.fdebug('this starts with FOR : ' + str(subs) + '. This is not present in the series - ignoring.')
continue
logger.fdebug('match.')
if IssDateFix != "no":
if IssDateFix == "01" or IssDateFix == "02": ComicYearFix = str(int(searchYear) - 1)

View File

@ -474,7 +474,15 @@ def torrentdbsearch(seriesname,issue,comicid=None,nzbprov=None):
tsearch_rem2 = re.sub("\\bthe\\b", "%", tsearch_rem1.lower())
tsearch_removed = re.sub('\s+', ' ', tsearch_rem2)
tsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s]', '%',tsearch_removed)
tsearch = tsearch_seriesname + "%"
if mylar.PREFERRED_QUALITY == 0:
tsearch = tsearch_seriesname + "%"
elif mylar.PREFERRED_QUALITY == 1:
tsearch = tsearch_seriesname + "%cbr%"
elif mylar.PREFERRED_QUALITY == 2:
tsearch = tsearch_seriesname + "%cbz%"
else:
tsearch = tsearch_seriesname + "%"
logger.fdebug('tsearch : ' + tsearch)
AS_Alt = []
tresults = []
@ -508,7 +516,14 @@ def torrentdbsearch(seriesname,issue,comicid=None,nzbprov=None):
if AS_formatrem_seriesname[:1] == ' ': AS_formatrem_seriesname = AS_formatrem_seriesname[1:]
AS_Alt.append(AS_formatrem_seriesname)
AS_Alternate += '%'
if mylar.PREFERRED_QUALITY == 0:
AS_Alternate += "%"
elif mylar.PREFERRED_QUALITY == 1:
AS_Alternate += "%cbr%"
elif mylar.PREFERRED_QUALITY == 2:
AS_Alternate += "%cbz%"
else:
AS_Alternate += "%"
if mylar.ENABLE_CBT:
#print "AS_Alternate:" + str(AS_Alternate)
@ -526,6 +541,17 @@ def torrentdbsearch(seriesname,issue,comicid=None,nzbprov=None):
for tor in tresults:
torsplit = tor['Title'].split('/')
if mylar.PREFERRED_QUALITY == 1:
if 'cbr' in tor['Title']:
logger.fdebug('Quality restriction enforced [ cbr only ]. Accepting result.')
else:
logger.fdebug('Quality restriction enforced [ cbr only ]. Rejecting result.')
elif mylar.PREFERRED_QUALITY == 2:
if 'cbz' in tor['Title']:
logger.fdebug('Quality restriction enforced [ cbz only ]. Accepting result.')
else:
logger.fdebug('Quality restriction enforced [ cbz only ]. Rejecting result.')
logger.fdebug('tor-Title: ' + tor['Title'])
logger.fdebug('there are ' + str(len(torsplit)) + ' sections in this title')
i=0
@ -534,6 +560,7 @@ def torrentdbsearch(seriesname,issue,comicid=None,nzbprov=None):
logger.fdebug('this is a result from ' + str(tor['Site']) + ', not the site I am looking for of ' + str(nzbprov))
continue
#0 holds the title/issue and format-type.
ext_check = True # extension checker to enforce cbr/cbz filetype restrictions.
while (i < len(torsplit)):
#we'll rebuild the string here so that it's formatted accordingly to be passed back to the parser.
logger.fdebug('section(' + str(i) + '): ' + torsplit[i])
@ -549,6 +576,8 @@ def torrentdbsearch(seriesname,issue,comicid=None,nzbprov=None):
rebuiltline = rebuiltline + ' (' + titletemp + ')'
i+=1
if ext_check == False:
continue
logger.fdebug('rebuiltline is :' + rebuiltline)
seriesname_mod = seriesname
@ -656,10 +685,11 @@ def nzbdbsearch(seriesname,issue,comicid=None,nzbprov=None,searchYear=None,Comic
seriesname = snm['ComicName']
seriesname_alt = snm['AlternateSearch']
nsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\;\/\\=\?\.\-\s]', '%',seriesname)
formatrem_seriesname = re.sub('[\'\!\@\#\$\%\:\;\/\\=\?\.]', '',seriesname)
nsearch = '%' + nsearch_seriesname + "%"
nresults = myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site=?", [nsearch,nzbprov])
if nresults is None:
logger.fdebug('nzb search returned no results for ' + seriesname)
@ -672,6 +702,7 @@ def nzbdbsearch(seriesname,issue,comicid=None,nzbprov=None,searchYear=None,Comic
AS_Alternate = AlternateSearch
for calt in chkthealt:
AS_Alternate = re.sub('##','',calt)
AS_Alternate = '%' + AS_Alternate + "%"
nresults += myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site=?", [AS_Alternate,nzbprov])
if nresults is None:
logger.fdebug('nzb alternate name search returned no results.')
@ -692,15 +723,28 @@ def nzbdbsearch(seriesname,issue,comicid=None,nzbprov=None,searchYear=None,Comic
else:
ComVersChk = 0
filetype = None
if mylar.PREFERRED_QUALITY == 1: filetype = 'cbr'
elif mylar.PREFERRED_QUALITY == 2: filetype = 'cbz'
for results in nresults:
title = results['Title']
#logger.fdebug("titlesplit: " + str(title.split("\"")))
splitTitle = title.split("\"")
noYear = 'False'
_digits = re.compile('\d')
for subs in splitTitle:
#logger.fdebug(subs)
if len(subs) > 10 and not any(d in subs.lower() for d in except_list):
if len(subs) >= len(seriesname) and not any(d in subs.lower() for d in except_list) and bool(_digits.search(subs)) is True:
if subs.lower().startswith('for'):
# need to filter down alternate names in here at some point...
if seriesname.lower().startswith('for'):
pass
else:
#this is the crap we ignore. Continue
logger.fdebug('this starts with FOR : ' + str(subs) + '. This is not present in the series - ignoring.')
continue
if ComVersChk == 0:
noYear = 'False'
@ -716,6 +760,10 @@ def nzbdbsearch(seriesname,issue,comicid=None,nzbprov=None,searchYear=None,Comic
if noYear == 'False':
if filetype is not None:
if filetype not in subs.lower():
continue
nzbtheinfo.append({
'title': subs,
'link': re.sub('\/release\/', '/download/', results['Link']),

View File

@ -34,9 +34,11 @@ import datetime
from wsgiref.handlers import format_date_time
def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, IssueID, AlternateSearch=None, UseFuzzy=None, ComicVersion=None, SARC=None, IssueArcID=None, mode=None, rsscheck=None, ComicID=None, manualsearch=None, filesafe=None):
unaltered_ComicName = None
if filesafe:
if filesafe != ComicName and mode != 'want_ann':
logger.info('[SEARCH] altering ComicName to search-safe Name : ' + filesafe)
unaltered_ComicName = ComicName
ComicName = filesafe
if ComicYear == None: ComicYear = '2014'
else: ComicYear = str(ComicYear)[:4]
@ -196,7 +198,7 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
searchprov = prov_order[prov_count].lower()
if searchmode == 'rss':
findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle)
findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName)
if findit == 'yes':
logger.fdebug("findit = found!")
break
@ -209,13 +211,13 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
for calt in chkthealt:
AS_Alternate = re.sub('##','',calt)
logger.info(u"Alternate Search pattern detected...re-adjusting to : " + str(AS_Alternate) + " " + str(ComicYear))
findit = NZB_SEARCH(AS_Alternate, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle)
findit = NZB_SEARCH(AS_Alternate, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=AS_Alternate)
if findit == 'yes':
break
if findit == 'yes': break
else:
findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, ComicID=ComicID, issuetitle=issuetitle)
findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName)
if findit == 'yes':
logger.fdebug("findit = found!")
break
@ -228,7 +230,7 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
for calt in chkthealt:
AS_Alternate = re.sub('##','',calt)
logger.info(u"Alternate Search pattern detected...re-adjusting to : " + str(AS_Alternate) + " " + str(ComicYear))
findit = NZB_SEARCH(AS_Alternate, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, ComicID=ComicID, issuetitle=issuetitle)
findit = NZB_SEARCH(AS_Alternate, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName)
if findit == 'yes':
break
if findit == 'yes': break
@ -250,7 +252,7 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
return findit, 'None'
def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, nzbprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host=None, ComicVersion=None, SARC=None, IssueArcID=None, RSS=None, ComicID=None, issuetitle=None):
def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, nzbprov, prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host=None, ComicVersion=None, SARC=None, IssueArcID=None, RSS=None, ComicID=None, issuetitle=None, unaltered_ComicName=None):
if nzbprov == 'nzb.su':
apikey = mylar.NZBSU_APIKEY
@ -287,6 +289,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
#myDB = db.DBConnection()
#nodown = myDB.action('SELECT * FROM nzblog')
#this will completely render the api search results empty. Needs to get fixed.
if mylar.PREFERRED_QUALITY == 0: filetype = ""
elif mylar.PREFERRED_QUALITY == 1: filetype = ".cbr"
elif mylar.PREFERRED_QUALITY == 2: filetype = ".cbz"
@ -414,13 +417,13 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
isssearch = str(c_number) + "%20" + str(c_alpha)
if cmloopit == 3:
comsearch = comsrc + "%2000" + str(isssearch) + "%20" + str(filetype)
comsearch = comsrc + "%2000" + str(isssearch) #+ "%20" + str(filetype)
issdig = '00'
elif cmloopit == 2:
comsearch = comsrc + "%200" + str(isssearch) + "%20" + str(filetype)
comsearch = comsrc + "%200" + str(isssearch) #+ "%20" + str(filetype)
issdig = '0'
elif cmloopit == 1:
comsearch = comsrc + "%20" + str(isssearch) + "%20" + str(filetype)
comsearch = comsrc + "%20" + str(isssearch) #+ "%20" + str(filetype)
issdig = ''
mod_isssearch = str(issdig) + str(isssearch)
@ -548,7 +551,8 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
else:
for entry in bb['entries']:
logger.fdebug("checking search result: " + entry['title'])
if nzbprov != "experimental" and nzbprov != "dognzb":
if nzbprov != "dognzb":
#rss for experimental doesn't have the size constraints embedded. So we do it here.
if RSS == "yes":
comsize_b = entry['length']
else:
@ -557,9 +561,28 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
comsize_b = entry['length']
elif nzbprov == 'KAT':
comsize_b = entry['size']
elif nzbprov == 'experimental':
comsize_b = entry['length'] # we only want the size from the rss - the search/api has it already.
else:
tmpsz = entry.enclosures[0]
comsize_b = tmpsz['length']
#file restriction limitation here
#only works with KAT (done here) & CBT (done in rsscheck) & Experimental (has it embeded in search and rss checks)
if nzbprov == 'KAT':
if mylar.PREFERRED_QUALITY == 1:
if 'cbr' in entry['title']:
logger.fdebug('Quality restriction enforced [ .cbr only ]. Accepting result.')
else:
logger.fdebug('Quality restriction enforced [ .cbr only ]. Rejecting this result.')
continue
elif mylar.PREFERRED_QUALITY == 2:
if 'cbz' in entry['title']:
logger.fdebug('Quality restriction enforced [ .cbz only ]. Accepting result.')
else:
logger.fdebug('Quality restriction enforced [ .cbz only ]. Rejecting this result.')
continue
if comsize_b is None:
logger.fdebug('Size of file cannot be retrieved. Ignoring size-comparison and continuing.')
#comsize_b = 0
@ -818,13 +841,20 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
#let's do this here and save a few extra loops ;)
#fix for issue dates between Nov-Dec/Jan
if IssDateFix != "no" and UseFuzzy is not "2":
if IssDateFix == "01" or IssDateFix == "02" or IssDateFix == "03": ComicYearFix = int(ComicYear) - 1
else: ComicYearFix = int(ComicYear) + 1
if str(ComicYearFix) in result_comyear:
logger.fdebug("further analysis reveals this was published inbetween Nov-Jan, incrementing year to " + str(ComicYearFix) + " has resulted in a match!")
yearmatch = "true"
if IssDateFix == "01" or IssDateFix == "02" or IssDateFix == "03":
ComicYearFix = int(ComicYear) - 1
if str(ComicYearFix) in result_comyear:
logger.fdebug("further analysis reveals this was published inbetween Nov-Jan, decreasing year to " + str(ComicYearFix) + " has resulted in a match!")
yearmatch = "true"
else:
logger.fdebug(str(comyear) + " - not the right year.")
else:
logger.fdebug(str(comyear) + " - not the right year.")
ComicYearFix = int(ComicYear) + 1
if str(ComicYearFix) in result_comyear:
logger.fdebug("further analysis reveals this was published inbetween Nov-Jan, incrementing year to " + str(ComicYearFix) + " has resulted in a match!")
yearmatch = "true"
else:
logger.fdebug(str(comyear) + " - not the right year.")
elif UseFuzzy == "1": yearmatch = "true"
if Publisher is not None:
@ -870,11 +900,15 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
#if the series doesn't have a '-' within it.
hyphensplit = None
hyphenfail = False
for m in re.finditer('-', comic_andiss):
logger.fdebug('I have found a hyphen within the nzbname @ position: ' + str(m.start()))
if '-' in ComicName:
logger.fdebug('There is a hyphen present in the series title. Ignoring position: ' + str(m.start()))
pass
issue_firstword = None
if unaltered_ComicName is not None:
ComicName = unaltered_ComicName
for m in re.finditer('[-/:]', comic_andiss):
#sometimes the : within a series title is replaced with a -, since filenames can't contain :
logger.fdebug('[' + ComicName + '] I have found a ' + str(m.group()) + ' within the nzbname @ position: ' + str(m.start()))
if str(m.group()) in ComicName: # and m.start() <= len(ComicName) + 2:
logger.fdebug('There is a ' + str(m.group()) + ' present in the series title. Ignoring position: ' + str(m.start()))
continue
else:
logger.fdebug('There is no hyphen present in the series title.')
logger.fdebug('Assuming position start is : ' + str(m.start()))
@ -894,7 +928,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
continue
#changed this from '' to ' '
comic_iss_b4 = re.sub('[\-\:\,\?]', ' ', str(comic_andiss))
comic_iss_b4 = re.sub('[\-\:\,\?\!]', ' ', str(comic_andiss))
comic_iss = comic_iss_b4.replace('.',' ')
#if issue_except: comic_iss = re.sub(issue_except.lower(), '', comic_iss)
logger.fdebug("adjusted nzb comic and issue: " + str(comic_iss))
@ -1075,98 +1109,129 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
initialchk = 'ok'
isstitle_chk = False
if (splitst) != len(watchcomic_split):
logger.fdebug("incorrect comic lengths...not a match")
issuetitle = re.sub('[\-\:\,\?\.]', ' ', str(issuetitle))
issuetitle_words = issuetitle.split(None)
#issue title comparison here:
logger.fdebug('there are ' + str(len(issuetitle_words)) + ' words in the issue title of : ' + str(issuetitle))
# we minus 1 the splitst since the issue # is included in there.
if (splitst - 1) > len(watchcomic_split):
possibleissue_num = splitit[splitst]
logger.fdebug('possible issue number of : ' + str(possibleissue_num))
extra_words = splitst - len(watchcomic_split)
logger.fdebug('there are ' + str(extra_words) + ' left over after we remove the series title.')
wordcount = 1
#remove the series title here so we just have the 'hopefully' issue title
for word in splitit:
#logger.info('word: ' + str(word))
if wordcount > len(watchcomic_split):
#logger.info('wordcount: ' + str(wordcount))
#logger.info('watchcomic_split: ' + str(len(watchcomic_split)))
if wordcount - len(watchcomic_split) == 1:
search_issue_title = word
possibleissue_num = word
else:
search_issue_title += ' ' + word
wordcount +=1
if issue_firstword:
vals = IssueTitleCheck(issuetitle, watchcomic_split, splitit, splitst, issue_firstword, hyphensplit, orignzb=entry['title'])
# logger.fdebug("incorrect comic lengths...not a match")
#
# issuetitle = re.sub('[\-\:\,\?\.]', ' ', str(issuetitle))
# issuetitle_words = issuetitle.split(None)
# #issue title comparison here:
# logger.fdebug('there are ' + str(len(issuetitle_words)) + ' words in the issue title of : ' + str(issuetitle))
# # we minus 1 the splitst since the issue # is included in there.
# if (splitst - 1) > len(watchcomic_split):
# possibleissue_num = splitit[splitst]
# logger.fdebug('possible issue number of : ' + str(possibleissue_num))
# extra_words = splitst - len(watchcomic_split)
# logger.fdebug('there are ' + str(extra_words) + ' left over after we remove the series title.')
# wordcount = 1
# #remove the series title here so we just have the 'hopefully' issue title
# for word in splitit:
# #logger.info('word: ' + str(word))
# if wordcount > len(watchcomic_split):
# #logger.info('wordcount: ' + str(wordcount))
# #logger.info('watchcomic_split: ' + str(len(watchcomic_split)))
# if wordcount - len(watchcomic_split) == 1:
# search_issue_title = word
# possibleissue_num = word
# else:
# search_issue_title += ' ' + word
# wordcount +=1
#
# decit = search_issue_title.split(None)
# if decit[0].isdigit() and decit[1].isdigit():
# logger.fdebug('possible decimal - referencing position from original title.')
# chkme = entry['title'].find(decit[0])
# chkend = entry['title'].find(decit[1], chkme + len(decit[0]))
# chkspot = entry['title'][chkme:chkend+1]
# print chkme, chkend
# print chkspot
# # we add +1 to decit totals in order to account for the '.' that's missing and we assume is there.
# if len(chkspot) == ( len(decit[0]) + len(decit[1]) + 1 ):
# logger.fdebug('lengths match for possible decimal issue.')
# if '.' in chkspot:
# logger.fdebug('decimal located within : ' + str(chkspot))
# possibleissue_num = chkspot
# splitst = splitst -1 #remove the second numeric as it's a decimal and would add an extra char to the matching process
# logger.fdebug('search_issue_title is : ' + str(search_issue_title))
# logger.fdebug('possible issue number of : ' + str(possibleissue_num))
#
# if hyphensplit is not None:
# logger.fdebug('hypen split detected.')
# try:
# issue_start = search_issue_title.find(issue_firstword)
# logger.fdebug('located first word of : ' + str(issue_firstword) + ' at position : ' + str(issue_start))
# search_issue_title = search_issue_title[issue_start:]
# logger.fdebug('corrected search_issue_title is now : ' + str(search_issue_title))
# except TypeError:
# logger.fdebug('invalid parsing detection. Ignoring this result.')
# continue
# #now we have the nzb issue title (if it exists), let's break it down further.
# sit_split = search_issue_title.split(None)
# watch_split_count = len(issuetitle_words)
# isstitle_removal = []
# isstitle_match = 0 #counter to tally % match
# misword = 0 # counter to tally words that probably don't need to be an 'exact' match for
# for wsplit in issuetitle_words:
# of_chk = False
# if wsplit.lower() == 'part' or wsplit.lower() == 'of':
# if wsplit.lower() == 'of':
# of_chk = True
# logger.fdebug('not worrying about this word : ' + str(wsplit))
# misword +=1
# continue
# if wsplit.isdigit() and of_chk == True:
# logger.fdebug('of ' + str(wsplit) + ' detected. Ignoring for matching.')
# of_chk = False
# continue
#
# for sit in sit_split:
# logger.fdebug('looking at : ' + str(sit.lower()) + ' -TO- ' + str(wsplit.lower()))
# if sit.lower() == 'part':
# logger.fdebug('not worrying about this word : ' + str(sit))
# misword +=1
# isstitle_removal.append(sit)
# break
# elif sit.lower() == wsplit.lower():
# logger.fdebug('word match: ' + str(sit))
# isstitle_match +=1
# isstitle_removal.append(sit)
# break
# else:
# try:
# if int(sit) == int(wsplit):
# logger.fdebug('found matching numeric: ' + str(wsplit))
# isstitle_match +=1
# isstitle_removal.append(sit)
# break
# except:
# pass
#
# logger.fdebug('isstitle_match count : ' + str(isstitle_match))
# if isstitle_match > 0:
# iss_calc = ( ( isstitle_match + misword ) / watch_split_count ) * 100
# logger.fdebug('iss_calc: ' + str(iss_calc) + ' % with ' + str(misword) + ' unaccounted for words')
# else:
# iss_calc = 0
# logger.fdebug('0 words matched on issue title.')
# if iss_calc >= 80:
# logger.fdebug('>80% match on issue name. If this were implemented, this would be considered a match.')
# logger.fdebug('we should remove ' + str(len(isstitle_removal)) + ' words : ' + str(isstitle_removal))
# logger.fdebug('Removing issue title from nzb filename to improve matching algorithims.')
# splitst = splitst - len(isstitle_removal)
# isstitle_chk = True
#
# else:
# pass
print str(vals)
logger.fdebug('search_issue_title is : ' + str(search_issue_title))
logger.fdebug('possible issue number of : ' + str(possibleissue_num))
if hyphensplit is not None:
logger.fdebug('hypen split detected.')
issue_start = search_issue_title.find(issue_firstword)
logger.fdebug('located first word of : ' + str(issue_firstword) + ' at position : ' + str(issue_start))
search_issue_title = search_issue_title[issue_start:]
logger.fdebug('corrected search_issue_title is now : ' + str(search_issue_title))
#now we have the nzb issue title (if it exists), let's break it down further.
sit_split = search_issue_title.split(None)
watch_split_count = len(issuetitle_words)
isstitle_removal = []
isstitle_match = 0 #counter to tally % match
misword = 0 # counter to tally words that probably don't need to be an 'exact' match for
for wsplit in issuetitle_words:
of_chk = False
if wsplit.lower() == 'part' or wsplit.lower() == 'of':
if wsplit.lower() == 'of':
of_chk = True
logger.fdebug('not worrying about this word : ' + str(wsplit))
misword +=1
if vals is not None:
if vals[0]['status'] == 'continue':
continue
if wsplit.isdigit() and of_chk == True:
logger.fdebug('of ' + str(wsplit) + ' detected. Ignoring for matching.')
of_chk = False
continue
for sit in sit_split:
logger.fdebug('looking at : ' + str(sit.lower()) + ' -TO- ' + str(wsplit.lower()))
if sit.lower() == 'part':
logger.fdebug('not worrying about this word : ' + str(sit))
misword +=1
isstitle_removal.append(sit)
break
elif sit.lower() == wsplit.lower():
logger.fdebug('word match: ' + str(sit))
isstitle_match +=1
isstitle_removal.append(sit)
break
else:
try:
if int(sit) == int(wsplit):
logger.fdebug('found matching numeric: ' + str(wsplit))
isstitle_match +=1
isstitle_removal.append(sit)
break
except:
pass
logger.fdebug('isstitle_match count : ' + str(isstitle_match))
if isstitle_match > 0:
iss_calc = ( ( isstitle_match + misword ) / watch_split_count ) * 100
logger.fdebug('iss_calc: ' + str(iss_calc) + ' % with ' + str(misword) + ' unaccounted for words')
else:
logger.fdebug('Issue title status returned of : ' + str(vals[0]['status'])) # will either be OK or pass.
else:
iss_calc = 0
logger.fdebug('0 words matched on issue title.')
if iss_calc >= 80:
logger.fdebug('>80% match on issue name. If this were implemented, this would be considered a match.')
logger.fdebug('we should remove ' + str(len(isstitle_removal)) + ' words : ' + str(isstitle_removal))
logger.fdebug('Removing issue title from nzb filename to improve matching algorithims.')
splitst = splitst - len(isstitle_removal)
isstitle_chk = True
else:
pass
logger.fdebug('No issue title.')
for tstsplit in splitit:
if tstsplit.lower() == 'the':
@ -1857,3 +1922,127 @@ def FailedMark(IssueID, ComicID, id, nzbname, prov):
Markit = FailProcess.markFailed()
return "torrent-fail"
def IssueTitleCheck(issuetitle, watchcomic_split, splitit, splitst, issue_firstword, hyphensplit, orignzb=None):
    """Compare an issue title against the words that follow the series name in a result.

    Parameters:
        issuetitle       -- issue title to look for; it is passed through str(), has
                            the characters - : , ? . replaced by spaces and is split
                            into words.
        watchcomic_split -- sequence whose length is used as the number of leading
                            words in ``splitit`` to skip (only len() is taken).
        splitit          -- list of words from the search result title.
        splitst          -- integer word position/count for ``splitit``; it is
                            compared against len(watchcomic_split) and used to index
                            ``splitit``.
        issue_firstword  -- string searched for inside the candidate issue title when
                            ``hyphensplit`` is not None.
        hyphensplit      -- only tested against None here.
        orignzb          -- original (unsplit) result title, used to detect a decimal
                            issue number; the decimal check is skipped when None.

    Returns:
        None when there are not enough surplus words to hold an issue title.
        Otherwise a one-element list holding a dict with the keys 'splitit',
        'splitst', 'isstitle_chk' and 'status'. 'status' is 'continue' when the
        result could not be parsed and 'ok' in every other case.
    """
    vals = []
    isstitle_chk = False

    logger.fdebug("incorrect comic lengths...not a match")

    issuetitle = re.sub(r'[\-\:\,\?\.]', ' ', str(issuetitle))
    issuetitle_words = issuetitle.split(None)
    #issue title comparison here:
    logger.fdebug('there are ' + str(len(issuetitle_words)) + ' words in the issue title of : ' + str(issuetitle))

    # we minus 1 the splitst since the issue # is included in there.
    if (splitst - 1) <= len(watchcomic_split):
        # not enough surplus words to hold an issue title - nothing to check.
        return None

    possibleissue_num = splitit[splitst]
    logger.fdebug('possible issue number of : ' + str(possibleissue_num))
    extra_words = splitst - len(watchcomic_split)
    logger.fdebug('there are ' + str(extra_words) + ' left over after we remove the series title.')

    #remove the series title here so we just have the 'hopefully' issue title
    surplus_words = splitit[len(watchcomic_split):]
    search_issue_title = ' '.join(surplus_words)
    if surplus_words:
        possibleissue_num = surplus_words[0]

    decit = search_issue_title.split(None)
    # two consecutive numeric words may be a decimal issue number whose '.' was
    # replaced by a space - confirm it against the original title when we have one.
    if orignzb is not None and len(decit) >= 2 and decit[0].isdigit() and decit[1].isdigit():
        logger.fdebug('possible decimal - referencing position from original title.')
        chkme = orignzb.find(decit[0])
        chkend = orignzb.find(decit[1], chkme + len(decit[0]))
        if chkme != -1 and chkend != -1:
            # take the slice through the END of the second number so that
            # multi-digit fractions are covered as well.
            chkspot = orignzb[chkme:chkend + len(decit[1])]
            logger.fdebug('decimal check positions : ' + str(chkme) + ', ' + str(chkend))
            logger.fdebug('decimal check segment : ' + str(chkspot))
            # we add +1 to decit totals in order to account for the '.' that's missing and we assume is there.
            if len(chkspot) == (len(decit[0]) + len(decit[1]) + 1):
                logger.fdebug('lengths match for possible decimal issue.')
                if '.' in chkspot:
                    logger.fdebug('decimal located within : ' + str(chkspot))
                    possibleissue_num = chkspot
                    #remove the second numeric as it's a decimal and would add an extra word to the matching process
                    splitst = splitst - 1
    logger.fdebug('search_issue_title is : ' + str(search_issue_title))
    logger.fdebug('possible issue number of : ' + str(possibleissue_num))

    if hyphensplit is not None:
        logger.fdebug('hypen split detected.')
        try:
            issue_start = search_issue_title.find(issue_firstword)
        except TypeError:
            # issue_firstword is not a string (e.g. None) - nothing to anchor on.
            logger.fdebug('invalid parsing detection. Ignoring this result.')
            # list.append() returns None, so append first and return the list itself.
            vals.append({"splitit": splitit,
                         "splitst": splitst,
                         "isstitle_chk": isstitle_chk,
                         "status": "continue"})
            return vals
        logger.fdebug('located first word of : ' + str(issue_firstword) + ' at position : ' + str(issue_start))
        if issue_start >= 0:
            # only trim when the word was found; slicing with -1 would keep just the last character.
            search_issue_title = search_issue_title[issue_start:]
        logger.fdebug('corrected search_issue_title is now : ' + str(search_issue_title))

    #now we have the nzb issue title (if it exists), let's break it down further.
    sit_split = search_issue_title.split(None)
    watch_split_count = len(issuetitle_words)
    isstitle_removal = []
    isstitle_match = 0  #counter to tally % match
    misword = 0  # counter to tally words that probably don't need to be an 'exact' match.
    for wsplit in issuetitle_words:
        # NOTE(review): of_chk is reset on every iteration, so the 'of <number>' branch
        # below can never fire. Left unchanged because enabling it would lower the match
        # score for titles such as 'Book 1 of 3' - confirm the intended scoring first.
        of_chk = False
        if wsplit.lower() == 'part' or wsplit.lower() == 'of':
            if wsplit.lower() == 'of':
                of_chk = True
            logger.fdebug('not worrying about this word : ' + str(wsplit))
            misword += 1
            continue
        if wsplit.isdigit() and of_chk == True:
            logger.fdebug('of ' + str(wsplit) + ' detected. Ignoring for matching.')
            of_chk = False
            continue

        for sit in sit_split:
            logger.fdebug('looking at : ' + str(sit.lower()) + ' -TO- ' + str(wsplit.lower()))
            if sit.lower() == 'part':
                logger.fdebug('not worrying about this word : ' + str(sit))
                misword += 1
                isstitle_removal.append(sit)
                break
            elif sit.lower() == wsplit.lower():
                logger.fdebug('word match: ' + str(sit))
                isstitle_match += 1
                isstitle_removal.append(sit)
                break
            else:
                try:
                    if int(sit) == int(wsplit):
                        logger.fdebug('found matching numeric: ' + str(wsplit))
                        isstitle_match += 1
                        isstitle_removal.append(sit)
                        break
                except ValueError:
                    # one of the two words is not numeric - not a numeric match.
                    pass

    logger.fdebug('isstitle_match count : ' + str(isstitle_match))
    if isstitle_match > 0:
        # float arithmetic: integer division would floor every partial match to 0%.
        iss_calc = 100.0 * (isstitle_match + misword) / watch_split_count
        logger.fdebug('iss_calc: ' + str(iss_calc) + ' % with ' + str(misword) + ' unaccounted for words')
    else:
        iss_calc = 0
        logger.fdebug('0 words matched on issue title.')
    if iss_calc >= 80:
        logger.fdebug('>80% match on issue name. If this were implemented, this would be considered a match.')
        logger.fdebug('we should remove ' + str(len(isstitle_removal)) + ' words : ' + str(isstitle_removal))
        logger.fdebug('Removing issue title from nzb filename to improve matching algorithims.')
        splitst = splitst - len(isstitle_removal)
        isstitle_chk = True

    vals.append({"splitit": splitit,
                 "splitst": splitst,
                 "isstitle_chk": isstitle_chk,
                 "status": "ok"})
    return vals