From c06e96c129fb5848da20c9f6c0a3461190bc5e75 Mon Sep 17 00:00:00 2001
From: evilhero
Date: Fri, 8 Mar 2019 16:56:05 -0500
Subject: [PATCH] FIX: fix for DDL provider option attempting to use incorrect links when downloading, FIX: Fixed some DDL problems due to various parsing / type problems, IMP: DDL Provider will now follow RSS feed option if option is enabled
---
mylar/getcomics.py | 135 +++++++++++++++++++++++++++++---------------
mylar/rsscheck.py | 110 ++++++++++++++++++++++++++++++++++++
mylar/rsscheckit.py | 3 +
mylar/search.py | 25 +++++---
4 files changed, 220 insertions(+), 53 deletions(-)

diff --git a/mylar/getcomics.py b/mylar/getcomics.py
index 06da10b0..7c6fe2f4 100644
--- a/mylar/getcomics.py
+++ b/mylar/getcomics.py
@@ -140,7 +140,7 @@ class GC(object):
nwsize = size.find('//')
size = re.sub('\[', '', size[:nwsize]).strip()
else:
- size = '0 M'
+ size = '0M'
i+=1
dateline = f.find('time')
datefull = dateline['datetime']
@@ -163,15 +163,19 @@ class GC(object):
def parse_downloadresults(self, id, mainlink):
myDB = db.DBConnection()
+ series = None
+ year = None
+ size = None
title = os.path.join(mylar.CONFIG.CACHE_DIR, 'getcomics-' + id)
soup = BeautifulSoup(open(title+'.html'), 'html.parser')
orig_find = soup.find("p", {"style": "text-align: center;"})
i = 0
option_find = orig_find
+ possible_more = None
while True: #i <= 10:
prev_option = option_find
option_find = option_find.findNext(text=True)
- if i == 0:
+ if i == 0 and series is None:
series = option_find
elif 'Year' in option_find:
year = option_find.findNext(text=True)
@@ -189,24 +193,52 @@ class GC(object):
for f in soup.findAll("div", {"class": "aio-pulse"}):
lk = f.find('a')
if lk['title'] == 'Download Now':
- link = lk['href']
- site = lk['title']
+ link = {"series": series,
+ "site": lk['title'],
+ "year": year,
+ "issues": None,
+ "size": size,
+ "link": lk['href']}
+ break #get the first link just to test
links = []
if link is None and possible_more.name == 'ul':
- bb = possible_more.findAll('li')
- for x in bb:
- volume = x.findNext(text=True)
- if u'\u2013' in volume:
- volume = re.sub(u'\u2013', '-', volume)
- linkline = x.find('a')
- link = linkline['href']
- site = linkline.findNext(text=True)
- links.append({"volume": volume,
- "site": site,
- "link": link})
+ try:
+ bb = possible_more.findAll('li')
+ except:
+ pass
+ else:
+ for x in bb:
+ linkline = x.find('a')
+ if linkline:
+ if 'go.php' in linkline['href']:
+ volume = x.findNext(text=True)
+ if u'\u2013' in volume:
+ volume = re.sub(u'\u2013', '-', volume)
+ #volume label contains series, issue(s), year(s), and size
+ series_st = volume.find('(')
+ issues_st = volume.find('#')
+ series = volume[:series_st]
+ if any([issues_st == -1, series_st == -1]):
+ issues = None
+ else:
+ series = volume[:issues_st].strip()
+ issues = volume[issues_st+1:series_st].strip()
+ year_end = volume.find(')', series_st+1)
+ year = re.sub('[\(\)]', '', volume[series_st+1: year_end]).strip()
+ size_end = volume.find(')', year_end+1)
+ size = re.sub('[\(\)]', '', volume[year_end+1: size_end]).strip()
+ linked = linkline['href']
+ site = linkline.findNext(text=True)
+ if site == 'Main Server':
+ links.append({"series": series,
+ "site": site,
+ "year": year,
+ "issues": issues,
+ "size": size,
+ "link": linked})
else:
check_extras = soup.findAll("h3")
for sb in check_extras:
@@ -220,40 +252,52 @@ class GC(object):
if u'\u2013' in volume:
volume = re.sub(u'\u2013', '-', volume)
linkline = x.find('a')
- link = linkline['href']
+ linked = linkline['href']
site = linkline.findNext(text=True)
links.append({"volume": volume,
"site": site,
- "link": link})
+ "link": linked})
- if link is None:
+ if all([link is None, len(links) == 0]):
logger.warn('Unable to retrieve any valid immediate download links. They might not exist.')
return {'success': False}
-
+ if all([link is not None, len(links) == 0]):
+ logger.info('Only one item discovered, changing queue length to accommodate: %s [%s]' % (link, type(link)))
+ links = [link]
+ elif len(links) > 0:
+ if len(links) > 1:
+ logger.info('[DDL-QUEUER] This pack has been broken up into %s separate packs - queueing each in sequence for your enjoyment.' % len(links))
+ cnt = 1
for x in links:
- logger.fdebug('[%s] %s - %s' % (x['site'], x['volume'], x['link']))
+ if len(links) == 1:
+ mod_id = id
+ else:
+ mod_id = id+'-'+str(cnt)
+ #logger.fdebug('[%s] %s (%s) %s [%s][%s]' % (x['site'], x['series'], x['year'], x['issues'], x['size'], x['link']))
- ctrlval = {'id': id}
- vals = {'series': series,
- 'year': year,
- 'size': size,
- 'issueid': self.issueid,
- 'comicid': self.comicid,
- 'link': link,
- 'mainlink': mainlink,
- 'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
- 'status': 'Queued'}
- myDB.upsert('ddl_info', vals, ctrlval)
+ ctrlval = {'id': mod_id}
+ vals = {'series': x['series'],
+ 'year': x['year'],
+ 'size': x['size'],
+ 'issues': x['issues'],
+ 'issueid': self.issueid,
+ 'comicid': self.comicid,
+ 'link': x['link'],
+ 'mainlink': mainlink,
+ 'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
+ 'status': 'Queued'}
+ myDB.upsert('ddl_info', vals, ctrlval)
- mylar.DDL_QUEUE.put({'link': link,
- 'mainlink': mainlink,
- 'series': series,
- 'year': year,
- 'size': size,
- 'comicid': self.comicid,
- 'issueid': self.issueid,
- 'id': id,
- 'resume': None})
+ mylar.DDL_QUEUE.put({'link': x['link'],
+ 'mainlink': mainlink,
+ 'series': x['series'],
+ 'year': x['year'],
+ 'size': x['size'],
+ 'comicid': self.comicid,
+ 'issueid': self.issueid,
+ 'id': mod_id,
+ 'resume': None})
+ cnt+=1
return {'success': True}
@@ -275,20 +319,23 @@ class GC(object):
t = s.get(link, verify=True, cookies=cf_cookievalue, headers=self.headers, stream=True)
filename = os.path.basename(urllib.unquote(t.url).decode('utf-8'))
+ if 'GetComics.INFO' in filename:
+ filename = re.sub('GetComics.INFO', '', filename, flags=re.I).strip()
try:
remote_filesize = int(t.headers['Content-length'])
logger.fdebug('remote filesize: %s' % remote_filesize)
except Exception as e:
- logger.warn('[WARNING] Unable to retrieve remote file size. Error returned as : %s' % e)
+ logger.warn('[WARNING] Unable to retrieve remote file size - this is usually due to the page being behind a different click-bait/ad page. Error returned as: %s' % e)
+ logger.warn('[WARNING] Considering this particular download as invalid and will ignore this result.')
remote_filesize = 0
mylar.DDL_LOCK = False
return ({"success": False,
"filename": filename,
"path": None})
- else:
- #write the filename to the db for tracking purposes...
- myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
+
+ #write the filename to the db for tracking purposes...
+ myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
path = os.path.join(mylar.CONFIG.DDL_LOCATION, filename)
diff --git a/mylar/rsscheck.py b/mylar/rsscheck.py
index d753ddb4..cb7859b0 100755
--- a/mylar/rsscheck.py
+++ b/mylar/rsscheck.py
@@ -24,6 +24,7 @@ from datetime import datetime, timedelta
import gzip
import time
import random
+from bs4 import BeautifulSoup
from StringIO import StringIO

import mylar
@@ -384,6 +385,78 @@ def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None):
return torinfo
return

+def ddl(forcerss=False):
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
+ ddl_feed = 'https://getcomics.info/feed/'
+ try:
+ r = requests.get(ddl_feed, verify=True, headers=headers)
+ except Exception, e:
+ logger.warn('Error fetching RSS Feed Data from DDL: %s' % (e))
+ return False
+ else:
+ if r.status_code != 200:
+ #typically 403 will not return results, but just catch anything other than a 200
+ if r.status_code == 403:
+ logger.warn('ERROR - status code: %s' % r.status_code)
+ return False
+ else:
+ logger.warn('Status code returned: %s' % r.status_code)
+ return False
+
+ feedme = feedparser.parse(r.content)
+ results = []
+ for entry in feedme.entries:
+ soup = BeautifulSoup(entry.summary, 'html.parser')
+ orig_find = soup.find("p", {"style": "text-align: center;"})
+ i = 0
+ option_find = orig_find
+ while True: #i <= 10:
+ prev_option = option_find
+ option_find = option_find.findNext(text=True)
+ if 'Year' in option_find:
+ year = option_find.findNext(text=True)
+ year = re.sub('\|', '', year).strip()
+ else:
+ if 'Size' in prev_option:
+ size = option_find #.findNext(text=True)
+ if '- MB' in size: size = '0 MB'
+ possible_more = orig_find.next_sibling
+ break
+ i+=1
+
+ link = entry.link
+ title = entry.title
+ updated = entry.updated
+ if updated.endswith('+0000'):
+ updated = updated[:-5].strip()
+ tmpid = entry.id
+ id = tmpid[tmpid.find('=')+1:]
+ if 'KB' in size:
+ szform = 'KB'
+ sz = 'K'
+ elif 'GB' in size:
+ szform = 'GB'
+ sz = 'G'
+ elif 'MB' in size:
+ szform = 'MB'
+ sz = 'M'
+ elif 'TB' in size:
+ szform = 'TB'
+ sz = 'T'
+ tsize = helpers.human2bytes(re.sub('[^0-9]', '', size).strip() + sz)
+
+ #link can be referenced with the ?p=id url
+ results.append({'Title': title,
+ 'Size': tsize,
+ 'Link': id,
+ 'Site': 'DDL',
+ 'Pubdate': updated})
+
+ if len(results) > 0:
+ logger.info('[RSS][DDL] %s entries have been indexed and are now going to be stored for caching.' % len(results))
+ rssdbupdate(results, len(results), 'ddl')
+
+ return

def nzbs(provider=None, forcerss=False):
@@ -569,6 +642,43 @@ def rssdbupdate(feeddata, i, type):
logger.fdebug('Completed adding new data to RSS DB. Next add in ' + str(mylar.CONFIG.RSS_CHECKINTERVAL) + ' minutes')
return

+def ddl_dbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
+ myDB = db.DBConnection()
+ seriesname_alt = None
+ if any([comicid is None, comicid == 'None', oneoff is True]):
+ pass
+ else:
+ snm = myDB.selectone("SELECT * FROM comics WHERE comicid=?", [comicid]).fetchone()
+ if snm is None:
+ logger.fdebug('Invalid ComicID of %s. 
Aborting search' % comicid) + return "no results" + else: + seriesname = snm['ComicName'] + seriesname_alt = snm['AlternateSearch'] + + dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower()) + dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower()) + dsearch_removed = re.sub('\s+', ' ', dsearch_rem2) + dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed) + dsearch = '%' + dsearch_seriesname + '%' + dresults = myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'", [dsearch]) + ddltheinfo = [] + ddlinfo = {} + if not dresults: + return "no results" + else: + for dl in dresults: + ddltheinfo.append({ + 'title': dl['Title'], + 'link': dl['Link'], + 'pubdate': dl['Pubdate'], + 'site': dl['Site'], + 'length': dl['Size'] + }) + + ddlinfo['entries'] = ddltheinfo + + return ddlinfo def torrentdbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False): myDB = db.DBConnection() diff --git a/mylar/rsscheckit.py b/mylar/rsscheckit.py index 930e3115..e7bab64e 100755 --- a/mylar/rsscheckit.py +++ b/mylar/rsscheckit.py @@ -91,6 +91,9 @@ class tehMain(): logger.info('[RSS-FEEDS] Initiating RSS Feed Check for NZB Providers.') rsscheck.nzbs(forcerss=forcerss) + if mylar.CONFIG.ENABLE_DDL is True: + logger.info('[RSS-FEEDS] Initiating RSS Feed Check for DDL Provider.') + rsscheck.ddl(forcerss=forcerss) logger.info('[RSS-FEEDS] RSS Feed Check/Update Complete') logger.info('[RSS-FEEDS] Watchlist Check for new Releases') mylar.search.searchforissue(rsscheck='yes') diff --git a/mylar/search.py b/mylar/search.py index 7020dd2d..e3167222 100755 --- a/mylar/search.py +++ b/mylar/search.py @@ -254,7 +254,6 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD c_number = c_number[:decst].rstrip() while (srchloop <= searchcnt): - logger.fdebug('srchloop: %s' % srchloop) #searchmodes: # rss - will run through the built-cached db of entries # api - will run through the providers via api (or non-api in the case of Experimental) @@ -334,9 +333,9 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD prov_count+=1 continue if searchmode == 'rss': - if searchprov.lower() == 'ddl': - prov_count+=1 - continue + #if searchprov.lower() == 'ddl': + # prov_count+=1 + # continue findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, send_prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName, oneoff=oneoff, cmloopit=cmloopit, manual=manual, torznab_host=torznab_host, digitaldate=digitaldate, booktype=booktype) if findit['status'] is False: if AlternateSearch is not None and AlternateSearch != "None": @@ -581,7 +580,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa foundc['status'] = False done = True break - if any([nzbprov == '32P', nzbprov == 'Public Torrents']): + if any([nzbprov == '32P', nzbprov == 'Public Torrents', nzbprov == 'ddl']): #because 32p directly stores the exact issue, no need to worry about iterating over variations of the issue number. 
findloop == 99 @@ -619,14 +618,17 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa #logger.fdebug('RSS Check: %s' % RSS) #logger.fdebug('nzbprov: %s' % nzbprov) #logger.fdebug('comicid: %s' % ComicID) - if nzbprov == 'ddl': + if nzbprov == 'ddl' and RSS == "no": cmname = re.sub("%20", " ", str(comsrc)) logger.fdebug('Sending request to DDL site for : %s %s' % (findcomic, isssearch)) b = getcomics.GC(query='%s %s' % (findcomic, isssearch)) bb = b.search() #logger.info('bb returned from DDL: %s' % bb) elif RSS == "yes": - if nzbprov == '32P' or nzbprov == 'Public Torrents': + if nzbprov == 'ddl': + logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch)) + bb = rsscheck.ddl_dbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff) + elif nzbprov == '32P' or nzbprov == 'Public Torrents': cmname = re.sub("%20", " ", str(comsrc)) logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch)) bb = rsscheck.torrentdbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff) @@ -1389,7 +1391,13 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa nowrite = False if all([nzbprov == 'torznab', 'worldwidetorrents' in entry['link']]): nzbid = generate_id(nzbprov, entry['id']) - elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]): + elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]) or all([nzbprov == 'ddl', RSS == 'yes']): + if RSS == "yes": + entry['id'] = entry['link'] + entry['link'] = 'https://getcomics.info/?p='+str(entry['id']) + entry['filename'] = entry['title'] + if '/cat/' in entry['link']: + entry['link'] = 'https://getcomics.info/?p='+str(entry['id']) nzbid = entry['id'] entry['title'] = entry['filename'] else: @@ -2318,7 +2326,6 @@ def searcher(nzbprov, nzbname, comicinfo, link, IssueID, ComicID, tmpprov, direc ggc = getcomics.GC(issueid=IssueID, comicid=ComicID) sendsite = ggc.loadsite(nzbid, link) ddl_it = ggc.parse_downloadresults(nzbid, link) - logger.info("ddl status response: %s" % ddl_it) if ddl_it['success'] is True: logger.info('Successfully snatched %s from DDL site. It is currently being queued to download in position %s' % (nzbname, mylar.DDL_QUEUE.qsize())) else:
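
Note on the link-label parsing in parse_downloadresults() above: the series, issue range, year, and size are derived purely from the positions of '#' and the parentheses in the label text of each 'li' entry. A minimal standalone sketch of that slicing, using an invented sample label (the string below is illustrative only, not taken from the site):

import re

# Hypothetical label in the shape the parser expects:
# "<series> #<issues> (<years>) (<size>)"
volume = u'Example Series Vol. 2 #1 - 10 (2016-2017) (1.2 GB)'

series_st = volume.find('(')            # start of the (year) segment
issues_st = volume.find('#')            # start of the issue range, if present
series = volume[:series_st]
if any([issues_st == -1, series_st == -1]):
    issues = None
else:
    series = volume[:issues_st].strip()
    issues = volume[issues_st+1:series_st].strip()
year_end = volume.find(')', series_st+1)
year = re.sub('[\(\)]', '', volume[series_st+1:year_end]).strip()
size_end = volume.find(')', year_end+1)
size = re.sub('[\(\)]', '', volume[year_end+1:size_end]).strip()

print('%s | %s | %s | %s' % (series, issues, year, size))
# Example Series Vol. 2 | 1 - 10 | 2016-2017 | 1.2 GB

If a label carries no '#' issue range, issues stays None and series keeps everything up to the first parenthesis, which mirrors the guard in the patched code.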
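
The rsscheck.ddl() feed parser above normalizes the scraped size string before handing it to helpers.human2bytes(). A small sketch of that step with an illustrative feed value; the helper itself is part of mylar and is not re-implemented here:

import re

size = '150 MB'   # illustrative value scraped from the feed summary

# map the unit in the scraped text to the single-letter suffix human2bytes expects
if 'KB' in size:
    sz = 'K'
elif 'GB' in size:
    sz = 'G'
elif 'MB' in size:
    sz = 'M'
elif 'TB' in size:
    sz = 'T'

normalized = re.sub('[^0-9]', '', size).strip() + sz
print(normalized)   # '150M' -- this is the string passed to helpers.human2bytes()

Note that the digit-only strip also removes any decimal point, so a feed size of '1.5 GB' would normalize to '15G'.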
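
ddl_dbsearch() above reduces the series name to a SQL LIKE pattern by blanking out 'and'/'the' and punctuation with '%' wildcards before querying the cached rssdb entries. A standalone sketch of that transformation with an illustrative series name:

import re

seriesname = 'The Amazing Spider-Man and Friends'   # illustrative series name

dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower())
dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower())
dsearch_removed = re.sub('\s+', ' ', dsearch_rem2)
dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed)
dsearch = '%' + dsearch_seriesname + '%'

print(dsearch)
# %%%amazing%spider%man%%%friends%  -- used in: SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'

Consecutive '%' wildcards are redundant but harmless in a LIKE comparison.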