FIX: DDL provider option attempting to use incorrect links when downloading, FIX: Fixed some DDL problems due to various parsing / type problems, IMP: DDL provider will now follow the RSS feed option if it is enabled

evilhero 2019-03-08 16:56:05 -05:00
parent ef278eac21
commit c06e96c129
4 changed files with 220 additions and 53 deletions

View File

@@ -140,7 +140,7 @@ class GC(object):
             nwsize = size.find('//')
             size = re.sub('\[', '', size[:nwsize]).strip()
         else:
-            size = '0 M'
+            size = '0M'
         i+=1
         dateline = f.find('time')
         datefull = dateline['datetime']
@@ -163,15 +163,19 @@ class GC(object):
     def parse_downloadresults(self, id, mainlink):
         myDB = db.DBConnection()
+        series = None
+        year = None
+        size = None
         title = os.path.join(mylar.CONFIG.CACHE_DIR, 'getcomics-' + id)
         soup = BeautifulSoup(open(title+'.html'), 'html.parser')
         orig_find = soup.find("p", {"style": "text-align: center;"})
         i = 0
         option_find = orig_find
+        possible_more = None
         while True: #i <= 10:
             prev_option = option_find
             option_find = option_find.findNext(text=True)
-            if i == 0:
+            if i == 0 and series is None:
                 series = option_find
             elif 'Year' in option_find:
                 year = option_find.findNext(text=True)
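
The series/year/size defaults added above get filled in by walking the text nodes of the centered paragraph with findNext(text=True). A minimal, self-contained sketch of that walk, against hypothetical markup rather than a live GetComics page:

    from bs4 import BeautifulSoup

    # Hypothetical markup standing in for the centered info block of a post page;
    # real pages may be laid out differently.
    html = ('<p style="text-align: center;"><strong>Invincible Vol. 1</strong><br/>'
            'Year : <strong>2018</strong> | Size : <strong>150 MB</strong></p>')
    soup = BeautifulSoup(html, 'html.parser')
    orig_find = soup.find("p", {"style": "text-align: center;"})

    series = year = size = None
    option_find = orig_find
    while True:
        prev_option = option_find
        option_find = option_find.findNext(text=True)
        if option_find is None:
            break
        if series is None:
            series = option_find                    # first text node is the title
        elif 'Year' in option_find:
            year = option_find.findNext(text=True)  # the value follows the label
        elif 'Size' in prev_option:
            size = option_find
            break

    print('%s | %s | %s' % (series, year, size))    # Invincible Vol. 1 | 2018 | 150 MB
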
@@ -189,24 +193,52 @@ class GC(object):
         for f in soup.findAll("div", {"class": "aio-pulse"}):
             lk = f.find('a')
             if lk['title'] == 'Download Now':
-                link = lk['href']
-                site = lk['title']
+                link = {"series": series,
+                        "site": lk['title'],
+                        "year": year,
+                        "issues": None,
+                        "size": size,
+                        "link": lk['href']}
                 break  #get the first link just to test

         links = []

         if link is None and possible_more.name == 'ul':
-            bb = possible_more.findAll('li')
-            for x in bb:
-                volume = x.findNext(text=True)
-                if u'\u2013' in volume:
-                    volume = re.sub(u'\u2013', '-', volume)
-                linkline = x.find('a')
-                link = linkline['href']
-                site = linkline.findNext(text=True)
-                links.append({"volume": volume,
-                              "site": site,
-                              "link": link})
+            try:
+                bb = possible_more.findAll('li')
+            except:
+                pass
+            else:
+                for x in bb:
+                    linkline = x.find('a')
+                    if linkline:
+                        if 'go.php' in linkline['href']:
+                            volume = x.findNext(text=True)
+                            if u'\u2013' in volume:
+                                volume = re.sub(u'\u2013', '-', volume)
+                            #volume label contains series, issue(s), year(s), and size
+                            series_st = volume.find('(')
+                            issues_st = volume.find('#')
+                            series = volume[:series_st]
+                            if any([issues_st == -1, series_st == -1]):
+                                issues = None
+                            else:
+                                series = volume[:issues_st].strip()
+                                issues = volume[issues_st+1:series_st].strip()
+                            year_end = volume.find(')', series_st+1)
+                            year = re.sub('[\(\)]', '', volume[series_st+1: year_end]).strip()
+                            size_end = volume.find(')', year_end+1)
+                            size = re.sub('[\(\)]', '', volume[year_end+1: size_end]).strip()
+                            linked = linkline['href']
+                            site = linkline.findNext(text=True)
+                            if site == 'Main Server':
+                                links.append({"series": series,
+                                              "site": site,
+                                              "year": year,
+                                              "issues": issues,
+                                              "size": size,
+                                              "link": linked})
         else:
             check_extras = soup.findAll("h3")
             for sb in check_extras:
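
The new 'Main Server' handling above assumes each list-item label packs series, issue range, year(s) and size into one string. A worked example of that slicing, on a made-up label:

    import re

    volume = u'The Walking Dead #1 - 193 (2003-2019) (5.6 GB)'   # hypothetical label

    series_st = volume.find('(')      # start of the year block
    issues_st = volume.find('#')      # start of the issue range
    series = volume[:series_st]
    if any([issues_st == -1, series_st == -1]):
        issues = None
    else:
        series = volume[:issues_st].strip()
        issues = volume[issues_st+1:series_st].strip()
    year_end = volume.find(')', series_st+1)
    year = re.sub(r'[\(\)]', '', volume[series_st+1: year_end]).strip()
    size_end = volume.find(')', year_end+1)
    size = re.sub(r'[\(\)]', '', volume[year_end+1: size_end]).strip()

    print('%s | %s | %s | %s' % (series, issues, year, size))
    # The Walking Dead | 1 - 193 | 2003-2019 | 5.6 GB
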
@@ -220,40 +252,52 @@ class GC(object):
                     if u'\u2013' in volume:
                         volume = re.sub(u'\u2013', '-', volume)
                     linkline = x.find('a')
-                    link = linkline['href']
+                    linked = linkline['href']
                     site = linkline.findNext(text=True)
                     links.append({"volume": volume,
                                   "site": site,
-                                  "link": link})
+                                  "link": linked})

-        if link is None:
+        if all([link is None, len(links) == 0]):
             logger.warn('Unable to retrieve any valid immediate download links. They might not exist.')
             return {'success': False}

+        if all([link is not None, len(links) == 0]):
+            logger.info('only one item discovered, changing queue length to accomodate: %s [%s]' % (link, type(link)))
+            links = [link]
+        elif len(links) > 0:
+            if len(links) > 1:
+                logger.info('[DDL-QUEUER] This pack has been broken up into %s separate packs - queueing each in sequence for your enjoyment.' % len(links))
+
+        cnt = 1
         for x in links:
-            logger.fdebug('[%s] %s - %s' % (x['site'], x['volume'], x['link']))
+            if len(links) == 1:
+                mod_id = id
+            else:
+                mod_id = id+'-'+str(cnt)
+            #logger.fdebug('[%s] %s (%s) %s [%s][%s]' % (x['site'], x['series'], x['year'], x['issues'], x['size'], x['link']))

-            ctrlval = {'id': id}
-            vals = {'series': series,
-                    'year': year,
-                    'size': size,
-                    'issueid': self.issueid,
-                    'comicid': self.comicid,
-                    'link': link,
-                    'mainlink': mainlink,
-                    'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
-                    'status': 'Queued'}
-            myDB.upsert('ddl_info', vals, ctrlval)
+            ctrlval = {'id': mod_id}
+            vals = {'series': x['series'],
+                    'year': x['year'],
+                    'size': x['size'],
+                    'issues': x['issues'],
+                    'issueid': self.issueid,
+                    'comicid': self.comicid,
+                    'link': x['link'],
+                    'mainlink': mainlink,
+                    'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
+                    'status': 'Queued'}
+            myDB.upsert('ddl_info', vals, ctrlval)

-            mylar.DDL_QUEUE.put({'link': link,
+            mylar.DDL_QUEUE.put({'link': x['link'],
                                  'mainlink': mainlink,
-                                 'series': series,
-                                 'year': year,
-                                 'size': size,
+                                 'series': x['series'],
+                                 'year': x['year'],
+                                 'size': x['size'],
                                  'comicid': self.comicid,
                                  'issueid': self.issueid,
-                                 'id': id,
+                                 'id': mod_id,
                                  'resume': None})
+            cnt+=1

         return {'success': True}
@@ -275,20 +319,23 @@ class GC(object):
         t = s.get(link, verify=True, cookies=cf_cookievalue, headers=self.headers, stream=True)

         filename = os.path.basename(urllib.unquote(t.url).decode('utf-8'))
+        if 'GetComics.INFO' in filename:
+            filename = re.sub('GetComics.INFO', '', filename, re.I).strip()

         try:
             remote_filesize = int(t.headers['Content-length'])
             logger.fdebug('remote filesize: %s' % remote_filesize)
         except Exception as e:
-            logger.warn('[WARNING] Unable to retrieve remote file size. Error returned as : %s' % e)
+            logger.warn('[WARNING] Unable to retrieve remote file size - this is usually due to the page being behind a different click-bait/ad page. Error returned as : %s' % e)
+            logger.warn('[WARNING] Considering this particular download as invalid and will ignore this result.')
             remote_filesize = 0
             mylar.DDL_LOCK = False
             return ({"success": False,
                      "filename": filename,
                      "path": None})

+        else:
             #write the filename to the db for tracking purposes...
             myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})

             path = os.path.join(mylar.CONFIG.DDL_LOCATION, filename)

View File

@@ -24,6 +24,7 @@ from datetime import datetime, timedelta
 import gzip
 import time
 import random
+from bs4 import BeautifulSoup
 from StringIO import StringIO

 import mylar
@@ -384,6 +385,78 @@ def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None):
         return torinfo
     return

+def ddl(forcerss=False):
+    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
+    ddl_feed = 'https://getcomics.info/feed/'
+
+    try:
+        r = requests.get(ddl_feed, verify=True, headers=headers)
+    except Exception, e:
+        logger.warn('Error fetching RSS Feed Data from DDL: %s' % (e))
+        return False
+    else:
+        if r.status_code != 200:
+            #typically 403 will not return results, but just catch anything other than a 200
+            if r.status_code == 403:
+                logger.warn('ERROR - status code:%s' % r.status_code)
+                return False
+            else:
+                logger.warn('[%s] Status code returned: %s' % (r.status_code))
+                return False
+
+    feedme = feedparser.parse(r.content)
+
+    results = []
+
+    for entry in feedme.entries:
+        soup = BeautifulSoup(entry.summary, 'html.parser')
+        orig_find = soup.find("p", {"style": "text-align: center;"})
+        i = 0
+        option_find = orig_find
+        while True: #i <= 10:
+            prev_option = option_find
+            option_find = option_find.findNext(text=True)
+            if 'Year' in option_find:
+                year = option_find.findNext(text=True)
+                year = re.sub('\|', '', year).strip()
+            else:
+                if 'Size' in prev_option:
+                    size = option_find #.findNext(text=True)
+                    if '- MB' in size: size = '0 MB'
+                    possible_more = orig_find.next_sibling
+                    break
+            i+=1
+
+        link = entry.link
+        title = entry.title
+        updated = entry.updated
+        if updated.endswith('+0000'):
+            updated = updated[:-5].strip()
+        tmpid = entry.id
+        id = tmpid[tmpid.find('=')+1:]
+        if 'KB' in size:
+            szform = 'KB'
+            sz = 'K'
+        elif 'GB' in size:
+            szform = 'GB'
+            sz = 'G'
+        elif 'MB' in size:
+            szform = 'MB'
+            sz = 'M'
+        elif 'TB' in size:
+            szform = 'TB'
+            sz = 'T'
+        tsize = helpers.human2bytes(re.sub('[^0-9]', '', size).strip() + sz)
+
+        #link can be referenced with the ?p=id url
+        results.append({'Title':   title,
+                        'Size':    tsize,
+                        'Link':    id,
+                        'Site':    'DDL',
+                        'Pubdate': updated})
+
+    if len(results) >0:
+        logger.info('[RSS][DDL] %s entries have been indexed and are now going to be stored for caching.' % len(results))
+        rssdbupdate(results, len(results), 'ddl')
+    return
+
 def nzbs(provider=None, forcerss=False):
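
The size scraped from each feed entry is reduced to digits plus a single unit letter before being handed to helpers.human2bytes for the cached Size value. A rough sketch of that normalization, with a stand-in converter since the real helper is not part of this diff:

    import re

    def approx_bytes(s):
        # Stand-in for mylar's helpers.human2bytes; assumed here to turn '150M'
        # into 150 * 1024**2. Illustrative only, not the actual implementation.
        units = {'K': 1, 'M': 2, 'G': 3, 'T': 4}
        return int(float(s[:-1]) * (1024 ** units[s[-1]]))

    size = '150 MB'        # value pulled out of the feed summary
    if 'KB' in size:
        sz = 'K'
    elif 'GB' in size:
        sz = 'G'
    elif 'MB' in size:
        sz = 'M'
    elif 'TB' in size:
        sz = 'T'
    tsize = approx_bytes(re.sub('[^0-9]', '', size).strip() + sz)
    print(tsize)           # 157286400
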
@@ -569,6 +642,43 @@ def rssdbupdate(feeddata, i, type):
     logger.fdebug('Completed adding new data to RSS DB. Next add in ' + str(mylar.CONFIG.RSS_CHECKINTERVAL) + ' minutes')
     return

+def ddl_dbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
+    myDB = db.DBConnection()
+    seriesname_alt = None
+    if any([comicid is None, comicid == 'None', oneoff is True]):
+        pass
+    else:
+        snm = myDB.selectone("SELECT * FROM comics WHERE comicid=?", [comicid]).fetchone()
+        if snm is None:
+            logger.fdebug('Invalid ComicID of %s. Aborting search' % comicid)
+            return "no results"
+        else:
+            seriesname = snm['ComicName']
+            seriesname_alt = snm['AlternateSearch']
+
+    dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower())
+    dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower())
+    dsearch_removed = re.sub('\s+', ' ', dsearch_rem2)
+    dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed)
+    dsearch = '%' + dsearch_seriesname + '%'
+
+    dresults = myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'", [dsearch])
+    ddltheinfo = []
+    ddlinfo = {}
+    if not dresults:
+        return "no results"
+    else:
+        for dl in dresults:
+            ddltheinfo.append({
+                'title': dl['Title'],
+                'link': dl['Link'],
+                'pubdate': dl['Pubdate'],
+                'site': dl['Site'],
+                'length': dl['Size']
+            })
+
+        ddlinfo['entries'] = ddltheinfo
+
+    return ddlinfo
+
 def torrentdbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
     myDB = db.DBConnection()
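
ddl_dbsearch() collapses the series name into a SQL LIKE pattern so punctuation and the words 'and'/'the' in cached titles cannot block a match. A worked example with a hypothetical series name:

    import re

    seriesname = 'Batman and the Outsiders'    # hypothetical watchlist series

    dsearch_rem1 = re.sub(r"\band\b", "%", seriesname.lower())
    dsearch_rem2 = re.sub(r"\bthe\b", "%", dsearch_rem1.lower())
    dsearch_removed = re.sub(r'\s+', ' ', dsearch_rem2)
    dsearch_seriesname = re.sub(r'[\'\!\@\#\$\%\:\-\;\/\=\?\&\.\s\,]', '%', dsearch_removed)
    dsearch = '%' + dsearch_seriesname + '%'

    print(dsearch)
    # %batman%%%%%outsiders%  (repeated %s collapse under LIKE, so any cached
    # Title containing 'batman' followed by 'outsiders' will match)
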

View File

@@ -91,6 +91,9 @@ class tehMain():
                 logger.info('[RSS-FEEDS] Initiating RSS Feed Check for NZB Providers.')
                 rsscheck.nzbs(forcerss=forcerss)
+            if mylar.CONFIG.ENABLE_DDL is True:
+                logger.info('[RSS-FEEDS] Initiating RSS Feed Check for DDL Provider.')
+                rsscheck.ddl(forcerss=forcerss)
             logger.info('[RSS-FEEDS] RSS Feed Check/Update Complete')
             logger.info('[RSS-FEEDS] Watchlist Check for new Releases')
             mylar.search.searchforissue(rsscheck='yes')

View File

@@ -254,7 +254,6 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
         c_number = c_number[:decst].rstrip()

     while (srchloop <= searchcnt):
-        logger.fdebug('srchloop: %s' % srchloop)
         #searchmodes:
         # rss - will run through the built-cached db of entries
         # api - will run through the providers via api (or non-api in the case of Experimental)
@@ -334,9 +333,9 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
                     prov_count+=1
                     continue
                 if searchmode == 'rss':
-                    if searchprov.lower() == 'ddl':
-                        prov_count+=1
-                        continue
+                    #if searchprov.lower() == 'ddl':
+                    #    prov_count+=1
+                    #    continue
                     findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, send_prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName, oneoff=oneoff, cmloopit=cmloopit, manual=manual, torznab_host=torznab_host, digitaldate=digitaldate, booktype=booktype)
                     if findit['status'] is False:
                         if AlternateSearch is not None and AlternateSearch != "None":
@@ -581,7 +580,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
                 foundc['status'] = False
                 done = True
                 break

-            if any([nzbprov == '32P', nzbprov == 'Public Torrents']):
+            if any([nzbprov == '32P', nzbprov == 'Public Torrents', nzbprov == 'ddl']):
                 #because 32p directly stores the exact issue, no need to worry about iterating over variations of the issue number.
                 findloop == 99
@@ -619,14 +618,17 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
     #logger.fdebug('RSS Check: %s' % RSS)
     #logger.fdebug('nzbprov: %s' % nzbprov)
     #logger.fdebug('comicid: %s' % ComicID)
-    if nzbprov == 'ddl':
+    if nzbprov == 'ddl' and RSS == "no":
         cmname = re.sub("%20", " ", str(comsrc))
         logger.fdebug('Sending request to DDL site for : %s %s' % (findcomic, isssearch))
         b = getcomics.GC(query='%s %s' % (findcomic, isssearch))
         bb = b.search()
         #logger.info('bb returned from DDL: %s' % bb)
     elif RSS == "yes":
-        if nzbprov == '32P' or nzbprov == 'Public Torrents':
+        if nzbprov == 'ddl':
+            logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch))
+            bb = rsscheck.ddl_dbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff)
+        elif nzbprov == '32P' or nzbprov == 'Public Torrents':
             cmname = re.sub("%20", " ", str(comsrc))
             logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch))
             bb = rsscheck.torrentdbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff)
@@ -1389,7 +1391,13 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
                         nowrite = False
                         if all([nzbprov == 'torznab', 'worldwidetorrents' in entry['link']]):
                             nzbid = generate_id(nzbprov, entry['id'])
-                        elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]):
+                        elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]) or all([nzbprov == 'ddl', RSS == 'yes']):
+                            if RSS == "yes":
+                                entry['id'] = entry['link']
+                                entry['link'] = 'https://getcomics.info/?p='+str(entry['id'])
+                                entry['filename'] = entry['title']
+                            if '/cat/' in entry['link']:
+                                entry['link'] = 'https://getcomics.info/?p='+str(entry['id'])
                             nzbid = entry['id']
                             entry['title'] = entry['filename']
                         else:
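
Because the RSS cache keeps only the getcomics post id in its Link column, the block above rebuilds the real page URL from that id before the snatch. A small illustration with a hypothetical cached entry:

    # Hypothetical entry as returned from the rssdb cache (Link column holds the post id).
    entry = {'title': 'The Walking Dead #193 (2019)', 'link': '12345'}

    entry['id'] = entry['link']
    entry['link'] = 'https://getcomics.info/?p=' + str(entry['id'])
    entry['filename'] = entry['title']

    print(entry['link'])   # https://getcomics.info/?p=12345
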
@@ -2318,7 +2326,6 @@ def searcher(nzbprov, nzbname, comicinfo, link, IssueID, ComicID, tmpprov, direc
         ggc = getcomics.GC(issueid=IssueID, comicid=ComicID)
         sendsite = ggc.loadsite(nzbid, link)
         ddl_it = ggc.parse_downloadresults(nzbid, link)
-        logger.info("ddl status response: %s" % ddl_it)
         if ddl_it['success'] is True:
             logger.info('Successfully snatched %s from DDL site. It is currently being queued to download in position %s' % (nzbname, mylar.DDL_QUEUE.qsize()))
         else: