FIX: DDL provider attempting to use incorrect links when downloading, FIX: fixed some DDL problems due to various parsing / type problems, IMP: DDL provider will now follow the RSS feed option if it is enabled

This commit is contained in:
evilhero 2019-03-08 16:56:05 -05:00
parent ef278eac21
commit c06e96c129
4 changed files with 220 additions and 53 deletions


@@ -140,7 +140,7 @@ class GC(object):
nwsize = size.find('//')
size = re.sub('\[', '', size[:nwsize]).strip()
else:
size = '0 M'
size = '0M'
i+=1
dateline = f.find('time')
datefull = dateline['datetime']
@@ -163,15 +163,19 @@ class GC(object):
def parse_downloadresults(self, id, mainlink):
myDB = db.DBConnection()
series = None
year = None
size = None
title = os.path.join(mylar.CONFIG.CACHE_DIR, 'getcomics-' + id)
soup = BeautifulSoup(open(title+'.html'), 'html.parser')
orig_find = soup.find("p", {"style": "text-align: center;"})
i = 0
option_find = orig_find
possible_more = None
while True: #i <= 10:
prev_option = option_find
option_find = option_find.findNext(text=True)
if i == 0:
if i == 0 and series is None:
series = option_find
elif 'Year' in option_find:
year = option_find.findNext(text=True)
@@ -189,24 +193,52 @@ class GC(object):
for f in soup.findAll("div", {"class": "aio-pulse"}):
lk = f.find('a')
if lk['title'] == 'Download Now':
link = lk['href']
site = lk['title']
link = {"series": series,
"site": lk['title'],
"year": year,
"issues": None,
"size": size,
"link": lk['href']}
break #get the first link just to test
links = []
if link is None and possible_more.name == 'ul':
bb = possible_more.findAll('li')
for x in bb:
volume = x.findNext(text=True)
if u'\u2013' in volume:
volume = re.sub(u'\u2013', '-', volume)
linkline = x.find('a')
link = linkline['href']
site = linkline.findNext(text=True)
links.append({"volume": volume,
"site": site,
"link": link})
try:
bb = possible_more.findAll('li')
except:
pass
else:
for x in bb:
linkline = x.find('a')
if linkline:
if 'go.php' in linkline['href']:
volume = x.findNext(text=True)
if u'\u2013' in volume:
volume = re.sub(u'\u2013', '-', volume)
#volume label contains series, issue(s), year(s), and size
series_st = volume.find('(')
issues_st = volume.find('#')
series = volume[:series_st]
if any([issues_st == -1, series_st == -1]):
issues = None
else:
series = volume[:issues_st].strip()
issues = volume[issues_st+1:series_st].strip()
year_end = volume.find(')', series_st+1)
year = re.sub('[\(\)]', '', volume[series_st+1: year_end]).strip()
size_end = volume.find(')', year_end+1)
size = re.sub('[\(\)]', '', volume[year_end+1: size_end]).strip()
linked = linkline['href']
site = linkline.findNext(text=True)
if site == 'Main Server':
links.append({"series": series,
"site": site,
"year": year,
"issues": issues,
"size": size,
"link": linked})
else:
check_extras = soup.findAll("h3")
for sb in check_extras:
@@ -220,40 +252,52 @@ class GC(object):
if u'\u2013' in volume:
volume = re.sub(u'\u2013', '-', volume)
linkline = x.find('a')
link = linkline['href']
linked = linkline['href']
site = linkline.findNext(text=True)
links.append({"volume": volume,
"site": site,
"link": link})
"link": linked})
if link is None:
if all([link is None, len(links) == 0]):
logger.warn('Unable to retrieve any valid immediate download links. They might not exist.')
return {'success': False}
if all([link is not None, len(links) == 0]):
logger.info('only one item discovered, changing queue length to accommodate: %s [%s]' % (link, type(link)))
links = [link]
elif len(links) > 0:
if len(links) > 1:
logger.info('[DDL-QUEUER] This pack has been broken up into %s separate packs - queueing each in sequence for your enjoyment.' % len(links))
cnt = 1
for x in links:
logger.fdebug('[%s] %s - %s' % (x['site'], x['volume'], x['link']))
if len(links) == 1:
mod_id = id
else:
mod_id = id+'-'+str(cnt)
#logger.fdebug('[%s] %s (%s) %s [%s][%s]' % (x['site'], x['series'], x['year'], x['issues'], x['size'], x['link']))
ctrlval = {'id': id}
vals = {'series': series,
'year': year,
'size': size,
'issueid': self.issueid,
'comicid': self.comicid,
'link': link,
'mainlink': mainlink,
'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
'status': 'Queued'}
myDB.upsert('ddl_info', vals, ctrlval)
ctrlval = {'id': mod_id}
vals = {'series': x['series'],
'year': x['year'],
'size': x['size'],
'issues': x['issues'],
'issueid': self.issueid,
'comicid': self.comicid,
'link': x['link'],
'mainlink': mainlink,
'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
'status': 'Queued'}
myDB.upsert('ddl_info', vals, ctrlval)
mylar.DDL_QUEUE.put({'link': link,
'mainlink': mainlink,
'series': series,
'year': year,
'size': size,
'comicid': self.comicid,
'issueid': self.issueid,
'id': id,
'resume': None})
mylar.DDL_QUEUE.put({'link': x['link'],
'mainlink': mainlink,
'series': x['series'],
'year': x['year'],
'size': x['size'],
'comicid': self.comicid,
'issueid': self.issueid,
'id': mod_id,
'resume': None})
cnt+=1
return {'success': True}
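For reference, a minimal standalone sketch of the volume-label slicing that parse_downloadresults() now performs on the 'Main Server' list entries; the helper name and sample label below are illustrative only, not part of the commit.

def split_volume_label(volume):
    # labels look roughly like 'Series Name #1 - 20 (1972) (230 MB)' (sample format)
    series_st = volume.find('(')
    issues_st = volume.find('#')
    if series_st == -1 or issues_st == -1:
        return volume.strip(), None, None, None
    series = volume[:issues_st].strip()
    issues = volume[issues_st + 1:series_st].strip()
    year_end = volume.find(')', series_st + 1)
    year = volume[series_st + 1:year_end].strip()
    size_end = volume.find(')', year_end + 1)
    size = volume[year_end + 1:size_end].strip(' (')
    return series, issues, year, size

print(split_volume_label('Weird Western Tales #1 - 20 (1972) (230 MB)'))
# -> ('Weird Western Tales', '1 - 20', '1972', '230 MB')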
@@ -275,20 +319,23 @@ class GC(object):
t = s.get(link, verify=True, cookies=cf_cookievalue, headers=self.headers, stream=True)
filename = os.path.basename(urllib.unquote(t.url).decode('utf-8'))
if 'GetComics.INFO' in filename:
filename = re.sub('GetComics.INFO', '', filename, flags=re.I).strip()
try:
remote_filesize = int(t.headers['Content-length'])
logger.fdebug('remote filesize: %s' % remote_filesize)
except Exception as e:
logger.warn('[WARNING] Unable to retrieve remote file size. Error returned as : %s' % e)
logger.warn('[WARNING] Unable to retrieve remote file size - this is usually due to the page being behind a different click-bait/ad page. Error returned as : %s' % e)
logger.warn('[WARNING] Considering this particular download as invalid and will ignore this result.')
remote_filesize = 0
mylar.DDL_LOCK = False
return ({"success": False,
"filename": filename,
"path": None})
else:
#write the filename to the db for tracking purposes...
myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
#write the filename to the db for tracking purposes...
myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
path = os.path.join(mylar.CONFIG.DDL_LOCATION, filename)
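A rough standalone illustration of the remote-size guard above, assuming only the requests library; the URL is a placeholder and the Mylar session/cookie handling is omitted.

import requests

t = requests.get('https://example.com/some-pack.zip', stream=True)   # placeholder URL
try:
    remote_filesize = int(t.headers['Content-length'])   # requests headers are case-insensitive
except Exception as e:
    # usually means the link resolved to a click-bait/ad page instead of the file
    remote_filesize = 0
if remote_filesize == 0:
    print('invalid download - ignoring this result')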


@@ -24,6 +24,7 @@ from datetime import datetime, timedelta
import gzip
import time
import random
from bs4 import BeautifulSoup
from StringIO import StringIO
import mylar
@@ -384,6 +385,78 @@ def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None):
return torinfo
return
def ddl(forcerss=False):
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
ddl_feed = 'https://getcomics.info/feed/'
try:
r = requests.get(ddl_feed, verify=True, headers=headers)
except Exception, e:
logger.warn('Error fetching RSS Feed Data from DDL: %s' % (e))
return False
else:
if r.status_code != 200:
#typically 403 will not return results, but just catch anything other than a 200
if r.status_code == 403:
logger.warn('ERROR - status code:%s' % r.status_code)
return False
else:
logger.warn('Status code returned: %s' % r.status_code)
return False
feedme = feedparser.parse(r.content)
results = []
for entry in feedme.entries:
soup = BeautifulSoup(entry.summary, 'html.parser')
orig_find = soup.find("p", {"style": "text-align: center;"})
i = 0
option_find = orig_find
while True: #i <= 10:
prev_option = option_find
option_find = option_find.findNext(text=True)
if 'Year' in option_find:
year = option_find.findNext(text=True)
year = re.sub('\|', '', year).strip()
else:
if 'Size' in prev_option:
size = option_find #.findNext(text=True)
if '- MB' in size: size = '0 MB'
possible_more = orig_find.next_sibling
break
i+=1
link = entry.link
title = entry.title
updated = entry.updated
if updated.endswith('+0000'):
updated = updated[:-5].strip()
tmpid = entry.id
id = tmpid[tmpid.find('=')+1:]
if 'KB' in size:
szform = 'KB'
sz = 'K'
elif 'GB' in size:
szform = 'GB'
sz = 'G'
elif 'MB' in size:
szform = 'MB'
sz = 'M'
elif 'TB' in size:
szform = 'TB'
sz = 'T'
tsize = helpers.human2bytes(re.sub('[^0-9]', '', size).strip() + sz)
#link can be referenced with the ?p=id url
results.append({'Title': title,
'Size': tsize,
'Link': id,
'Site': 'DDL',
'Pubdate': updated})
if len(results) >0:
logger.info('[RSS][DDL] %s entries have been indexed and are now going to be stored for caching.' % len(results))
rssdbupdate(results, len(results), 'ddl')
return
def nzbs(provider=None, forcerss=False):
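To make the new feed handling easier to follow, a rough sketch of how one RSS entry's size string and post id are normalized before caching; the sample values are invented and helpers.human2bytes() is approximated with a local table.

import re

size = '350 MB'                                # scraped from the entry summary (sample value)
tmpid = 'https://getcomics.info/?p=123456'     # entry.id from the feed (sample value)
post_id = tmpid[tmpid.find('=') + 1:]          # '123456' - stored as the Link column
units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3, 'T': 1024 ** 4}
sz = {'KB': 'K', 'MB': 'M', 'GB': 'G', 'TB': 'T'}[size.split()[-1]]
tsize = int(re.sub('[^0-9]', '', size)) * units[sz]   # 367001600; stands in for helpers.human2bytes('350M')
result = {'Title': 'Example Comic #1 (2019)',          # entry.title (sample value)
          'Size': tsize,
          'Link': post_id,
          'Site': 'DDL',
          'Pubdate': 'Fri, 08 Mar 2019 16:00:00'}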
@@ -569,6 +642,43 @@ def rssdbupdate(feeddata, i, type):
logger.fdebug('Completed adding new data to RSS DB. Next add in ' + str(mylar.CONFIG.RSS_CHECKINTERVAL) + ' minutes')
return
def ddl_dbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
myDB = db.DBConnection()
seriesname_alt = None
if any([comicid is None, comicid == 'None', oneoff is True]):
pass
else:
snm = myDB.selectone("SELECT * FROM comics WHERE comicid=?", [comicid]).fetchone()
if snm is None:
logger.fdebug('Invalid ComicID of %s. Aborting search' % comicid)
return "no results"
else:
seriesname = snm['ComicName']
seriesname_alt = snm['AlternateSearch']
dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower())
dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower())
dsearch_removed = re.sub('\s+', ' ', dsearch_rem2)
dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed)
dsearch = '%' + dsearch_seriesname + '%'
dresults = myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'", [dsearch])
ddltheinfo = []
ddlinfo = {}
if not dresults:
return "no results"
else:
for dl in dresults:
ddltheinfo.append({
'title': dl['Title'],
'link': dl['Link'],
'pubdate': dl['Pubdate'],
'site': dl['Site'],
'length': dl['Size']
})
ddlinfo['entries'] = ddltheinfo
return ddlinfo
def torrentdbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
myDB = db.DBConnection()
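As a sketch of the matching done by ddl_dbsearch(): the series name is turned into a SQL LIKE pattern against the cached rssdb table. The wrapper function below is illustrative; the substitutions themselves mirror the diff.

import re

def build_ddl_like_pattern(seriesname):
    # drop 'and'/'the', collapse whitespace, then turn punctuation and spaces into wildcards
    s = re.sub(r'\band\b', '%', seriesname.lower())
    s = re.sub(r'\bthe\b', '%', s)
    s = re.sub(r'\s+', ' ', s)
    s = re.sub(r'[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', s)
    return '%' + s + '%'

print(build_ddl_like_pattern('Batman and the Outsiders'))   # '%batman%%%%%outsiders%'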


@@ -91,6 +91,9 @@ class tehMain():
logger.info('[RSS-FEEDS] Initiating RSS Feed Check for NZB Providers.')
rsscheck.nzbs(forcerss=forcerss)
if mylar.CONFIG.ENABLE_DDL is True:
logger.info('[RSS-FEEDS] Initiating RSS Feed Check for DDL Provider.')
rsscheck.ddl(forcerss=forcerss)
logger.info('[RSS-FEEDS] RSS Feed Check/Update Complete')
logger.info('[RSS-FEEDS] Watchlist Check for new Releases')
mylar.search.searchforissue(rsscheck='yes')


@@ -254,7 +254,6 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
c_number = c_number[:decst].rstrip()
while (srchloop <= searchcnt):
logger.fdebug('srchloop: %s' % srchloop)
#searchmodes:
# rss - will run through the built-cached db of entries
# api - will run through the providers via api (or non-api in the case of Experimental)
@@ -334,9 +333,9 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD
prov_count+=1
continue
if searchmode == 'rss':
if searchprov.lower() == 'ddl':
prov_count+=1
continue
#if searchprov.lower() == 'ddl':
# prov_count+=1
# continue
findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, send_prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName, oneoff=oneoff, cmloopit=cmloopit, manual=manual, torznab_host=torznab_host, digitaldate=digitaldate, booktype=booktype)
if findit['status'] is False:
if AlternateSearch is not None and AlternateSearch != "None":
@@ -581,7 +580,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
foundc['status'] = False
done = True
break
if any([nzbprov == '32P', nzbprov == 'Public Torrents']):
if any([nzbprov == '32P', nzbprov == 'Public Torrents', nzbprov == 'ddl']):
#because 32p directly stores the exact issue, no need to worry about iterating over variations of the issue number.
findloop = 99
@@ -619,14 +618,17 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
#logger.fdebug('RSS Check: %s' % RSS)
#logger.fdebug('nzbprov: %s' % nzbprov)
#logger.fdebug('comicid: %s' % ComicID)
if nzbprov == 'ddl':
if nzbprov == 'ddl' and RSS == "no":
cmname = re.sub("%20", " ", str(comsrc))
logger.fdebug('Sending request to DDL site for : %s %s' % (findcomic, isssearch))
b = getcomics.GC(query='%s %s' % (findcomic, isssearch))
bb = b.search()
#logger.info('bb returned from DDL: %s' % bb)
elif RSS == "yes":
if nzbprov == '32P' or nzbprov == 'Public Torrents':
if nzbprov == 'ddl':
logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch))
bb = rsscheck.ddl_dbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff)
elif nzbprov == '32P' or nzbprov == 'Public Torrents':
cmname = re.sub("%20", " ", str(comsrc))
logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch))
bb = rsscheck.torrentdbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff)
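For context on the dispatch above: when RSS == "yes" and nzbprov == 'ddl', rsscheck.ddl_dbsearch() hands back either the string 'no results' or a feedparser-style dict shaped roughly like the following (values invented):

bb = {'entries': [{'title': 'Example Comic #1 (2019)',
                   'link': '123456',                       # getcomics post id from the rssdb cache
                   'pubdate': 'Fri, 08 Mar 2019 16:00:00',
                   'site': 'DDL',
                   'length': 367001600}]}                   # cached size in bytes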
@@ -1389,7 +1391,13 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa
nowrite = False
if all([nzbprov == 'torznab', 'worldwidetorrents' in entry['link']]):
nzbid = generate_id(nzbprov, entry['id'])
elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]):
elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]) or all([nzbprov == 'ddl', RSS == 'yes']):
if RSS == "yes":
entry['id'] = entry['link']
entry['link'] = 'https://getcomics.info/?p='+str(entry['id'])
entry['filename'] = entry['title']
if '/cat/' in entry['link']:
entry['link'] = 'https://getcomics.info/?p='+str(entry['id'])
nzbid = entry['id']
entry['title'] = entry['filename']
else:
@@ -2318,7 +2326,6 @@ def searcher(nzbprov, nzbname, comicinfo, link, IssueID, ComicID, tmpprov, direc
ggc = getcomics.GC(issueid=IssueID, comicid=ComicID)
sendsite = ggc.loadsite(nzbid, link)
ddl_it = ggc.parse_downloadresults(nzbid, link)
logger.info("ddl status response: %s" % ddl_it)
if ddl_it['success'] is True:
logger.info('Successfully snatched %s from DDL site. It is currently being queued to download in position %s' % (nzbname, mylar.DDL_QUEUE.qsize()))
else:
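Finally, to show how the RSS path ties back into the snatcher: a short sketch (sample values only) of the link rebuild the new RSS == "yes" branch in NZB_SEARCH performs before searcher() hands the result to GC.loadsite() and parse_downloadresults().

entry = {'title': 'Example Comic #1 (2019)', 'link': '123456'}   # as returned from the rssdb cache
entry['id'] = entry['link']                                      # the cached value is the post id
entry['link'] = 'https://getcomics.info/?p=' + str(entry['id'])  # rebuilt direct link
entry['filename'] = entry['title']                               # RSS entries carry no filename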