From c06e96c129fb5848da20c9f6c0a3461190bc5e75 Mon Sep 17 00:00:00 2001
From: evilhero
Date: Fri, 8 Mar 2019 16:56:05 -0500
Subject: [PATCH] FIX: fix for DDL provider option attempting to use incorrect links when downloading, FIX: Fixed some DDL problems due to various parsing / type problems, IMP: DDL Provider will now follow RSS feed option if option is enabled
---
mylar/getcomics.py | 135 +++++++++++++++++++++++++++++---------------
mylar/rsscheck.py | 110 ++++++++++++++++++++++++++++++++++++
mylar/rsscheckit.py | 3 +
mylar/search.py | 25 +++++---
4 files changed, 220 insertions(+), 53 deletions(-)

diff --git a/mylar/getcomics.py b/mylar/getcomics.py
index 06da10b0..7c6fe2f4 100644
--- a/mylar/getcomics.py
+++ b/mylar/getcomics.py
@@ -140,7 +140,7 @@ class GC(object):
nwsize = size.find('//')
size = re.sub('\[', '', size[:nwsize]).strip()
else:
- size = '0 M'
+ size = '0M'
i+=1
dateline = f.find('time')
datefull = dateline['datetime']
@@ -163,15 +163,19 @@ class GC(object):
def parse_downloadresults(self, id, mainlink):
myDB = db.DBConnection()
+ series = None
+ year = None
+ size = None
title = os.path.join(mylar.CONFIG.CACHE_DIR, 'getcomics-' + id)
soup = BeautifulSoup(open(title+'.html'), 'html.parser')
orig_find = soup.find("p", {"style": "text-align: center;"})
i = 0
option_find = orig_find
+ possible_more = None
while True: #i <= 10:
prev_option = option_find
option_find = option_find.findNext(text=True)
- if i == 0:
+ if i == 0 and series is None:
series = option_find
elif 'Year' in option_find:
year = option_find.findNext(text=True)
@@ -189,24 +193,52 @@ class GC(object):
for f in soup.findAll("div", {"class": "aio-pulse"}):
lk = f.find('a')
if lk['title'] == 'Download Now':
- link = lk['href']
- site = lk['title']
+ link = {"series": series,
+ "site": lk['title'],
+ "year": year,
+ "issues": None,
+ "size": size,
+ "link": lk['href']}
+ break #get the first link just to test
links = []
if link is None and possible_more.name == 'ul':
- bb = possible_more.findAll('li')
- for x in bb:
- volume = x.findNext(text=True)
- if u'\u2013' in volume:
- volume = re.sub(u'\u2013', '-', volume)
- linkline = x.find('a')
- link = linkline['href']
- site = linkline.findNext(text=True)
- links.append({"volume": volume,
- "site": site,
- "link": link})
+ try:
+ bb = possible_more.findAll('li')
+ except:
+ pass
+ else:
+ for x in bb:
+ linkline = x.find('a')
+ if linkline:
+ if 'go.php' in linkline['href']:
+ volume = x.findNext(text=True)
+ if u'\u2013' in volume:
+ volume = re.sub(u'\u2013', '-', volume)
+ #volume label contains series, issue(s), year(s), and size
+ series_st = volume.find('(')
+ issues_st = volume.find('#')
+ series = volume[:series_st]
+ if any([issues_st == -1, series_st == -1]):
+ issues = None
+ else:
+ series = volume[:issues_st].strip()
+ issues = volume[issues_st+1:series_st].strip()
+ year_end = volume.find(')', series_st+1)
+ year = re.sub('[\(\)]', '', volume[series_st+1: year_end]).strip()
+ size_end = volume.find(')', year_end+1)
+ size = re.sub('[\(\)]', '', volume[year_end+1: size_end]).strip()
+ linked = linkline['href']
+ site = linkline.findNext(text=True)
+ if site == 'Main Server':
+ links.append({"series": series,
+ "site": site,
+ "year": year,
+ "issues": issues,
+ "size": size,
+ "link": linked})
else:
check_extras = soup.findAll("h3")
for sb in check_extras:
@@ -220,40 +252,52 @@ class GC(object):
if u'\u2013' in volume:
volume = re.sub(u'\u2013', '-', volume)
linkline = x.find('a')
- link = linkline['href']
+ linked = linkline['href']
site = linkline.findNext(text=True)
links.append({"volume": volume,
"site": site,
- "link": link})
+ "link": linked})
- if link is None:
+ if all([link is None, len(links) == 0]):
logger.warn('Unable to retrieve any valid immediate download links. They might not exist.')
return {'success': False}
-
+ if all([link is not None, len(links) == 0]):
+ logger.info('Only one item discovered, changing queue length to accommodate: %s [%s]' % (link, type(link)))
+ links = [link]
+ elif len(links) > 0:
+ if len(links) > 1:
+ logger.info('[DDL-QUEUER] This pack has been broken up into %s separate packs - queueing each in sequence for your enjoyment.' % len(links))
+ cnt = 1
for x in links:
- logger.fdebug('[%s] %s - %s' % (x['site'], x['volume'], x['link']))
+ if len(links) == 1:
+ mod_id = id
+ else:
+ mod_id = id+'-'+str(cnt)
+ #logger.fdebug('[%s] %s (%s) %s [%s][%s]' % (x['site'], x['series'], x['year'], x['issues'], x['size'], x['link']))
- ctrlval = {'id': id}
- vals = {'series': series,
- 'year': year,
- 'size': size,
- 'issueid': self.issueid,
- 'comicid': self.comicid,
- 'link': link,
- 'mainlink': mainlink,
- 'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
- 'status': 'Queued'}
- myDB.upsert('ddl_info', vals, ctrlval)
+ ctrlval = {'id': mod_id}
+ vals = {'series': x['series'],
+ 'year': x['year'],
+ 'size': x['size'],
+ 'issues': x['issues'],
+ 'issueid': self.issueid,
+ 'comicid': self.comicid,
+ 'link': x['link'],
+ 'mainlink': mainlink,
+ 'updated_date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M'),
+ 'status': 'Queued'}
+ myDB.upsert('ddl_info', vals, ctrlval)
- mylar.DDL_QUEUE.put({'link': link,
- 'mainlink': mainlink,
- 'series': series,
- 'year': year,
- 'size': size,
- 'comicid': self.comicid,
- 'issueid': self.issueid,
- 'id': id,
- 'resume': None})
+ mylar.DDL_QUEUE.put({'link': x['link'],
+ 'mainlink': mainlink,
+ 'series': x['series'],
+ 'year': x['year'],
+ 'size': x['size'],
+ 'comicid': self.comicid,
+ 'issueid': self.issueid,
+ 'id': mod_id,
+ 'resume': None})
+ cnt+=1
return {'success': True}
@@ -275,20 +319,23 @@ class GC(object):
t = s.get(link, verify=True, cookies=cf_cookievalue, headers=self.headers, stream=True)
filename = os.path.basename(urllib.unquote(t.url).decode('utf-8'))
+ if 'GetComics.INFO' in filename:
+ filename = re.sub('GetComics.INFO', '', filename, flags=re.I).strip()
try:
remote_filesize = int(t.headers['Content-length'])
logger.fdebug('remote filesize: %s' % remote_filesize)
except Exception as e:
- logger.warn('[WARNING] Unable to retrieve remote file size. Error returned as : %s' % e)
+ logger.warn('[WARNING] Unable to retrieve remote file size - this is usually due to the page being behind a different click-bait/ad page. Error returned as: %s' % e)
+ logger.warn('[WARNING] Considering this particular download as invalid and will ignore this result.')
remote_filesize = 0
mylar.DDL_LOCK = False
return ({"success": False,
"filename": filename,
"path": None})
- else:
- #write the filename to the db for tracking purposes...
- myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
+
+ #write the filename to the db for tracking purposes...
+ myDB.upsert('ddl_info', {'filename': filename, 'remote_filesize': remote_filesize}, {'id': id})
path = os.path.join(mylar.CONFIG.DDL_LOCATION, filename)
diff --git a/mylar/rsscheck.py b/mylar/rsscheck.py
index d753ddb4..cb7859b0 100755
--- a/mylar/rsscheck.py
+++ b/mylar/rsscheck.py
@@ -24,6 +24,7 @@ from datetime import datetime, timedelta
import gzip
import time
import random
+from bs4 import BeautifulSoup
from StringIO import StringIO

import mylar
@@ -384,6 +385,78 @@ def torrents(pickfeed=None, seriesname=None, issue=None, feedinfo=None):
return torinfo
return

+def ddl(forcerss=False):
+ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'}
+ ddl_feed = 'https://getcomics.info/feed/'
+ try:
+ r = requests.get(ddl_feed, verify=True, headers=headers)
+ except Exception, e:
+ logger.warn('Error fetching RSS Feed Data from DDL: %s' % (e))
+ return False
+ else:
+ if r.status_code != 200:
+ #typically 403 will not return results, but just catch anything other than a 200
+ if r.status_code == 403:
+ logger.warn('ERROR - status code: %s' % r.status_code)
+ return False
+ else:
+ logger.warn('Status code returned: %s' % r.status_code)
+ return False
+
+ feedme = feedparser.parse(r.content)
+ results = []
+ for entry in feedme.entries:
+ soup = BeautifulSoup(entry.summary, 'html.parser')
+ orig_find = soup.find("p", {"style": "text-align: center;"})
+ i = 0
+ option_find = orig_find
+ while True: #i <= 10:
+ prev_option = option_find
+ option_find = option_find.findNext(text=True)
+ if 'Year' in option_find:
+ year = option_find.findNext(text=True)
+ year = re.sub('\|', '', year).strip()
+ else:
+ if 'Size' in prev_option:
+ size = option_find #.findNext(text=True)
+ if '- MB' in size: size = '0 MB'
+ possible_more = orig_find.next_sibling
+ break
+ i+=1
+
+ link = entry.link
+ title = entry.title
+ updated = entry.updated
+ if updated.endswith('+0000'):
+ updated = updated[:-5].strip()
+ tmpid = entry.id
+ id = tmpid[tmpid.find('=')+1:]
+ if 'KB' in size:
+ szform = 'KB'
+ sz = 'K'
+ elif 'GB' in size:
+ szform = 'GB'
+ sz = 'G'
+ elif 'MB' in size:
+ szform = 'MB'
+ sz = 'M'
+ elif 'TB' in size:
+ szform = 'TB'
+ sz = 'T'
+ tsize = helpers.human2bytes(re.sub('[^0-9]', '', size).strip() + sz)
+
+ #link can be referenced with the ?p=id url
+ results.append({'Title': title,
+ 'Size': tsize,
+ 'Link': id,
+ 'Site': 'DDL',
+ 'Pubdate': updated})
+
+ if len(results) > 0:
+ logger.info('[RSS][DDL] %s entries have been indexed and are now going to be stored for caching.' % len(results))
+ rssdbupdate(results, len(results), 'ddl')
+
+ return

def nzbs(provider=None, forcerss=False):
@@ -569,6 +642,43 @@ def rssdbupdate(feeddata, i, type):
logger.fdebug('Completed adding new data to RSS DB. Next add in ' + str(mylar.CONFIG.RSS_CHECKINTERVAL) + ' minutes')
return

+def ddl_dbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False):
+ myDB = db.DBConnection()
+ seriesname_alt = None
+ if any([comicid is None, comicid == 'None', oneoff is True]):
+ pass
+ else:
+ snm = myDB.selectone("SELECT * FROM comics WHERE comicid=?", [comicid]).fetchone()
+ if snm is None:
+ logger.fdebug('Invalid ComicID of %s. 
Aborting search' % comicid) + return "no results" + else: + seriesname = snm['ComicName'] + seriesname_alt = snm['AlternateSearch'] + + dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower()) + dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower()) + dsearch_removed = re.sub('\s+', ' ', dsearch_rem2) + dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed) + dsearch = '%' + dsearch_seriesname + '%' + dresults = myDB.select("SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'", [dsearch]) + ddltheinfo = [] + ddlinfo = {} + if not dresults: + return "no results" + else: + for dl in dresults: + ddltheinfo.append({ + 'title': dl['Title'], + 'link': dl['Link'], + 'pubdate': dl['Pubdate'], + 'site': dl['Site'], + 'length': dl['Size'] + }) + + ddlinfo['entries'] = ddltheinfo + + return ddlinfo def torrentdbsearch(seriesname, issue, comicid=None, nzbprov=None, oneoff=False): myDB = db.DBConnection() diff --git a/mylar/rsscheckit.py b/mylar/rsscheckit.py index 930e3115..e7bab64e 100755 --- a/mylar/rsscheckit.py +++ b/mylar/rsscheckit.py @@ -91,6 +91,9 @@ class tehMain(): logger.info('[RSS-FEEDS] Initiating RSS Feed Check for NZB Providers.') rsscheck.nzbs(forcerss=forcerss) + if mylar.CONFIG.ENABLE_DDL is True: + logger.info('[RSS-FEEDS] Initiating RSS Feed Check for DDL Provider.') + rsscheck.ddl(forcerss=forcerss) logger.info('[RSS-FEEDS] RSS Feed Check/Update Complete') logger.info('[RSS-FEEDS] Watchlist Check for new Releases') mylar.search.searchforissue(rsscheck='yes') diff --git a/mylar/search.py b/mylar/search.py index 7020dd2d..e3167222 100755 --- a/mylar/search.py +++ b/mylar/search.py @@ -254,7 +254,6 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD c_number = c_number[:decst].rstrip() while (srchloop <= searchcnt): - logger.fdebug('srchloop: %s' % srchloop) #searchmodes: # rss - will run through the built-cached db of entries # api - will run through the providers via api (or non-api in the case of Experimental) @@ -334,9 +333,9 @@ def search_init(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueD prov_count+=1 continue if searchmode == 'rss': - if searchprov.lower() == 'ddl': - prov_count+=1 - continue + #if searchprov.lower() == 'ddl': + # prov_count+=1 + # continue findit = NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDate, StoreDate, searchprov, send_prov_count, IssDateFix, IssueID, UseFuzzy, newznab_host, ComicVersion=ComicVersion, SARC=SARC, IssueArcID=IssueArcID, RSS="yes", ComicID=ComicID, issuetitle=issuetitle, unaltered_ComicName=unaltered_ComicName, oneoff=oneoff, cmloopit=cmloopit, manual=manual, torznab_host=torznab_host, digitaldate=digitaldate, booktype=booktype) if findit['status'] is False: if AlternateSearch is not None and AlternateSearch != "None": @@ -581,7 +580,7 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa foundc['status'] = False done = True break - if any([nzbprov == '32P', nzbprov == 'Public Torrents']): + if any([nzbprov == '32P', nzbprov == 'Public Torrents', nzbprov == 'ddl']): #because 32p directly stores the exact issue, no need to worry about iterating over variations of the issue number. 
findloop == 99 @@ -619,14 +618,17 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa #logger.fdebug('RSS Check: %s' % RSS) #logger.fdebug('nzbprov: %s' % nzbprov) #logger.fdebug('comicid: %s' % ComicID) - if nzbprov == 'ddl': + if nzbprov == 'ddl' and RSS == "no": cmname = re.sub("%20", " ", str(comsrc)) logger.fdebug('Sending request to DDL site for : %s %s' % (findcomic, isssearch)) b = getcomics.GC(query='%s %s' % (findcomic, isssearch)) bb = b.search() #logger.info('bb returned from DDL: %s' % bb) elif RSS == "yes": - if nzbprov == '32P' or nzbprov == 'Public Torrents': + if nzbprov == 'ddl': + logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch)) + bb = rsscheck.ddl_dbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff) + elif nzbprov == '32P' or nzbprov == 'Public Torrents': cmname = re.sub("%20", " ", str(comsrc)) logger.fdebug('Sending request to [%s] RSS for %s : %s' % (nzbprov, ComicName, mod_isssearch)) bb = rsscheck.torrentdbsearch(ComicName, mod_isssearch, ComicID, nzbprov, oneoff) @@ -1389,7 +1391,13 @@ def NZB_SEARCH(ComicName, IssueNumber, ComicYear, SeriesYear, Publisher, IssueDa nowrite = False if all([nzbprov == 'torznab', 'worldwidetorrents' in entry['link']]): nzbid = generate_id(nzbprov, entry['id']) - elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]): + elif all([nzbprov == 'ddl', 'getcomics' in entry['link']]) or all([nzbprov == 'ddl', RSS == 'yes']): + if RSS == "yes": + entry['id'] = entry['link'] + entry['link'] = 'https://getcomics.info/?p='+str(entry['id']) + entry['filename'] = entry['title'] + if '/cat/' in entry['link']: + entry['link'] = 'https://getcomics.info/?p='+str(entry['id']) nzbid = entry['id'] entry['title'] = entry['filename'] else: @@ -2318,7 +2326,6 @@ def searcher(nzbprov, nzbname, comicinfo, link, IssueID, ComicID, tmpprov, direc ggc = getcomics.GC(issueid=IssueID, comicid=ComicID) sendsite = ggc.loadsite(nzbid, link) ddl_it = ggc.parse_downloadresults(nzbid, link) - logger.info("ddl status response: %s" % ddl_it) if ddl_it['success'] is True: logger.info('Successfully snatched %s from DDL site. It is currently being queued to download in position %s' % (nzbname, mylar.DDL_QUEUE.qsize())) else:
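
Note on the link-label parsing in parse_downloadresults() above: the series, issue range, year, and size are derived purely from the positions of '#' and the parentheses in the label text of each 'li' entry. A minimal standalone sketch of that slicing, using an invented sample label (the string below is illustrative only, not taken from the site):

import re

# Hypothetical label in the shape the parser expects:
# "<series> #<issues> (<years>) (<size>)"
volume = u'Example Series Vol. 2 #1 - 10 (2016-2017) (1.2 GB)'

series_st = volume.find('(')            # start of the (year) segment
issues_st = volume.find('#')            # start of the issue range, if present
series = volume[:series_st]
if any([issues_st == -1, series_st == -1]):
    issues = None
else:
    series = volume[:issues_st].strip()
    issues = volume[issues_st+1:series_st].strip()
year_end = volume.find(')', series_st+1)
year = re.sub('[\(\)]', '', volume[series_st+1:year_end]).strip()
size_end = volume.find(')', year_end+1)
size = re.sub('[\(\)]', '', volume[year_end+1:size_end]).strip()

print('%s | %s | %s | %s' % (series, issues, year, size))
# Example Series Vol. 2 | 1 - 10 | 2016-2017 | 1.2 GB

If a label carries no '#' issue range, issues stays None and series keeps everything up to the first parenthesis, which mirrors the guard in the patched code.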
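
The rsscheck.ddl() feed parser above normalizes the scraped size string before handing it to helpers.human2bytes(). A small sketch of that step with an illustrative feed value; the helper itself is part of mylar and is not re-implemented here:

import re

size = '150 MB'   # illustrative value scraped from the feed summary

# map the unit in the scraped text to the single-letter suffix human2bytes expects
if 'KB' in size:
    sz = 'K'
elif 'GB' in size:
    sz = 'G'
elif 'MB' in size:
    sz = 'M'
elif 'TB' in size:
    sz = 'T'

normalized = re.sub('[^0-9]', '', size).strip() + sz
print(normalized)   # '150M' -- this is the string passed to helpers.human2bytes()

Note that the digit-only strip also removes any decimal point, so a feed size of '1.5 GB' would normalize to '15G'.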
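
ddl_dbsearch() above reduces the series name to a SQL LIKE pattern by blanking out 'and'/'the' and punctuation with '%' wildcards before querying the cached rssdb entries. A standalone sketch of that transformation with an illustrative series name:

import re

seriesname = 'The Amazing Spider-Man and Friends'   # illustrative series name

dsearch_rem1 = re.sub("\\band\\b", "%", seriesname.lower())
dsearch_rem2 = re.sub("\\bthe\\b", "%", dsearch_rem1.lower())
dsearch_removed = re.sub('\s+', ' ', dsearch_rem2)
dsearch_seriesname = re.sub('[\'\!\@\#\$\%\:\-\;\/\\=\?\&\.\s\,]', '%', dsearch_removed)
dsearch = '%' + dsearch_seriesname + '%'

print(dsearch)
# %%%amazing%spider%man%%%friends%  -- used in: SELECT * FROM rssdb WHERE Title like ? AND Site='DDL'

Consecutive '%' wildcards are redundant but harmless in a LIKE comparison.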