mylar/mylar/wwt.py

#!/usr/bin/env python
# This file is part of Mylar.
#
# Mylar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Mylar is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Mylar. If not, see <http://www.gnu.org/licenses/>.
import lib.requests as requests
from bs4 import BeautifulSoup, UnicodeDammit
import urlparse
import re
import time
import sys
import datetime
from datetime import timedelta
import mylar
from mylar import logger, helpers
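

# Provider module for the WorldWideTorrents (WWT) public torrent site: wwt builds
# a '<series name> <issue number>' query, scrapes the site's search results
# (following any additional result pages), and returns the matches as
# {'entries': [...]} for Mylar's search workflow to consume.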
class wwt(object):

    def __init__(self, name, issue):
        self.url = 'https://worldwidetorrents.me/'
        self.query = name + ' ' + str(int(issue))   #'Batman White Knight'
        logger.info('query set to : %s' % self.query)
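
    # wwt_connect() drives the lookup: it submits the query to torrents-search.php
    # (comics category, cat=132), collects any pagination links from the centered
    # <p> element, and feeds each page of results through wwt_data(), waiting 5s
    # between page requests.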
    def wwt_connect(self):
        resultlist = None
        params = {'c50': 1,
                  'search': self.query,
                  'cat': 132,
                  'incldead': 0,
                  'lang': 0}

        with requests.Session() as s:
            newurl = self.url + 'torrents-search.php'
            r = s.get(newurl, params=params, verify=True)
            if not r.status_code == 200:
                return
            logger.info('status code: %s' % r.status_code)
            soup = BeautifulSoup(r.content, "html5lib")

            resultpages = soup.find("p", {"align": "center"})
            try:
                pagelist = resultpages.findAll("a")
            except:
                logger.info('No results found for %s' % self.query)
                return

            pages = []
            for p in pagelist:
                if p['href'] not in pages:
                    logger.fdebug('page: %s' % p['href'])
                    pages.append(p['href'])

            logger.fdebug('pages: %s' % (len(pages) + 1))

            resultlist = self.wwt_data(soup)

            if pages:
                for p in pages:
                    time.sleep(5)   # 5s delay btwn requests
                    newurl = self.url + str(p)
                    r = s.get(newurl, params=params, verify=True)
                    if not r.status_code == 200:
                        continue
                    soup = BeautifulSoup(r.content, "html5lib")
                    resultlist += self.wwt_data(soup)

        logger.fdebug('%s results: %s' % (len(resultlist), resultlist))

        res = {}
        if len(resultlist) >= 1:
            res['entries'] = resultlist
        return res
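
    # wwt_data() walks every row of the results table, pulling the torrent title,
    # the id from the download.php link, the size (MB/GB converted to bytes via
    # helpers.human2bytes), the seeder/leecher counts, and the relative age, which
    # string_to_delta() turns into a publish date.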
    def wwt_data(self, data):
        resultw = data.find("table", {"class": "w3-table w3-striped w3-bordered w3-card-4"})
        resultp = resultw.findAll("tr")

        #final = []
        results = []
        for res in resultp:
            if res.findNext(text=True) == 'Torrents Name':
                continue
            title = res.find('a')
            torrent = title['title']
            try:
                for link in res.find_all('a', href=True):
                    if link['href'].startswith('download.php'):
                        linkurl = urlparse.parse_qs(urlparse.urlparse(link['href']).query)['id']
                        #results = {'torrent': torrent,
                        #           'link': link['href']}
                        break

                for td in res.findAll('td'):
                    try:
                        seed = td.find("font", {"color": "green"})
                        leech = td.find("font", {"color": "#ff0000"})
                        value = td.findNext(text=True)
                        if any(['MB' in value, 'GB' in value]):
                            if 'MB' in value:
                                szform = 'MB'
                                sz = 'M'
                            else:
                                szform = 'GB'
                                sz = 'G'
                            size = helpers.human2bytes(str(re.sub(szform, '', value)).strip() + sz)
                        elif seed is not None:
                            seeders = value
                            #results['seeders'] = seeders
                        elif leech is not None:
                            leechers = value
                            #results['leechers'] = leechers
                        else:
                            age = value
                            #results['age'] = age
                    except Exception as e:
                        logger.warn('exception: %s' % e)

                logger.info('age: %s' % age)
                results.append({'title':   torrent,
                                'link':    ''.join(linkurl),
                                'pubdate': self.string_to_delta(age),
                                'size':    size,
                                'site':    'WWT'})

                logger.info('results: %s' % results)
            except Exception as e:
                logger.warn('Error: %s' % e)
                continue
            #else:
            #    final.append(results)

        return results
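
    # string_to_delta() converts a relative age such as '3 days ago' or '2 wks ago'
    # into an approximate absolute timestamp ('%a, %d %b %Y %H:%M:%S') by mapping
    # each unit token onto days/seconds/microseconds and subtracting the resulting
    # timedelta from now.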
    def string_to_delta(self, relative):
        # simplistic mapping: no leap years, and a month is treated as 30 days long.
        # WARNING: 12 months != 1 year
        logger.info('trying to remap date from %s' % relative)
        unit_mapping = [('mic', 'microseconds', 1),
                        ('millis', 'microseconds', 1000),
                        ('sec', 'seconds', 1),
                        ('mins', 'seconds', 60),
                        ('hrs', 'seconds', 3600),
                        ('day', 'days', 1),
                        ('wk', 'days', 7),
                        ('mon', 'days', 30),
                        ('year', 'days', 365)]
        try:
            tokens = relative.lower().split(' ')
            past = False
            if tokens[-1] == 'ago':
                past = True
                tokens = tokens[:-1]
            elif tokens[0] == 'in':
                tokens = tokens[1:]

            units = dict(days=0, seconds=0, microseconds=0)
            # we should always get pairs; if not, we let this die and throw an exception
            while len(tokens) > 0:
                value = tokens.pop(0)
                if value == 'and':   # just skip this token
                    continue
                else:
                    value = float(value)
                    unit = tokens.pop(0)
                    for match, time_unit, time_constant in unit_mapping:
                        if unit.startswith(match):
                            units[time_unit] += value * time_constant

            #print datetime.timedelta(**units), past
            val = datetime.datetime.now() - datetime.timedelta(**units)
            return datetime.datetime.strftime(val, '%a, %d %b %Y %H:%M:%S')
        except Exception as e:
            raise ValueError("Don't know how to parse %s: %s" % (relative, e))
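

# A minimal usage sketch (hypothetical series/issue; within Mylar this class is
# normally driven by the search/backlog code rather than called directly):
#
#   searcher = wwt('Batman White Knight', 2)
#   feed = searcher.wwt_connect()
#   if feed and 'entries' in feed:
#       for entry in feed['entries']:
#           logger.info('%s | %s bytes | id %s' % (entry['title'], entry['size'], entry['link']))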