Merge pull request #165 from blackjack4494/master

Release 2020.11.11 Happy Singles Day
This commit is contained in:
Tom-Oliver Heidel 2020-11-11 00:22:02 +01:00 committed by GitHub
commit 98d9618f37
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 275 additions and 211 deletions

View File

@ -82,7 +82,7 @@ jobs:
- name: Set up Python - name: Set up Python
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
python-version: '3.x' python-version: '3.8'
- name: Install Requirements - name: Install Requirements
run: pip install pyinstaller run: pip install pyinstaller
- name: Bump version - name: Bump version
@ -116,7 +116,7 @@ jobs:
- name: Set up Python 3.5.4 32-Bit - name: Set up Python 3.5.4 32-Bit
uses: actions/setup-python@v2 uses: actions/setup-python@v2
with: with:
python-version: '3.5.4' python-version: '3.4.4'
architecture: 'x86' architecture: 'x86'
- name: Install Requirements for 32 Bit - name: Install Requirements for 32 Bit
run: pip install pyinstaller==3.5 run: pip install pyinstaller==3.5

View File

@ -364,8 +364,10 @@ class FileDownloader(object):
else '%.2f' % sleep_interval)) else '%.2f' % sleep_interval))
time.sleep(sleep_interval) time.sleep(sleep_interval)
else: else:
if self.params.get('sleep_interval_subtitles') > 0: sleep_interval_sub = 0
if type(self.params.get('sleep_interval_subtitles')) is int:
sleep_interval_sub = self.params.get('sleep_interval_subtitles') sleep_interval_sub = self.params.get('sleep_interval_subtitles')
if sleep_interval_sub > 0:
self.to_screen( self.to_screen(
'[download] Sleeping %s seconds...' % ( '[download] Sleeping %s seconds...' % (
sleep_interval_sub)) sleep_interval_sub))

View File

@ -115,8 +115,10 @@ class CurlFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict): def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '--location', '-o', tmpfilename] cmd = [self.exe, '--location', '-o', tmpfilename]
for key, val in info_dict['http_headers'].items(): if info_dict.get('http_headers') is not None:
cmd += ['--header', '%s: %s' % (key, val)] for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
cmd += self._valueless_option('--silent', 'noprogress') cmd += self._valueless_option('--silent', 'noprogress')
cmd += self._valueless_option('--verbose', 'verbose') cmd += self._valueless_option('--verbose', 'verbose')
@ -150,8 +152,9 @@ class AxelFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict): def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-o', tmpfilename] cmd = [self.exe, '-o', tmpfilename]
for key, val in info_dict['http_headers'].items(): if info_dict.get('http_headers') is not None:
cmd += ['-H', '%s: %s' % (key, val)] for key, val in info_dict['http_headers'].items():
cmd += ['-H', '%s: %s' % (key, val)]
cmd += self._configuration_args() cmd += self._configuration_args()
cmd += ['--', info_dict['url']] cmd += ['--', info_dict['url']]
return cmd return cmd
@ -162,8 +165,9 @@ class WgetFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict): def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies'] cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
for key, val in info_dict['http_headers'].items(): if info_dict.get('http_headers') is not None:
cmd += ['--header', '%s: %s' % (key, val)] for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit') cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries') retry = self._option('--tries', 'retries')
if len(retry) == 2: if len(retry) == 2:
@ -189,8 +193,9 @@ class Aria2cFD(ExternalFD):
if dn: if dn:
cmd += ['--dir', dn] cmd += ['--dir', dn]
cmd += ['--out', os.path.basename(tmpfilename)] cmd += ['--out', os.path.basename(tmpfilename)]
for key, val in info_dict['http_headers'].items(): if info_dict.get('http_headers') is not None:
cmd += ['--header', '%s: %s' % (key, val)] for key, val in info_dict['http_headers'].items():
cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--interface', 'source_address') cmd += self._option('--interface', 'source_address')
cmd += self._option('--all-proxy', 'proxy') cmd += self._option('--all-proxy', 'proxy')
cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=') cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
@ -206,8 +211,10 @@ class HttpieFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict): def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']] cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
for key, val in info_dict['http_headers'].items():
cmd += ['%s:%s' % (key, val)] if info_dict.get('http_headers') is not None:
for key, val in info_dict['http_headers'].items():
cmd += ['%s:%s' % (key, val)]
return cmd return cmd
@ -253,7 +260,7 @@ class FFmpegFD(ExternalFD):
# if end_time: # if end_time:
# args += ['-t', compat_str(end_time - start_time)] # args += ['-t', compat_str(end_time - start_time)]
if info_dict['http_headers'] and re.match(r'^https?://', url): if info_dict.get('http_headers') is not None and re.match(r'^https?://', url):
# Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv: # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
# [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header. # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
headers = handle_youtubedl_headers(info_dict['http_headers']) headers = handle_youtubedl_headers(info_dict['http_headers'])

View File

@ -82,7 +82,10 @@ class YoutubeLiveChatReplayFD(FragmentFD):
offset = int(replay_chat_item_action['videoOffsetTimeMsec']) offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
processed_fragment.extend( processed_fragment.extend(
json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation'] try:
continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
except KeyError:
continuation_id = None
self._append_fragment(ctx, processed_fragment) self._append_fragment(ctx, processed_fragment)

View File

@ -1175,6 +1175,7 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .thisvid import ThisVidIE
from .threeqsdn import ThreeQSDNIE from .threeqsdn import ThreeQSDNIE
from .tiktok import TikTokIE from .tiktok import TikTokIE
from .tinypic import TinyPicIE from .tinypic import TinyPicIE

View File

@ -0,0 +1,97 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
class ThisVidIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+/?)'
_TESTS = [{
'url': 'https://thisvid.com/videos/french-boy-pantsed/',
'md5': '3397979512c682f6b85b3b04989df224',
'info_dict': {
'id': '2400174',
'ext': 'mp4',
'title': 'French Boy Pantsed',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
'age_limit': 18,
}
}, {
'url': 'https://thisvid.com/embed/2400174/',
'md5': '3397979512c682f6b85b3b04989df224',
'info_dict': {
'id': '2400174',
'ext': 'mp4',
'title': 'French Boy Pantsed',
'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
'age_limit': 18,
}
}]
def _real_extract(self, url):
main_id = self._match_id(url)
webpage = self._download_webpage(url, main_id)
# URL decryptor was reversed from version 4.0.4, later verified working with 5.2.0 and may change in the future.
kvs_version = self._html_search_regex(r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">', webpage, 'kvs_version', fatal=False)
if not kvs_version.startswith("5."):
self.report_warning("Major version change (" + kvs_version + ") in player engine--Download may fail.")
title = self._html_search_regex(r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:.com| tube))?</title>', webpage, 'title')
# video_id, video_url and license_code from the 'flashvars' JSON object:
video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id')
video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url')
license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code')
thumbnail = self._html_search_regex(r"preview_url: '((?:https?:)?//media.thisvid.com/.+?.jpg)',", webpage, 'thumbnail', fatal=False)
if thumbnail.startswith("//"):
thumbnail = "https:" + thumbnail
if (re.match(self._VALID_URL, url).group('type') == "videos"):
display_id = main_id
else:
display_id = self._search_regex(r'<link rel="canonical" href="' + self._VALID_URL + r'">', webpage, 'display_id', fatal=False),
return {
'id': video_id,
'display_id': display_id,
'title': title,
'url': getrealurl(video_url, license_code),
'thumbnail': thumbnail,
'age_limit': 18,
}
def getrealurl(video_url, license_code):
urlparts = video_url.split('/')[2:]
license = getlicensetoken(license_code)
newmagic = urlparts[5][:32]
for o in range(len(newmagic) - 1, -1, -1):
new = ""
l = (o + sum([int(n) for n in license[o:]])) % 32
for i in range(0, len(newmagic)):
if i == o:
new += newmagic[l]
elif i == l:
new += newmagic[o]
else:
new += newmagic[i]
newmagic = new
urlparts[5] = newmagic + urlparts[5][32:]
return "/".join(urlparts)
def getlicensetoken(license):
modlicense = license.replace("$", "").replace("0", "1")
center = int(len(modlicense) / 2)
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
modlicense = str(4 * abs(fronthalf - backhalf))
retval = ""
for o in range(0, center + 1):
for i in range(1, 5):
retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
return retval

View File

@ -308,17 +308,26 @@ class VikiIE(VikiBaseIE):
'url': thumbnail.get('url'), 'url': thumbnail.get('url'),
}) })
new_video = self._download_json(
'https://www.viki.com/api/videos/%s' % video_id, video_id,
'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
subtitles = {} subtitles = {}
for sub in new_video.get('streamSubtitles').get('dash'): try:
subtitles[sub.get('srclang')] = [{ # New way to fetch subtitles
'ext': 'vtt', new_video = self._download_json(
'url': sub.get('src'), 'https://www.viki.com/api/videos/%s' % video_id, video_id,
'completion': sub.get('percentage'), 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
}] for sub in new_video.get('streamSubtitles').get('dash'):
subtitles[sub.get('srclang')] = [{
'ext': 'vtt',
'url': sub.get('src'),
'completion': sub.get('percentage'),
}]
except AttributeError:
# fall-back to the old way if there isn't a streamSubtitles attribute
for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
subtitles[subtitle_lang] = [{
'ext': subtitles_format,
'url': self._prepare_call(
'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
} for subtitles_format in ('srt', 'vtt')]
result = { result = {
'id': video_id, 'id': video_id,

View File

@ -36,6 +36,7 @@ from ..utils import (
get_element_by_attribute, get_element_by_attribute,
get_element_by_id, get_element_by_id,
int_or_none, int_or_none,
js_to_json,
mimetype2ext, mimetype2ext,
orderedSet, orderedSet,
parse_codecs, parse_codecs,
@ -70,6 +71,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_LOGIN_REQUIRED = False _LOGIN_REQUIRED = False
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
_INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
_YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"
_YOUTUBE_CLIENT_HEADERS = { _YOUTUBE_CLIENT_HEADERS = {
'x-youtube-client-name': '1', 'x-youtube-client-name': '1',
@ -274,7 +277,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs): def _download_webpage_handle(self, *args, **kwargs):
query = kwargs.get('query', {}).copy() query = kwargs.get('query', {}).copy()
query['disable_polymer'] = 'true'
kwargs['query'] = query kwargs['query'] = query
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs)) *args, **compat_kwargs(kwargs))
@ -297,15 +299,61 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
# Extract entries from page with "Load more" button
def _entries(self, page, playlist_id): def _find_entries_in_json(self, extracted):
more_widget_html = content_html = page entries = []
for page_num in itertools.count(1): c = {}
for entry in self._process_page(content_html):
def _real_find(obj):
if obj is None or isinstance(obj, str):
return
if type(obj) is list:
for elem in obj:
_real_find(elem)
if type(obj) is dict:
if self._is_entry(obj):
entries.append(obj)
return
if 'continuationCommand' in obj:
c['continuation'] = obj
return
for _, o in obj.items():
_real_find(o)
_real_find(extracted)
return entries, try_get(c, lambda x: x["continuation"])
def _entries(self, page, playlist_id, max_pages=None):
seen = []
yt_conf = {}
for m in re.finditer(self._YTCFG_DATA_RE, page):
parsed = self._parse_json(m.group(1), playlist_id,
transform_source=js_to_json, fatal=False)
if parsed:
yt_conf.update(parsed)
data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)
for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1):
entries, continuation = self._find_entries_in_json(data_json)
processed = self._process_entries(entries, seen)
if not processed:
break
for entry in processed:
yield entry yield entry
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) if not continuation or not yt_conf:
if not mobj: break
continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
if not continuation_token or not continuation_url:
break break
count = 0 count = 0
@ -314,12 +362,23 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
try: try:
# Downloading page may result in intermittent 5xx HTTP error # Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry # that is usually worked around with a retry
more = self._download_json( data_json = self._download_json(
'https://www.youtube.com/%s' % mobj.group('more'), playlist_id, 'https://www.youtube.com%s' % continuation_url,
'Downloading page #%s%s' playlist_id,
% (page_num, ' (retry #%d)' % count if count else ''), 'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
transform_source=uppercase_escape, transform_source=uppercase_escape,
headers=self._YOUTUBE_CLIENT_HEADERS) query={
'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
},
data=bytes(json.dumps({
'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
'continuation': continuation_token
}), encoding='utf-8'),
headers={
'Content-Type': 'application/json'
}
)
break break
except ExtractorError as e: except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
@ -328,31 +387,30 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
continue continue
raise raise
content_html = more['content_html'] def _extract_title(self, renderer):
if not content_html.strip(): title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
# Some webpages show a "Load more" button but they don't if title:
# have more videos return title
break return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)
more_widget_html = more['load_more_widget_html']
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
def _process_page(self, content): def _is_entry(self, obj):
for video_id, video_title in self.extract_videos_from_page(content): return 'videoId' in obj
yield self.url_result(video_id, 'Youtube', video_id, video_title)
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): def _process_entries(self, entries, seen):
for mobj in re.finditer(video_re, page): ids_in_page = []
# The link with index 0 is not the first video of the playlist (not sure if still actual) titles_in_page = []
if 'index' in mobj.groupdict() and mobj.group('id') == '0': for renderer in entries:
video_id = try_get(renderer, lambda x: x['videoId'])
video_title = self._extract_title(renderer)
if video_id is None or video_title is None:
# we do not have a videoRenderer or title extraction broke
continue continue
video_id = mobj.group('id')
video_title = unescapeHTML( video_title = video_title.strip()
mobj.group('title')) if 'title' in mobj.groupdict() else None
if video_title:
video_title = video_title.strip()
if video_title == '► Play all':
video_title = None
try: try:
idx = ids_in_page.index(video_id) idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]: if video_title and not titles_in_page[idx]:
@ -361,19 +419,17 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
ids_in_page.append(video_id) ids_in_page.append(video_id)
titles_in_page.append(video_title) titles_in_page.append(video_title)
def extract_videos_from_page(self, page): for video_id, video_title in zip(ids_in_page, titles_in_page):
ids_in_page = [] yield self.url_result(video_id, 'Youtube', video_id, video_title)
titles_in_page = []
self.extract_videos_from_page_impl(
self._VIDEO_RE, page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
def _process_page(self, content): def _is_entry(self, obj):
for playlist_id in orderedSet(re.findall( return 'playlistId' in obj
r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
content)): def _process_entries(self, entries, seen):
for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries):
yield self.url_result( yield self.url_result(
'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
@ -3240,11 +3296,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
}] }]
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor):
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
IE_DESC = 'YouTube.com searches' IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for # there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results # 'python' you get more than 8.000.000 results
@ -3341,11 +3393,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
_SEARCH_PARAMS = 'CAI%3D' _SEARCH_PARAMS = 'CAI%3D'
class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com search URLs' IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url' IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
_SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
_TESTS = [{ _TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5, 'playlist_mincount': 5,
@ -3357,63 +3408,20 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _find_videos_in_json(self, extracted): def _process_json_dict(self, obj, videos, c):
videos = [] if "videoId" in obj:
videos.append(obj)
return
def _real_find(obj): if "nextContinuationData" in obj:
if obj is None or isinstance(obj, str): c["continuation"] = obj["nextContinuationData"]
return return
if type(obj) is list:
for elem in obj:
_real_find(elem)
if type(obj) is dict:
if "videoId" in obj:
videos.append(obj)
return
for _, o in obj.items():
_real_find(o)
_real_find(extracted)
return videos
def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)
result_items = self._find_videos_in_json(search_response)
for renderer in result_items:
video_id = try_get(renderer, lambda x: x['videoId'])
video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText'])
if video_id is None or video_title is None:
# we do not have a videoRenderer or title extraction broke
continue
video_title = video_title.strip()
try:
idx = ids_in_page.index(video_id)
if video_title and not titles_in_page[idx]:
titles_in_page[idx] = video_title
except ValueError:
ids_in_page.append(video_id)
titles_in_page.append(video_title)
def extract_videos_from_page(self, page):
ids_in_page = []
titles_in_page = []
self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
return zip(ids_in_page, titles_in_page)
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
query = compat_urllib_parse_unquote_plus(mobj.group('query')) query = compat_urllib_parse_unquote_plus(mobj.group('query'))
webpage = self._download_webpage(url, query) webpage = self._download_webpage(url, query)
return self.playlist_result(self._process_page(webpage), playlist_title=query) return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query)
class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
@ -3435,14 +3443,12 @@ class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
'https://www.youtube.com/show/%s/playlists' % playlist_id) 'https://www.youtube.com/show/%s/playlists' % playlist_id)
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):
""" """
Base class for feed extractors Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
""" """
_LOGIN_REQUIRED = True _LOGIN_REQUIRED = True
_FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
_YTCFG_DATA = r"ytcfg.set\(({.*?})\)"
@property @property
def IE_NAME(self): def IE_NAME(self):
@ -3451,96 +3457,35 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _find_videos_in_json(self, extracted): def _process_entries(self, entries, seen):
videos = [] new_info = []
c = {} for v in entries:
v_id = try_get(v, lambda x: x['videoId'])
if not v_id:
continue
def _real_find(obj): have_video = False
if obj is None or isinstance(obj, str): for old in seen:
return if old['videoId'] == v_id:
have_video = True
break
if type(obj) is list: if not have_video:
for elem in obj: new_info.append(v)
_real_find(elem)
if type(obj) is dict: if not new_info:
if "videoId" in obj: return
videos.append(obj)
return
if "nextContinuationData" in obj: seen.extend(new_info)
c["continuation"] = obj["nextContinuationData"] for video in new_info:
return yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video))
for _, o in obj.items():
_real_find(o)
_real_find(extracted)
return videos, try_get(c, lambda x: x["continuation"])
def _entries(self, page):
info = []
yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)
search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)
for page_num in itertools.count(1):
video_info, continuation = self._find_videos_in_json(search_response)
new_info = []
for v in video_info:
v_id = try_get(v, lambda x: x['videoId'])
if not v_id:
continue
have_video = False
for old in info:
if old['videoId'] == v_id:
have_video = True
break
if not have_video:
new_info.append(v)
if not new_info:
break
info.extend(new_info)
for video in new_info:
yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText']))
if not continuation or not yt_conf:
break
search_response = self._download_json(
'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
transform_source=uppercase_escape,
query={
"ctoken": try_get(continuation, lambda x: x["continuation"]),
"continuation": try_get(continuation, lambda x: x["continuation"]),
"itct": try_get(continuation, lambda x: x["clickTrackingParams"])
},
headers={
"X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
"X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
"X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
"X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
"X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
"X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
"X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
})
def _real_extract(self, url): def _real_extract(self, url):
page = self._download_webpage( page = self._download_webpage(
'https://www.youtube.com/feed/%s' % self._FEED_NAME, 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
self._PLAYLIST_TITLE) self._PLAYLIST_TITLE)
return self.playlist_result( return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE),
self._entries(page), playlist_title=self._PLAYLIST_TITLE) playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE): class YoutubeWatchLaterIE(YoutubePlaylistIE):

View File

@ -13,7 +13,7 @@ from ..utils import (
class ZoomIE(InfoExtractor): class ZoomIE(InfoExtractor):
IE_NAME = 'zoom' IE_NAME = 'zoom'
_VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)' _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)'
_TEST = { _TEST = {
'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK', 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK',