From b860e4cc2f53c7858054f73928f51188ea6b49b8 Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Sun, 8 Nov 2020 08:36:26 +0100
Subject: [PATCH 01/12] [common] Make sure
 self.params.get('sleep_interval_subtitles') is int

This can happen if other software is using yt-dlc's API (e.g. tubeup).

The stack trace would be:

$ tubeup 'https://youtube.com/watch?v=JyE9OF03cao'
[debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8
[debug] youtube-dlc version 2020.10.25
[debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6
[debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9
[debug] Proxy map: {}
There are no annotations to write.
ERROR: '>' not supported between instances of 'NoneType' and 'int'
Traceback (most recent call last):
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info
    return self.process_ie_result(ie_result, download, extra_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result
    return self.process_video_result(ie_result, download=download)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result
    self.process_info(new_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info
    dl(sub_filename, sub_info, subtitle=True)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl
    return fd.download(name, info, subtitle)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 367, in download
    if self.params.get('sleep_interval_subtitles') > 0:
TypeError: '>' not supported between instances of 'NoneType' and 'int'
---
 youtube_dlc/downloader/common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py
index c65500d61..7d303be1c 100644
--- a/youtube_dlc/downloader/common.py
+++ b/youtube_dlc/downloader/common.py
@@ -364,8 +364,10 @@ class FileDownloader(object):
                             else '%.2f' % sleep_interval))
                 time.sleep(sleep_interval)
         else:
-            if self.params.get('sleep_interval_subtitles') > 0:
+            sleep_interval_sub = 0
+            if type(self.params.get('sleep_interval_subtitles')) is int:
                 sleep_interval_sub = self.params.get('sleep_interval_subtitles')
+            if sleep_interval_sub > 0:
                 self.to_screen(
                     '[download] Sleeping %s seconds...' % (
                         sleep_interval_sub))
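[Note] A minimal standalone sketch of the failure mode this patch guards against and the None-safe pattern it adopts; the option name comes from the patch, everything else is illustrative:

    params = {}  # 'sleep_interval_subtitles' was never set by the API caller

    value = params.get('sleep_interval_subtitles')  # -> None
    # "value > 0" would raise:
    # TypeError: '>' not supported between instances of 'NoneType' and 'int'
    sleep_interval_sub = value if isinstance(value, int) else 0
    if sleep_interval_sub > 0:
        print('[download] Sleeping %s seconds...' % sleep_interval_sub)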
From 8263104fe4f7aed96a1cc92be6b58cc219de876e Mon Sep 17 00:00:00 2001
From: Nicolas SAPA
Date: Sun, 8 Nov 2020 08:49:03 +0100
Subject: [PATCH 02/12] [youtube] Fix 'liveChatReplayContinuationData' missing
 'continuation' key

live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
may not exist, so catch the KeyError.

Traceback:

$ tubeup 'https://youtube.com/watch?v=JyE9OF03cao'
[debug] Encodings: locale UTF-8, fs utf-8, out UTF-8, pref UTF-8
[debug] youtube-dlc version 2020.10.25
[debug] Python version 3.7.3 (CPython) - Linux-5.8.0-0.bpo.2-amd64-x86_64-with-debian-10.6
[debug] exe versions: ffmpeg 3.3.9, ffprobe 3.3.9
[debug] Proxy map: {}
There are no annotations to write.
[download] 452.59KiB at 615.35KiB/s (00:01)ERROR: 'liveChatReplayContinuationData'
Traceback (most recent call last):
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 846, in extract_info
    return self.process_ie_result(ie_result, download, extra_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 901, in process_ie_result
    return self.process_video_result(ie_result, download=download)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1696, in process_video_result
    self.process_info(new_info)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1894, in process_info
    dl(sub_filename, sub_info, subtitle=True)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/YoutubeDL.py", line 1866, in dl
    return fd.download(name, info, subtitle)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/common.py", line 375, in download
    return self.real_download(filename, info_dict)
  File "/mnt/data2/Backup/Wiki/.local/lib/python3.7/site-packages/youtube_dlc/downloader/youtube_live_chat.py", line 85, in real_download
    continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
KeyError: 'liveChatReplayContinuationData'
---
 youtube_dlc/downloader/youtube_live_chat.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py
index 4932dd9c5..b333afa5b 100644
--- a/youtube_dlc/downloader/youtube_live_chat.py
+++ b/youtube_dlc/downloader/youtube_live_chat.py
@@ -82,7 +82,10 @@ class YoutubeLiveChatReplayFD(FragmentFD):
                     offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
                 processed_fragment.extend(
                     json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
-            continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+            try:
+                continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+            except KeyError:
+                continuation_id = None

             self._append_fragment(ctx, processed_fragment)
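[Note] youtube_dlc.utils also provides try_get(), which returns None when any key in the chain is missing; a sketch of the same None-on-missing behaviour without the explicit try/except (the sample dict is illustrative):

    from youtube_dlc.utils import try_get

    # a continuation entry without 'liveChatReplayContinuationData'
    live_chat_continuation = {'continuations': [{'timedContinuationData': {}}]}

    continuation_id = try_get(
        live_chat_continuation,
        lambda x: x['continuations'][0]['liveChatReplayContinuationData']['continuation'])
    assert continuation_id is None  # no KeyError raised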
From 876f1c17fff194cbed3595bb2a8497ea9e479bf7 Mon Sep 17 00:00:00 2001
From: Ali Sherief
Date: Mon, 9 Nov 2020 16:06:48 +0000
Subject: [PATCH 03/12] Fix #93 YoutubePlaylistsIE

---
 youtube_dlc/extractor/youtube.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 3ec2581dc..35ac67b49 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -300,11 +300,12 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
     # Extract entries from page with "Load more" button
     def _entries(self, page, playlist_id):
         more_widget_html = content_html = page
+        mobj_reg = r'(?:(?:data-uix-load-more-href="[^"]+?;continuation=)|(?:"continuation":"))(?P<more>[^"]+)"'
         for page_num in itertools.count(1):
             for entry in self._process_page(content_html):
                 yield entry

-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            mobj = re.search(mobj_reg, more_widget_html)
             if not mobj:
                 break

@@ -315,7 +316,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
                 # Downloading page may result in intermittent 5xx HTTP error
                 # that is usually worked around with a retry
                 more = self._download_json(
-                    'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
+                    'https://www.youtube.com/browse_ajax?ctoken=%s' % mobj.group('more'), playlist_id,
                     'Downloading page #%s%s'
                     % (page_num, ' (retry #%d)' % count if count else ''),
                     transform_source=uppercase_escape,
@@ -372,7 +373,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
 class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
     def _process_page(self, content):
         for playlist_id in orderedSet(re.findall(
-                r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
+                r'"/?playlist\?list=([0-9A-Za-z-_]{10,})"',
                 content)):
             yield self.url_result(
                 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')

From 142f2c8e99e61054d3354bd915a9e46cbd80c8ea Mon Sep 17 00:00:00 2001
From: Robin Dunn <>
Date: Mon, 9 Nov 2020 15:24:42 -0800
Subject: [PATCH 04/12] fall-back to the old way to fetch subtitles, if needed

---
 youtube_dlc/extractor/viki.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py
index 0f188f84d..6bddf8be9 100644
--- a/youtube_dlc/extractor/viki.py
+++ b/youtube_dlc/extractor/viki.py
@@ -308,17 +308,26 @@ class VikiIE(VikiBaseIE):
                 'url': thumbnail.get('url'),
             })

-        new_video = self._download_json(
-            'https://www.viki.com/api/videos/%s' % video_id, video_id,
-            'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
-
         subtitles = {}
-        for sub in new_video.get('streamSubtitles').get('dash'):
-            subtitles[sub.get('srclang')] = [{
-                'ext': 'vtt',
-                'url': sub.get('src'),
-                'completion': sub.get('percentage'),
-            }]
+        try:
+            # New way to fetch subtitles
+            new_video = self._download_json(
+                'https://www.viki.com/api/videos/%s' % video_id, video_id,
+                'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
+            for sub in new_video.get('streamSubtitles').get('dash'):
+                subtitles[sub.get('srclang')] = [{
+                    'ext': 'vtt',
+                    'url': sub.get('src'),
+                    'completion': sub.get('percentage'),
+                }]
+        except AttributeError:
+            # fall-back to the old way if there isn't a streamSubtitles attribute
+            for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+                subtitles[subtitle_lang] = [{
+                    'ext': subtitles_format,
+                    'url': self._prepare_call(
+                        'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+                } for subtitles_format in ('srt', 'vtt')]

         result = {
             'id': video_id,
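[Note] The viki.py change boils down to: try the new subtitle source first, and fall back when a .get() in the chain returns None and the follow-up attribute access raises AttributeError. A condensed, runnable sketch of that pattern with illustrative data:

    subtitles = {}
    new_video = {'streamSubtitles': None}          # API answered, but without subtitles
    video = {'subtitle_completions': {'en': 100}}  # old-style metadata

    try:
        for sub in new_video.get('streamSubtitles').get('dash'):
            subtitles[sub.get('srclang')] = [{'ext': 'vtt', 'url': sub.get('src')}]
    except AttributeError:  # None has no .get()
        for lang in video.get('subtitle_completions', {}):
            subtitles[lang] = [{'ext': ext, 'url': 'videos/1/subtitles/%s.%s' % (lang, ext)}
                               for ext in ('srt', 'vtt')]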
From da8fb75df5aa3a6bdda2afbe7bec7da905f0618a Mon Sep 17 00:00:00 2001
From: Tom-Oliver Heidel
Date: Tue, 10 Nov 2020 01:19:33 +0100
Subject: [PATCH 05/12] [skip travis] adjust python versions

---
 .github/workflows/build.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 74b50ecca..4920a30b8 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -82,7 +82,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
-          python-version: '3.x'
+          python-version: '3.8'
       - name: Install Requirements
         run: pip install pyinstaller
       - name: Bump version
@@ -116,7 +116,7 @@ jobs:
       - name: Set up Python 3.5.4 32-Bit
         uses: actions/setup-python@v2
         with:
-          python-version: '3.5.4'
+          python-version: '3.4.4'
           architecture: 'x86'
       - name: Install Requirements for 32 Bit
         run: pip install pyinstaller==3.5

From 9833e7a015ca788a4f881c8ee945967b5f3d71bc Mon Sep 17 00:00:00 2001
From: Luc Ritchie
Date: Tue, 10 Nov 2020 03:38:26 -0500
Subject: [PATCH 06/12] fix: youtube: Polymer UI and JSON endpoints for
 playlists

We already had a few copies of Polymer-style pagination handling logic for
certain circumstances, but now we're forced into using it for all playlists
since we can no longer disable Polymer. Refactor the logic to move it to the
parent class for all entry lists (including e.g. search results, feeds, and
lists of playlists), and generalize it a bit to cover the child classes' use
cases.
---
 youtube_dlc/extractor/youtube.py | 280 ++++++++++++++-----------------
 1 file changed, 126 insertions(+), 154 deletions(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 3ec2581dc..273d823c2 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -36,6 +36,7 @@ from ..utils import (
     get_element_by_attribute,
     get_element_by_id,
     int_or_none,
+    js_to_json,
     mimetype2ext,
     orderedSet,
     parse_codecs,
@@ -70,6 +71,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     _LOGIN_REQUIRED = False

     _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
+    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)"

     _YOUTUBE_CLIENT_HEADERS = {
         'x-youtube-client-name': '1',
@@ -274,7 +277,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):

     def _download_webpage_handle(self, *args, **kwargs):
         query = kwargs.get('query', {}).copy()
-        query['disable_polymer'] = 'true'
         kwargs['query'] = query
         return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
             *args, **compat_kwargs(kwargs))
@@ -297,15 +299,60 @@ class YoutubeBaseInfoExtractor(InfoExtractor):


 class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
-    # Extract entries from page with "Load more" button
+    def _find_entries_in_json(self, extracted):
+        entries = []
+        c = {}
+
+        def _real_find(obj):
+            if obj is None or isinstance(obj, str):
+                return
+
+            if type(obj) is list:
+                for elem in obj:
+                    _real_find(elem)
+
+            if type(obj) is dict:
+                if self._is_entry(obj):
+                    entries.append(obj)
+                    return
+
+                if 'continuationCommand' in obj:
+                    c['continuation'] = obj
+                    return
+
+                for _, o in obj.items():
+                    _real_find(o)
+
+        _real_find(extracted)
+
+        return entries, try_get(c, lambda x: x["continuation"])
+
     def _entries(self, page, playlist_id):
-        more_widget_html = content_html = page
+        seen = []
+
+        yt_conf = {}
+        for m in re.finditer(self._YTCFG_DATA_RE, page):
+            parsed = self._parse_json(m.group(1), playlist_id,
+                                      transform_source=js_to_json, fatal=False)
+            if parsed:
+                yt_conf.update(parsed)
+
+        data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)
+
         for page_num in itertools.count(1):
-            for entry in self._process_page(content_html):
+            entries, continuation = self._find_entries_in_json(data_json)
+            processed = self._process_entries(entries, seen)
+
+            if not processed:
+                break
+            for entry in processed:
                 yield entry

-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
+            if not continuation or not yt_conf:
+                break
+            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token'])
+            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl'])
+            if not continuation_token or not continuation_url:
                 break

             count = 0
@@ -314,12 +361,22 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
             try:
                 # Downloading page may result in intermittent 5xx HTTP error
                 # that is usually worked around with a retry
-                more = self._download_json(
-                    'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
-                    'Downloading page #%s%s'
-                    % (page_num, ' (retry #%d)' % count if count else ''),
+                data_json = self._download_json(
+                    'https://www.youtube.com%s' % continuation_url,
+                    playlist_id,
+                    'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
                     transform_source=uppercase_escape,
-                    headers=self._YOUTUBE_CLIENT_HEADERS)
+                    query={
+                        'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY'])
+                    },
+                    data=bytes(json.dumps({
+                        'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']),
+                        'continuation': continuation_token
+                    }), encoding='utf-8'),
+                    headers={
+                        'Content-Type': 'application/json'
+                    }
+                )
                 break
             except ExtractorError as e:
                 if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
@@ -328,31 +385,30 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
                     continue
                 raise

-            content_html = more['content_html']
-            if not content_html.strip():
-                # Some webpages show a "Load more" button but they don't
-                # have more videos
-                break
-            more_widget_html = more['load_more_widget_html']
+    def _extract_title(self, renderer):
+        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+        if title:
+            return title
+        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str)


 class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
-    def _process_page(self, content):
-        for video_id, video_title in self.extract_videos_from_page(content):
-            yield self.url_result(video_id, 'Youtube', video_id, video_title)
+    def _is_entry(self, obj):
+        return 'videoId' in obj

-    def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
-        for mobj in re.finditer(video_re, page):
-            # The link with index 0 is not the first video of the playlist (not sure if still actual)
-            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+    def _process_entries(self, entries, seen):
+        ids_in_page = []
+        titles_in_page = []
+        for renderer in entries:
+            video_id = try_get(renderer, lambda x: x['videoId'])
+            video_title = self._extract_title(renderer)
+
+            if video_id is None or video_title is None:
+                # we do not have a videoRenderer or title extraction broke
                 continue
-            video_id = mobj.group('id')
-            video_title = unescapeHTML(
-                mobj.group('title')) if 'title' in mobj.groupdict() else None
-            if video_title:
-                video_title = video_title.strip()
-            if video_title == '► Play all':
-                video_title = None
+
+            video_title = video_title.strip()
+
             try:
                 idx = ids_in_page.index(video_id)
                 if video_title and not titles_in_page[idx]:
@@ -361,19 +417,16 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
                 ids_in_page.append(video_id)
                 titles_in_page.append(video_title)

-    def extract_videos_from_page(self, page):
-        ids_in_page = []
-        titles_in_page = []
-        self.extract_videos_from_page_impl(
-            self._VIDEO_RE, page, ids_in_page, titles_in_page)
-        return zip(ids_in_page, titles_in_page)
+        for video_id, video_title in zip(ids_in_page, titles_in_page):
+            yield self.url_result(video_id, 'Youtube', video_id, video_title)


 class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
-    def _process_page(self, content):
-        for playlist_id in orderedSet(re.findall(
r']+class="[^"]*yt-lockup-title[^"]*"[^>]*>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', - content)): + def _is_entry(self, obj): + return 'playlistId' in obj + + def _process_entries(self, entries, seen): + for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -3240,11 +3293,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }] -class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P[^"]+))?' - - -class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -3341,11 +3390,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -3357,28 +3405,14 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): 'only_matching': True, }] - def _find_videos_in_json(self, extracted): - videos = [] + def _process_json_dict(self, obj, videos, c): + if "videoId" in obj: + videos.append(obj) + return - def _real_find(obj): - if obj is None or isinstance(obj, str): - return - - if type(obj) is list: - for elem in obj: - _real_find(elem) - - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos + if "nextContinuationData" in obj: + c["continuation"] = obj["nextContinuationData"] + return def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page): search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None) @@ -3413,7 +3447,8 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): mobj = re.match(self._VALID_URL, url) query = compat_urllib_parse_unquote_plus(mobj.group('query')) webpage = self._download_webpage(url, query) - return self.playlist_result(self._process_page(webpage), playlist_title=query) + data_json = self._process_initial_data(webpage) + return self.playlist_result(self._process_data(data_json), playlist_title=query) class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): @@ -3435,14 +3470,12 @@ class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): 'https://www.youtube.com/show/%s/playlists' % playlist_id) -class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor): """ Base class for feed extractors Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. 
""" _LOGIN_REQUIRED = True - _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' - _YTCFG_DATA = r"ytcfg.set\(({.*?})\)" @property def IE_NAME(self): @@ -3451,96 +3484,35 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() - def _find_videos_in_json(self, extracted): - videos = [] - c = {} + def _process_entries(self, entries, seen): + new_info = [] + for v in entries: + v_id = try_get(v, lambda x: x['videoId']) + if not v_id: + continue - def _real_find(obj): - if obj is None or isinstance(obj, str): - return + have_video = False + for old in seen: + if old['videoId'] == v_id: + have_video = True + break - if type(obj) is list: - for elem in obj: - _real_find(elem) + if not have_video: + new_info.append(v) - if type(obj) is dict: - if "videoId" in obj: - videos.append(obj) - return + if not new_info: + return - if "nextContinuationData" in obj: - c["continuation"] = obj["nextContinuationData"] - return - - for _, o in obj.items(): - _real_find(o) - - _real_find(extracted) - - return videos, try_get(c, lambda x: x["continuation"]) - - def _entries(self, page): - info = [] - - yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False) - - search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None) - - for page_num in itertools.count(1): - video_info, continuation = self._find_videos_in_json(search_response) - - new_info = [] - - for v in video_info: - v_id = try_get(v, lambda x: x['videoId']) - if not v_id: - continue - - have_video = False - for old in info: - if old['videoId'] == v_id: - have_video = True - break - - if not have_video: - new_info.append(v) - - if not new_info: - break - - info.extend(new_info) - - for video in new_info: - yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText'])) - - if not continuation or not yt_conf: - break - - search_response = self._download_json( - 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE, - 'Downloading page #%s' % page_num, - transform_source=uppercase_escape, - query={ - "ctoken": try_get(continuation, lambda x: x["continuation"]), - "continuation": try_get(continuation, lambda x: x["continuation"]), - "itct": try_get(continuation, lambda x: x["clickTrackingParams"]) - }, - headers={ - "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]), - "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]), - "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]), - "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]), - "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]), - "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]), - "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]), - }) + seen.extend(new_info) + for video in new_info: + yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video)) def _real_extract(self, url): page = self._download_webpage( 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE) - return self.playlist_result( - self._entries(page), playlist_title=self._PLAYLIST_TITLE) + return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE), + 
From 8f109ad4ad6bc734f817ccf3daefb9ed603d7480 Mon Sep 17 00:00:00 2001
From: Roman Karwacik <roman.karwacik@rwth-aachen.de>
Date: Tue, 10 Nov 2020 10:39:57 +0100
Subject: [PATCH 07/12] [zoom] Fix URL parsing for URLs containing /share/ and
 dots

---
 youtube_dlc/extractor/zoom.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/zoom.py b/youtube_dlc/extractor/zoom.py
index 003e1f901..038a90297 100644
--- a/youtube_dlc/extractor/zoom.py
+++ b/youtube_dlc/extractor/zoom.py
@@ -13,7 +13,7 @@ from ..utils import (

 class ZoomIE(InfoExtractor):
     IE_NAME = 'zoom'
-    _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/play/(?P<id>[A-Za-z0-9\-_]+)'
+    _VALID_URL = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)'
     _TEST = {
         'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK',
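[Note] A quick check of what the widened _VALID_URL accepts: /share/ links and IDs containing dots now match alongside the original /play/ form (the second URL below is illustrative):

    import re

    ZOOM_URL_RE = r'https://(?:.*).?zoom.us/rec(?:ording)?/(play|share)/(?P<id>[A-Za-z0-9\-_.]+)'
    for url in (
            'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK',
            'https://example.zoom.us/rec/share/abc.def-123_456',
    ):
        assert re.match(ZOOM_URL_RE, url)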
From 002ea8fe172c0bf234fd15d3775a527706843fc3 Mon Sep 17 00:00:00 2001
From: pukkandan <pukkandan@gmail.com>
Date: Tue, 27 Oct 2020 16:48:23 +0530
Subject: [PATCH 08/12] Fix external downloader when there is no http_header

---
 youtube_dlc/downloader/external.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/youtube_dlc/downloader/external.py b/youtube_dlc/downloader/external.py
index c31f8910a..d2f8f271d 100644
--- a/youtube_dlc/downloader/external.py
+++ b/youtube_dlc/downloader/external.py
@@ -115,8 +115,10 @@ class CurlFD(ExternalFD):

     def _make_cmd(self, tmpfilename, info_dict):
         cmd = [self.exe, '--location', '-o', tmpfilename]
-        for key, val in info_dict['http_headers'].items():
-            cmd += ['--header', '%s: %s' % (key, val)]
+        if info_dict.get('http_headers') is not None:
+            for key, val in info_dict['http_headers'].items():
+                cmd += ['--header', '%s: %s' % (key, val)]
+
         cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
         cmd += self._valueless_option('--silent', 'noprogress')
         cmd += self._valueless_option('--verbose', 'verbose')
@@ -150,8 +152,9 @@ class AxelFD(ExternalFD):

     def _make_cmd(self, tmpfilename, info_dict):
         cmd = [self.exe, '-o', tmpfilename]
-        for key, val in info_dict['http_headers'].items():
-            cmd += ['-H', '%s: %s' % (key, val)]
+        if info_dict.get('http_headers') is not None:
+            for key, val in info_dict['http_headers'].items():
+                cmd += ['-H', '%s: %s' % (key, val)]
         cmd += self._configuration_args()
         cmd += ['--', info_dict['url']]
         return cmd
@@ -162,8 +165,9 @@ class WgetFD(ExternalFD):

     def _make_cmd(self, tmpfilename, info_dict):
         cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
-        for key, val in info_dict['http_headers'].items():
-            cmd += ['--header', '%s: %s' % (key, val)]
+        if info_dict.get('http_headers') is not None:
+            for key, val in info_dict['http_headers'].items():
+                cmd += ['--header', '%s: %s' % (key, val)]
         cmd += self._option('--limit-rate', 'ratelimit')
         retry = self._option('--tries', 'retries')
         if len(retry) == 2:
@@ -189,8 +193,9 @@ class Aria2cFD(ExternalFD):
         if dn:
             cmd += ['--dir', dn]
         cmd += ['--out', os.path.basename(tmpfilename)]
-        for key, val in info_dict['http_headers'].items():
-            cmd += ['--header', '%s: %s' % (key, val)]
+        if info_dict.get('http_headers') is not None:
+            for key, val in info_dict['http_headers'].items():
+                cmd += ['--header', '%s: %s' % (key, val)]
         cmd += self._option('--interface', 'source_address')
         cmd += self._option('--all-proxy', 'proxy')
         cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
@@ -206,8 +211,10 @@ class HttpieFD(ExternalFD):

     def _make_cmd(self, tmpfilename, info_dict):
         cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
-        for key, val in info_dict['http_headers'].items():
-            cmd += ['%s:%s' % (key, val)]
+
+        if info_dict.get('http_headers') is not None:
+            for key, val in info_dict['http_headers'].items():
+                cmd += ['%s:%s' % (key, val)]
         return cmd
@@ -253,7 +260,7 @@ class FFmpegFD(ExternalFD):
         #     if end_time:
         #         args += ['-t', compat_str(end_time - start_time)]

-        if info_dict['http_headers'] and re.match(r'^https?://', url):
+        if info_dict.get('http_headers') is not None and re.match(r'^https?://', url):
             # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
             # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
             headers = handle_youtubedl_headers(info_dict['http_headers'])
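[Note] Each downloader's _make_cmd now tolerates an info dict without 'http_headers'. The shared guard, sketched as a standalone function (the flag names are illustrative, modelled on the curl/wget variants above):

    def make_cmd(exe, tmpfilename, info_dict):
        cmd = [exe, '-o', tmpfilename]
        if info_dict.get('http_headers') is not None:
            for key, val in info_dict['http_headers'].items():
                cmd += ['--header', '%s: %s' % (key, val)]
        return cmd + ['--', info_dict['url']]

    # No KeyError even though no headers were passed:
    make_cmd('curl', 'tmp.mp4', {'url': 'https://example.com/v.mp4'})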
From d7aec208f2a2ef883c7ffb14c0c4ceb4c9c9ddfa Mon Sep 17 00:00:00 2001
From: rigstot <rigstot@users.noreply.github.com>
Date: Sun, 19 Jul 2020 15:07:29 +0200
Subject: [PATCH 09/12] implement ThisVid extractor

Deobfuscates the video URL using a reverse-engineered version of the KVS
player's algorithm. This was tested against versions 4.0.4, 5.0.1, 5.1.1.4
and 5.2.0.4 of the player; a warning will be issued if the major version
changes.
---
 youtube_dlc/extractor/extractors.py |  1 +
 youtube_dlc/extractor/thisvid.py    | 97 +++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 youtube_dlc/extractor/thisvid.py

diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py
index 666134d86..ee404f78d 100644
--- a/youtube_dlc/extractor/extractors.py
+++ b/youtube_dlc/extractor/extractors.py
@@ -1175,6 +1175,7 @@ from .theweatherchannel import TheWeatherChannelIE
 from .thisamericanlife import ThisAmericanLifeIE
 from .thisav import ThisAVIE
 from .thisoldhouse import ThisOldHouseIE
+from .thisvid import ThisVidIE
 from .threeqsdn import ThreeQSDNIE
 from .tiktok import TikTokIE
 from .tinypic import TinyPicIE
diff --git a/youtube_dlc/extractor/thisvid.py b/youtube_dlc/extractor/thisvid.py
new file mode 100644
index 000000000..f507e1b06
--- /dev/null
+++ b/youtube_dlc/extractor/thisvid.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+
+
+class ThisVidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+/?)'
+    _TESTS = [{
+        'url': 'https://thisvid.com/videos/french-boy-pantsed/',
+        'md5': '3397979512c682f6b85b3b04989df224',
+        'info_dict': {
+            'id': '2400174',
+            'ext': 'mp4',
+            'title': 'French Boy Pantsed',
+            'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'https://thisvid.com/embed/2400174/',
+        'md5': '3397979512c682f6b85b3b04989df224',
+        'info_dict': {
+            'id': '2400174',
+            'ext': 'mp4',
+            'title': 'French Boy Pantsed',
+            'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+            'age_limit': 18,
+        }
+    }]
+
+    def _real_extract(self, url):
+        main_id = self._match_id(url)
+        webpage = self._download_webpage(url, main_id)
+
+        # URL decryptor was reversed from version 4.0.4, later verified working with 5.2.0 and may change in the future.
+        kvs_version = self._html_search_regex(r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">', webpage, 'kvs_version', fatal=False)
+        if kvs_version and not kvs_version.startswith("5."):
+            self.report_warning("Major version change (" + kvs_version + ") in player engine--Download may fail.")
+
+        title = self._html_search_regex(r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:.com| tube))?</title>', webpage, 'title')
+        # video_id, video_url and license_code from the 'flashvars' JSON object:
+        video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id')
+        video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url')
+        license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code')
+        thumbnail = self._html_search_regex(r"preview_url: '((?:https?:)?//media.thisvid.com/.+?.jpg)',", webpage, 'thumbnail', fatal=False)
+        if thumbnail and thumbnail.startswith("//"):
+            thumbnail = "https:" + thumbnail
+        if (re.match(self._VALID_URL, url).group('type') == "videos"):
+            display_id = main_id
+        else:
+            display_id = self._search_regex(r'', webpage, 'display_id', fatal=False)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'url': getrealurl(video_url, license_code),
+            'thumbnail': thumbnail,
+            'age_limit': 18,
+        }
+
+
+def getrealurl(video_url, license_code):
+    urlparts = video_url.split('/')[2:]
+    license = getlicensetoken(license_code)
+    newmagic = urlparts[5][:32]
+
+    for o in range(len(newmagic) - 1, -1, -1):
+        new = ""
+        l = (o + sum([int(n) for n in license[o:]])) % 32
+
+        for i in range(0, len(newmagic)):
+            if i == o:
+                new += newmagic[l]
+            elif i == l:
+                new += newmagic[o]
+            else:
+                new += newmagic[i]
+        newmagic = new
+
+    urlparts[5] = newmagic + urlparts[5][32:]
+    return "/".join(urlparts)
+
+
+def getlicensetoken(license):
+    modlicense = license.replace("$", "").replace("0", "1")
+    center = int(len(modlicense) / 2)
+    fronthalf = int(modlicense[:center + 1])
+    backhalf = int(modlicense[center:])
+
+    modlicense = str(4 * abs(fronthalf - backhalf))
+    retval = ""
+    for o in range(0, center + 1):
+        for i in range(1, 5):
+            retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
+    return retval
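[Note] getrealurl() descrambles the 32-character "magic" path segment of a KVS function/0/ URL using the numeric token derived by getlicensetoken(). A usage sketch, assuming the two helpers above are importable; the URL and license code are illustrative, not real media:

    video_url = ('function/0/https://thisvid.com/get_file/7/'
                 '0123456789abcdef0123456789abcdef01234567/2400000/2400174/2400174.mp4/')
    license_code = '$495564701340672'

    real_url = getrealurl(video_url, license_code)
    # -> 'https://thisvid.com/get_file/7/<descrambled 32 chars>01234567/.../2400174.mp4/'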
From 0f8566e90bee77775be133d551045698a84a2bdd Mon Sep 17 00:00:00 2001
From: Unknown
Date: Tue, 10 Nov 2020 23:20:52 +0100
Subject: [PATCH 10/12] manually set limit for youtubesearchurl

---
 youtube_dlc/extractor/youtube.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 273d823c2..0dbb3531c 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -327,7 +327,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):

         return entries, try_get(c, lambda x: x["continuation"])

-    def _entries(self, page, playlist_id):
+    def _entries(self, page, playlist_id, n=1):
         seen = []

         yt_conf = {}
@@ -339,7 +339,8 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):

         data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)

-        for page_num in itertools.count(1):
+        # for page_num in itertools.count(1):
+        for page_num in range(n):
             entries, continuation = self._find_entries_in_json(data_json)
             processed = self._process_entries(entries, seen)

@@ -3447,8 +3448,8 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         query = compat_urllib_parse_unquote_plus(mobj.group('query'))
         webpage = self._download_webpage(url, query)
-        data_json = self._process_initial_data(webpage)
-        return self.playlist_result(self._process_data(data_json), playlist_title=query)
+        # data_json = self._process_initial_data(webpage)
+        return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query)
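[Note] This bounds pagination for search URLs; the next patch generalizes the bound into the shared _entries(). The bounded/unbounded page selection reduces to this expression (sketch):

    import itertools

    def page_numbers(max_pages=None):
        # bounded: 1..max_pages; unbounded: 1, 2, 3, ... as for regular playlists
        return range(1, max_pages + 1) if max_pages is not None else itertools.count(1)

    assert list(page_numbers(5)) == [1, 2, 3, 4, 5]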
From 73ac85678588b1c2997a94c0069ac0a9309adf19 Mon Sep 17 00:00:00 2001
From: Luc Ritchie
Date: Tue, 10 Nov 2020 17:47:40 -0500
Subject: [PATCH 11/12] [youtube] max_pages=5 for search, unlimited for
 everything else

Also drop a few leftover methods in search that are no longer used.
---
 youtube_dlc/extractor/youtube.py | 39 ++++----------------------------
 1 file changed, 4 insertions(+), 35 deletions(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index d8d12a721..2fea11070 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -328,7 +328,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):

         return entries, try_get(c, lambda x: x["continuation"])

-    def _entries(self, page, playlist_id, n=1):
+    def _entries(self, page, playlist_id, max_pages=None):
         seen = []

         yt_conf = {}
@@ -340,8 +340,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):

         data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None)

-        # for page_num in itertools.count(1):
-        for page_num in range(n):
+        for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1):
             entries, continuation = self._find_entries_in_json(data_json)
             processed = self._process_entries(entries, seen)

@@ -366,7 +365,7 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
                     data_json = self._download_json(
                         'https://www.youtube.com%s' % continuation_url,
                         playlist_id,
-                        'Downloading page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
+                        'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''),
                         transform_source=uppercase_escape,
                         query={
@@ -3418,41 +3417,11 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
                 c["continuation"] = obj["nextContinuationData"]
             return

-    def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
-        search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)
-
-        result_items = self._find_videos_in_json(search_response)
-
-        for renderer in result_items:
-            video_id = try_get(renderer, lambda x: x['videoId'])
-            video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText'])
-
-            if video_id is None or video_title is None:
-                # we do not have a videoRenderer or title extraction broke
-                continue
-
-            video_title = video_title.strip()
-
-            try:
-                idx = ids_in_page.index(video_id)
-                if video_title and not titles_in_page[idx]:
-                    titles_in_page[idx] = video_title
-            except ValueError:
-                ids_in_page.append(video_id)
-                titles_in_page.append(video_title)
-
-    def extract_videos_from_page(self, page):
-        ids_in_page = []
-        titles_in_page = []
-        self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
-        return zip(ids_in_page, titles_in_page)
-
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         query = compat_urllib_parse_unquote_plus(mobj.group('query'))
         webpage = self._download_webpage(url, query)
-        # data_json = self._process_initial_data(webpage)
-        return self.playlist_result(self._entries(webpage, query, n=5), playlist_title=query)
+        return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query)

From 104bfdd24de9dd5f636887afd8b263a4c53673a7 Mon Sep 17 00:00:00 2001
From: Unknown
Date: Wed, 11 Nov 2020 00:00:27 +0100
Subject: [PATCH 12/12] ytsearchurl 5 pages for around 100 results

---
 youtube_dlc/extractor/youtube.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 2fea11070..d5d25859d 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -3421,7 +3421,7 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
         mobj = re.match(self._VALID_URL, url)
         query = compat_urllib_parse_unquote_plus(mobj.group('query'))
         webpage = self._download_webpage(url, query)
-        return self.playlist_result(self._entries(webpage, query, max_pages=0), playlist_title=query)
+        return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query)


 class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):