[youtube] fix: extract mix playlist ids from ytInitialData (#33)

This commit is contained in:
insaneracist 2020-10-29 16:11:14 -07:00
parent 4932ba4aec
commit 5b0a6a8010
1 changed files with 26 additions and 9 deletions

View File

@ -279,6 +279,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs)) *args, **compat_kwargs(kwargs))
def _get_yt_initial_data(self, video_id, webpage):
config = self._search_regex(
(r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
webpage, 'ytInitialData', default=None)
if config:
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
def _real_initialize(self): def _real_initialize(self):
if self._downloader is None: if self._downloader is None:
return return
@ -1397,15 +1406,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._parse_json( return self._parse_json(
uppercase_escape(config), video_id, fatal=False) uppercase_escape(config), video_id, fatal=False)
def _get_yt_initial_data(self, video_id, webpage):
config = self._search_regex(
(r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
webpage, 'ytInitialData', default=None)
if config:
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
def _get_automatic_captions(self, video_id, webpage): def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an """We need the webpage for getting the captions url, pass it as an
argument to speed up the process.""" argument to speed up the process."""
@ -2765,6 +2765,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
return zip(ids_in_page, titles_in_page) return zip(ids_in_page, titles_in_page)
def _extract_mix_ids_from_yt_initial(self, yt_initial):
ids = []
playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'])
if type(playlist_contents) is list:
for item in playlist_contents:
videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'])
if type(videoId) is str:
ids.append(videoId)
return ids
def _extract_mix(self, playlist_id): def _extract_mix(self, playlist_id):
# The mixes are generated from a single video # The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id # the id of the playlist is just 'RD' + video_id
@ -2778,6 +2788,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
r'''(?xs)data-video-username=".*?".*? r'''(?xs)data-video-username=".*?".*?
href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id), href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
webpage)) webpage))
# if no ids in html of page, try using embedded json
if (len(new_ids) == 0):
yt_initial = self._get_yt_initial_data(playlist_id, webpage)
if yt_initial:
new_ids = self._extract_mix_ids_from_yt_initial(yt_initial)
# Fetch new pages until all the videos are repeated, it seems that # Fetch new pages until all the videos are repeated, it seems that
# there are always 51 unique videos. # there are always 51 unique videos.
new_ids = [_id for _id in new_ids if _id not in ids] new_ids = [_id for _id in new_ids if _id not in ids]