Update youtube extractor to 2020.11.24

This commit is contained in:
pukkandan 2020-11-24 03:29:10 +05:30
parent 70d5c17b08
commit 3d3dddc948
2 changed files with 82 additions and 57 deletions

View File

@ -64,9 +64,10 @@ class TestAllURLsMatching(unittest.TestCase):
# self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab']) # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self): def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
# def test_youtube_search_matching(self): # def test_youtube_search_matching(self):
# self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])

View File

@ -2541,6 +2541,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
(?: (?:
(?:channel|c|user)/| (?:channel|c|user)/|
(?P<not_channel> (?P<not_channel>
feed/|
(?:playlist|watch)\?.*?\blist= (?:playlist|watch)\?.*?\blist=
)| )|
(?!(%s)([/#?]|$)) # Direct URLs (?!(%s)([/#?]|$)) # Direct URLs
@ -2785,7 +2786,30 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
}, { }, {
'url': 'https://www.youtube.com/c/CommanderVideoHq/live', 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
'only_matching': True, 'only_matching': True,
}, }, {
'url': 'https://www.youtube.com/feed/trending',
'only_matching': True,
}, {
# needs auth
'url': 'https://www.youtube.com/feed/library',
'only_matching': True,
}, {
# needs auth
'url': 'https://www.youtube.com/feed/history',
'only_matching': True,
}, {
# needs auth
'url': 'https://www.youtube.com/feed/subscriptions',
'only_matching': True,
}, {
# needs auth
'url': 'https://www.youtube.com/feed/watch_later',
'only_matching': True,
}, {
# no longer available?
'url': 'https://www.youtube.com/feed/recommended',
'only_matching': True,
}
# TODO # TODO
# { # {
# 'url': 'https://www.youtube.com/TheYoungTurks/live', # 'url': 'https://www.youtube.com/TheYoungTurks/live',
@ -2872,27 +2896,34 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
'https://www.youtube.com/channel/%s' % channel_id, 'https://www.youtube.com/channel/%s' % channel_id,
ie=YoutubeTabIE.ie_key(), video_title=title) ie=YoutubeTabIE.ie_key(), video_title=title)
def _shelf_entries_trimmed(self, shelf_renderer): def _shelf_entries_from_content(self, shelf_renderer):
renderer = try_get( content = shelf_renderer.get('content')
shelf_renderer, lambda x: x['content']['horizontalListRenderer'], dict) if not isinstance(content, dict):
if not renderer:
return return
# TODO: add support for nested playlists so each shelf is processed renderer = content.get('gridRenderer')
# as separate playlist if renderer:
# TODO: this includes only first N items # TODO: add support for nested playlists so each shelf is processed
for entry in self._grid_entries(renderer): # as separate playlist
yield entry # TODO: this includes only first N items
for entry in self._grid_entries(renderer):
yield entry
renderer = content.get('horizontalListRenderer')
if renderer:
# TODO
pass
def _shelf_entries(self, shelf_renderer): def _shelf_entries(self, shelf_renderer):
ep = try_get( ep = try_get(
shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
compat_str) compat_str)
shelf_url = urljoin('https://www.youtube.com', ep) shelf_url = urljoin('https://www.youtube.com', ep)
if not shelf_url: if shelf_url:
return title = try_get(
title = try_get( shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) yield self.url_result(shelf_url, video_title=title)
yield self.url_result(shelf_url, video_title=title) # Shelf may not contain shelf URL, fallback to extraction from content
for entry in self._shelf_entries_from_content(shelf_renderer):
yield entry
def _playlist_entries(self, video_list_renderer): def _playlist_entries(self, video_list_renderer):
for content in video_list_renderer['contents']: for content in video_list_renderer['contents']:
@ -2906,6 +2937,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
continue continue
yield self._extract_video(renderer) yield self._extract_video(renderer)
r""" # Not needed in the new implementation
def _itemSection_entries(self, item_sect_renderer): def _itemSection_entries(self, item_sect_renderer):
for content in item_sect_renderer['contents']: for content in item_sect_renderer['contents']:
if not isinstance(content, dict): if not isinstance(content, dict):
@ -2917,6 +2949,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
if not video_id: if not video_id:
continue continue
yield self._extract_video(renderer) yield self._extract_video(renderer)
"""
def _rich_entries(self, rich_grid_renderer): def _rich_entries(self, rich_grid_renderer):
renderer = try_get( renderer = try_get(
@ -3369,7 +3402,7 @@ class YoutubeYtUserIE(InfoExtractor):
ie=YoutubeTabIE.ie_key(), video_id=user_id) ie=YoutubeTabIE.ie_key(), video_id=user_id)
class YoutubeFavouritesIE(InfoExtractor): class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites' IE_NAME = 'youtube:favorites'
IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
_VALID_URL = r':ytfav(?:ou?rite)?s?' _VALID_URL = r':ytfav(?:ou?rite)?s?'
@ -3515,7 +3548,7 @@ class YoutubeSearchURLIE(YoutubeSearchIE):
class YoutubeFeedsInfoExtractor(YoutubeTabIE): class YoutubeFeedsInfoExtractor(YoutubeTabIE):
""" """
Base class for feed extractors Base class for feed extractors
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. Subclasses must define the _FEED_NAME property.
""" """
_LOGIN_REQUIRED = True _LOGIN_REQUIRED = True
# _MAX_PAGES = 5 # _MAX_PAGES = 5
@ -3528,44 +3561,17 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
def _real_initialize(self): def _real_initialize(self):
self._login() self._login()
def _shelf_entries(self, shelf_renderer):
renderer = try_get(shelf_renderer, lambda x: x['content']['gridRenderer'], dict)
if not renderer:
return
for entry in self._grid_entries(renderer):
yield entry
def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
selected_tab = self._extract_selected_tab(tabs)
return self.playlist_result(
self._entries(selected_tab['content'], identity_token),
playlist_title=self._PLAYLIST_TITLE)
def _real_extract(self, url): def _real_extract(self, url):
item_id = self._FEED_NAME return self.url_result(
url = 'https://www.youtube.com/feed/%s' % self._FEED_NAME 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
webpage = self._download_webpage(url, item_id) ie=YoutubeTabIE.ie_key())
identity_token = self._search_regex(
r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
'identity token', default=None)
data = self._extract_yt_initial_data(item_id, webpage)
tabs = try_get(
data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
if tabs:
return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
# Failed to recognize
raise ExtractorError('Unable to recognize feed page')
class YoutubeWatchLaterIE(InfoExtractor): class YoutubeWatchLaterIE(InfoExtractor):
IE_NAME = 'youtube:watchlater' IE_NAME = 'youtube:watchlater'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/watch_later|:ytwatchlater|WL'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
_VALID_URL = r':ytwatchlater'
_TESTS = [{ _TESTS = [{
'url': 'https://www.youtube.com/feed/watch_later',
'only_matching': True,
}, {
'url': ':ytwatchlater', 'url': ':ytwatchlater',
'only_matching': True, 'only_matching': True,
}] }]
@ -3577,23 +3583,41 @@ class YoutubeWatchLaterIE(InfoExtractor):
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
_VALID_URL = r'https?://(?:www\.)?youtube\.com(?:/feed/recommended|/?[?#]|/?$)|:ytrec(?:ommended)?' _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended' _FEED_NAME = 'recommended'
_PLAYLIST_TITLE = 'Youtube Recommended videos' _TESTS = [{
'url': ':ytrec',
'only_matching': True,
}, {
'url': ':ytrecommended',
'only_matching': True,
}, {
'url': 'https://youtube.com',
'only_matching': True,
}]
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsub(?:scription)?s?'
IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
_VALID_URL = r':ytsub(?:scription)?s?'
_FEED_NAME = 'subscriptions' _FEED_NAME = 'subscriptions'
_PLAYLIST_TITLE = 'Youtube Subscriptions' _TESTS = [{
'url': ':ytsubs',
'only_matching': True,
}, {
'url': ':ytsubscriptions',
'only_matching': True,
}]
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' _VALID_URL = r':ythistory'
_FEED_NAME = 'history' _FEED_NAME = 'history'
_PLAYLIST_TITLE = 'Youtube History' _TESTS = [{
'url': ':ythistory',
'only_matching': True,
}]
class YoutubeTruncatedURLIE(InfoExtractor): class YoutubeTruncatedURLIE(InfoExtractor):