[ytsearch] Return correct playlist results when searching for playlists. Add the playlist's video count to playlist results and the video duration to video results.
This commit is contained in:
parent
416da574ec
commit
4dfb0763ba
|
@ -946,7 +946,7 @@ class InfoExtractor(object):
|
|||
|
||||
# Methods for following #608
|
||||
@staticmethod
|
||||
def url_result(url, ie=None, video_id=None, video_title=None):
|
||||
def url_result(url, ie=None, video_id=None, video_title=None, video_duration=None):
|
||||
"""Returns a URL that points to a page that should be processed"""
|
||||
# TODO: ie should be the class used for getting the info
|
||||
video_info = {'_type': 'url',
|
||||
|
@ -956,6 +956,8 @@ class InfoExtractor(object):
|
|||
video_info['id'] = video_id
|
||||
if video_title is not None:
|
||||
video_info['title'] = video_title
|
||||
if video_duration is not None:
|
||||
video_info['duration'] = video_duration
|
||||
return video_info
|
||||
|
||||
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
|
||||
|
|
|
@ -326,35 +326,56 @@ class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
|
|||
|
||||
class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
||||
def _process_page(self, content):
|
||||
for video_id, video_title in self.extract_videos_from_page(content):
|
||||
yield self.url_result(video_id, 'Youtube', video_id, video_title)
|
||||
for video_id, video_title, video_duration, playlist_video_id in self.extract_videos_from_page(content):
|
||||
if len(video_id) == 11:
|
||||
# Youtube video id found
|
||||
yield self.url_result(video_id, 'Youtube', video_id, video_title, video_duration)
|
||||
elif len(video_id) > 11:
|
||||
# Youtube playlist id found
|
||||
yield self.url_result('https://www.youtube.com/watch?v=%s&list=%s' % (playlist_video_id, video_id), 'YoutubePlaylist', video_id, video_title, video_duration)
|
||||
|
||||
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
|
||||
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page):
|
||||
for mobj in re.finditer(video_re, page):
|
||||
# The link with index 0 is not the first video of the playlist (not sure if this still applies)
|
||||
if 'index' in mobj.groupdict() and mobj.group('id') == '0':
|
||||
continue
|
||||
video_id = mobj.group('id')
|
||||
video_title = unescapeHTML(
|
||||
mobj.group('title')) if 'title' in mobj.groupdict() else None
|
||||
video_id_original = mobj.group('id')
|
||||
video_id = video_id_original
|
||||
playlist_id = mobj.group('plid') if 'plid' in mobj.groupdict() else None
|
||||
if playlist_id is not None:
|
||||
video_id = playlist_id
|
||||
video_title = unescapeHTML(mobj.group('title')) if 'title' in mobj.groupdict() else None
|
||||
if video_title:
|
||||
video_title = video_title.strip()
|
||||
if video_title == '► Play all':
|
||||
video_title = None
|
||||
video_duration = mobj.group('duration') if 'duration' in mobj.groupdict() else None
|
||||
playlist_count = mobj.group('plcounter') if 'plcounter' in mobj.groupdict() else None
|
||||
if playlist_id is not None and playlist_count is not None:
|
||||
video_duration = playlist_count
|
||||
if video_duration:
|
||||
video_duration = video_duration.strip()
|
||||
try:
|
||||
idx = ids_in_page.index(video_id)
|
||||
if video_title and not titles_in_page[idx]:
|
||||
titles_in_page[idx] = video_title
|
||||
if video_duration and not durations_in_page[idx]:
|
||||
durations_in_page[idx] = video_duration
|
||||
if playlist_id is not None and not playlist_video_id_in_page[idx]:
|
||||
playlist_video_id_in_page[idx] = video_id_original
|
||||
except ValueError:
|
||||
ids_in_page.append(video_id)
|
||||
titles_in_page.append(video_title)
|
||||
durations_in_page.append(video_duration)
|
||||
playlist_video_id_in_page.append(video_id_original)
|
||||
|
||||
def extract_videos_from_page(self, page):
|
||||
ids_in_page = []
|
||||
titles_in_page = []
|
||||
self.extract_videos_from_page_impl(
|
||||
self._VIDEO_RE, page, ids_in_page, titles_in_page)
|
||||
return zip(ids_in_page, titles_in_page)
|
||||
playlist_video_id_in_page = []
|
||||
durations_in_page = []
|
||||
self.extract_videos_from_page_impl(self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
|
||||
|
||||
class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
|
||||
|
@ -2764,6 +2785,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
|
|||
def extract_videos_from_page(self, page):
|
||||
ids_in_page = []
|
||||
titles_in_page = []
|
||||
durations_in_page = []
|
||||
playlist_video_id_in_page
|
||||
|
||||
for item in re.findall(
|
||||
r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
|
||||
|
@ -2774,20 +2797,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
|
|||
video_title = video_title.strip()
|
||||
ids_in_page.append(video_id)
|
||||
titles_in_page.append(video_title)
|
||||
# TODO: ADD VIDEO DURATION HERE TOO?
|
||||
durations_in_page.append(None)
|
||||
playlist_video_id_in_page.append(None)
|
||||
|
||||
# Fallback with old _VIDEO_RE
|
||||
self.extract_videos_from_page_impl(
|
||||
self._VIDEO_RE, page, ids_in_page, titles_in_page)
|
||||
self._VIDEO_RE, page, ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
|
||||
# Relaxed fallbacks
|
||||
self.extract_videos_from_page_impl(
|
||||
r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
|
||||
ids_in_page, titles_in_page)
|
||||
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
self.extract_videos_from_page_impl(
|
||||
r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
|
||||
ids_in_page, titles_in_page)
|
||||
ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
|
||||
return zip(ids_in_page, titles_in_page)
|
||||
return zip(ids_in_page, titles_in_page, durations_in_page, playlist_video_id_in_page)
|
||||
|
||||
def _extract_mix(self, playlist_id):
|
||||
# The mixes are generated from a single video
|
||||
|
@ -3171,7 +3197,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
|
|||
|
||||
|
||||
class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
|
||||
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
|
||||
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(&list=(?P<plid>[0-9A-Za-z_-]+))?(((?!formatted-video-count-label|tile-link|href="\s*/watch)[\s\S])*"[^\d]+(?P<plcounter>[0-9,.]+)</b>\svideos)?((?:[^\"]*"[^>]+\btitle="(?P<title>[^\"]+)))?(?:.*Duration:\s*(?P<duration>([0-1]?[0-9]|2[0-3]):[0-5][0-9]))?'
|
||||
|
||||
|
||||
class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
|
||||
|
|
Loading…
Reference in New Issue