From e5d39886ec8e4e40b2b7257d16cc5d8505cc1f69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 17 Apr 2017 00:23:16 +0700 Subject: [PATCH] [limelight] Improve embeds extraction (closes #12761) * Move extraction code to extractor * Add extraction for LimelightEmbeddedPlayerFlash embeds * Extract multiple video --- youtube_dl/extractor/generic.py | 6 +++++ youtube_dl/extractor/limelight.py | 37 +++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6a34c2491..c523abb25 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -85,6 +85,7 @@ from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE from .rutube import RutubeIE +from .limelight import LimelightBaseIE class GenericIE(InfoExtractor): @@ -2483,6 +2484,11 @@ class GenericIE(InfoExtractor): return self.url_result(piksel_url, PikselIE.ie_key()) # Look for Limelight embeds + limelight_urls = LimelightBaseIE._extract_urls(webpage, url) + if limelight_urls: + return self.playlist_result( + limelight_urls, video_id, video_title, video_description) + mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: lm = { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index f52c2e169..0041453af 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + smuggle_url, unsmuggle_url, ExtractorError, ) @@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' + @classmethod + def _extract_urls(cls, webpage, source_url): + lm = { + 'Media': 'media', + 'Channel': 'channel', + 
'ChannelList': 'channel_list', + } + entries = [] + for kind, video_id in re.findall( + r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', + webpage): + print('video_id', video_id) + entries.append(cls.url_result( + smuggle_url( + 'limelight:%s:%s' % (lm[kind], video_id), + {'source_url': source_url}), + 'Limelight%s' % kind, video_id)) + for mobj in re.finditer( + # As per [1] class attribute should be exactly equal to + # LimelightEmbeddedPlayerFlash but numerous examples seen + # that don't exactly match it (e.g. [2]). + # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage + # 2. http://www.sedona.com/FacilitatorTraining2017 + r'''(?sx) + <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? + <param[^>]+ + name=(["\'])flashVars\2[^>]+ + value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) + ''', webpage): + entries.append(cls.url_result( + smuggle_url( + 'limelight:media:%s' % mobj.group('id'), + {'source_url': source_url}), + 'LimelightMedia', mobj.group('id'))) + return entries + def _call_playlist_service(self, item_id, method, fatal=True, referer=None): headers = {} if referer: