Merge youtube-dl and fix Youtube Feeds (pull/100/merge)
@@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
     return s

-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
 # can be correctly created
 classes = _ALL_CLASSES[:-1]
 ordered_cls = []
@@ -59,9 +59,9 @@
 - **ARD:mediathek**
 - **ARDBetaMediathek**
 - **Arkena**
-- **arte.tv:+7**
-- **arte.tv:embed**
-- **arte.tv:playlist**
+- **ArteTV**
+- **ArteTVEmbed**
+- **ArteTVPlaylist**
 - **AsianCrush**
 - **AsianCrushPlaylist**
 - **AtresPlayer**
@@ -111,6 +111,7 @@
 - **Bloomberg**
 - **BokeCC**
 - **BostonGlobe**
+- **Box**
 - **Bpb**: Bundeszentrale für politische Bildung
 - **BR**: Bayerischer Rundfunk
 - **BravoTV**
@@ -158,6 +159,7 @@
 - **Chilloutzone**
 - **chirbit**
 - **chirbit:profile**
+- **cielotv.it**
 - **Cinchcast**
 - **Cinemax**
 - **CiscoLiveSearch**
@@ -425,6 +427,7 @@
 - **la7.it**
 - **laola1tv**
 - **laola1tv:embed**
+- **lbry.tv**
 - **LCI**
 - **Lcp**
 - **LcpPlay**
@@ -475,6 +478,7 @@
 - **massengeschmack.tv**
 - **MatchTV**
 - **MDR**: MDR.DE and KiKA
+- **MedalTV**
 - **media.ccc.de**
 - **media.ccc.de:lists**
 - **Medialaan**
@@ -618,6 +622,7 @@
 - **Nuvid**
 - **NYTimes**
 - **NYTimesArticle**
+- **NYTimesCooking**
 - **NZZ**
 - **ocw.mit.edu**
 - **OdaTV**
@@ -670,6 +675,8 @@
 - **PicartoVod**
 - **Piksel**
 - **Pinkbike**
+- **Pinterest**
+- **PinterestCollection**
 - **Pladform**
 - **Platzi**
 - **PlatziCourse**
@@ -766,6 +773,7 @@
 - **RTVNH**
 - **RTVS**
 - **RUHD**
+- **RumbleEmbed**
 - **rutube**: Rutube videos
 - **rutube:channel**: Rutube channels
 - **rutube:embed**: Rutube embedded videos
@@ -836,12 +844,14 @@
 - **SpankBangPlaylist**
 - **Spankwire**
 - **Spiegel**
-- **Spiegel:Article**: Articles on spiegel.de
-- **Spiegeltv**
 - **sport.francetvinfo.fr**
 - **Sport5**
 - **SportBox**
 - **SportDeutschland**
+- **Spreaker**
+- **SpreakerPage**
+- **SpreakerShow**
+- **SpreakerShowPage**
 - **SpringboardPlatform**
 - **Sprout**
 - **sr:mediathek**: Saarländischer Rundfunk
@@ -945,6 +955,7 @@
 - **TV2DKBornholmPlay**
 - **TV4**: tv4.se and tv4play.se
 - **TV5MondePlus**: TV5MONDE+
+- **tv8.it**
 - **TVA**
 - **TVANouvelles**
 - **TVANouvellesArticle**
@@ -1059,7 +1070,7 @@
 - **vk:wallpost**
 - **vlive**
 - **vlive:channel**
-- **vlive:playlist**
+- **vlive:post**
 - **Vodlocker**
 - **VODPl**
 - **VODPlatform**
@@ -1148,20 +1159,17 @@
 - **YourPorn**
 - **YourUpload**
 - **youtube**: YouTube.com
-- **youtube:channel**: YouTube.com channels
-- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
+- **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication)
 - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
-- **youtube:live**: YouTube.com live streams
 - **youtube:playlist**: YouTube.com playlists
-- **youtube:playlists**: YouTube.com user/channel playlists
 - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
-- **youtube:search**: YouTube.com searches
-- **youtube:search:date**: YouTube.com searches, newest videos first
+- **youtube:search**: YouTube.com searches, "ytsearch" keyword
+- **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword
 - **youtube:search_url**: YouTube.com search URLs
-- **youtube:show**: YouTube.com (multi-season) shows
-- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
-- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+- **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)
+- **youtube:tab**: YouTube.com tab
 - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+- **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword
 - **Zapiks**
 - **Zaq1**
 - **Zattoo**
@@ -37,7 +37,7 @@
     "writeinfojson": true,
     "writesubtitles": false,
     "allsubtitles": false,
-    "listssubtitles": false,
+    "listsubtitles": false,
     "socket_timeout": 20,
     "fixup": "never"
 }
@@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase):
         self.assertEqual(downloaded['extractor'], 'testex')
         self.assertEqual(downloaded['extractor_key'], 'TestEx')

+    # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064
+    def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self):
+
+        class _YDL(YDL):
+            def __init__(self, *args, **kwargs):
+                super(_YDL, self).__init__(*args, **kwargs)
+
+            def trouble(self, s, tb=None):
+                pass
+
+        ydl = _YDL({
+            'format': 'extra',
+            'ignoreerrors': True,
+        })
+
+        class VideoIE(InfoExtractor):
+            _VALID_URL = r'video:(?P<id>\d+)'
+
+            def _real_extract(self, url):
+                video_id = self._match_id(url)
+                formats = [{
+                    'format_id': 'default',
+                    'url': 'url:',
+                }]
+                if video_id == '0':
+                    raise ExtractorError('foo')
+                if video_id == '2':
+                    formats.append({
+                        'format_id': 'extra',
+                        'url': TEST_URL,
+                    })
+                return {
+                    'id': video_id,
+                    'title': 'Video %s' % video_id,
+                    'formats': formats,
+                }
+
+        class PlaylistIE(InfoExtractor):
+            _VALID_URL = r'playlist:'
+
+            def _entries(self):
+                for n in range(3):
+                    video_id = compat_str(n)
+                    yield {
+                        '_type': 'url_transparent',
+                        'ie_key': VideoIE.ie_key(),
+                        'id': video_id,
+                        'url': 'video:%s' % video_id,
+                        'title': 'Video Transparent %s' % video_id,
+                    }
+
+            def _real_extract(self, url):
+                return self.playlist_result(self._entries())
+
+        ydl.add_info_extractor(VideoIE(ydl))
+        ydl.add_info_extractor(PlaylistIE(ydl))
+        info = ydl.extract_info('playlist:')
+        entries = info['entries']
+        self.assertEqual(len(entries), 3)
+        self.assertTrue(entries[0] is None)
+        self.assertTrue(entries[1] is None)
+        self.assertEqual(len(ydl.downloaded_info_dicts), 1)
+        downloaded = ydl.downloaded_info_dicts[0]
+        self.assertEqual(entries[2], downloaded)
+        self.assertEqual(downloaded['url'], TEST_URL)
+        self.assertEqual(downloaded['title'], 'Video Transparent 2')
+        self.assertEqual(downloaded['id'], '2')
+        self.assertEqual(downloaded['extractor'], 'Video')
+        self.assertEqual(downloaded['extractor_key'], 'Video')
+

 if __name__ == '__main__':
     unittest.main()
@@ -31,45 +31,47 @@ class TestAllURLsMatching(unittest.TestCase):
     def test_youtube_playlist_matching(self):
         assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+        assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
         assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
         assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q')  # 585
-        assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
         assertPlaylist('PL63F0C78739B09958')
+        assertTab('https://www.youtube.com/AsapSCIENCE')
+        assertTab('https://www.youtube.com/embedded')
+        assertTab('https://www.youtube.com/feed')  # Own channel's home page
+        assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
         assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
-        assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
-        assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
+        assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+        assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668
         self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
         # Top tracks
-        assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
+        assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')

     def test_youtube_matching(self):
         self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
         self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012'))  # 668
         self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
-        self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
+        # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])  # /v/ is no longer valid
         self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
         self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])

     def test_youtube_channel_matching(self):
-        assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
+        assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
         assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')

-    def test_youtube_user_matching(self):
-        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+    # def test_youtube_user_matching(self):
+    #     self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])

     def test_youtube_feeds(self):
-        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
-        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
-        self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
-        self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
-
-    def test_youtube_show_matching(self):
-        self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
-
-    def test_youtube_search_matching(self):
-        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
-        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+        self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
+        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
+
+    # def test_youtube_search_matching(self):
+    #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+    #     self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])

     def test_youtube_extract(self):
         assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
@@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
         self.assertEqual(d['x'], 1)
         self.assertEqual(d['y'], 'a')

+        # Just drop ! prefix for now though this results in a wrong value
+        on = js_to_json('''{
+            a: !0,
+            b: !1,
+            c: !!0,
+            d: !!42.42,
+            e: !!![],
+            f: !"abc",
+            g: !"",
+            !42: 42
+        }''')
+        self.assertEqual(json.loads(on), {
+            'a': 0,
+            'b': 1,
+            'c': 0,
+            'd': 42.42,
+            'e': [],
+            'f': "abc",
+            'g': "",
+            '42': 42
+        })
+
         on = js_to_json('["abc", "def",]')
         self.assertEqual(json.loads(on), ['abc', 'def'])
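
> Note: the test above only pins down the current (admittedly lossy) behaviour. A minimal standalone sketch of the same "drop the `!` prefix" idea — not youtube-dl's actual `js_to_json`, which handles far more cases:

```python
import json
import re


def strip_not_prefix(js):
    # Lossy fix-up: delete any run of '!' that sits before a value token.
    return re.sub(r'!+(?=[\w"\[])', '', js)


print(json.loads(strip_not_prefix('{"a": !0, "b": !!42.42}')))
# -> {'a': 0, 'b': 42.42}  (note: !0 is truthy in JS, so the value is "wrong" on purpose)
```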
@@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{42:4.2e1}')
         self.assertEqual(json.loads(on), {'42': 42.0})

+        on = js_to_json('{ "0x40": "0x40" }')
+        self.assertEqual(json.loads(on), {'0x40': '0x40'})
+
+        on = js_to_json('{ "040": "040" }')
+        self.assertEqual(json.loads(on), {'040': '040'})
+
     def test_js_to_json_malformed(self):
         self.assertEqual(js_to_json('42a1'), '42"a1"')
         self.assertEqual(js_to_json('42a-1'), '42"a"-1')
@@ -830,34 +830,23 @@ class YoutubeDL(object):
                                     'and will probably not work.')

             try:
-                try:
-                    temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url)
-                except (AssertionError, IndexError):
-                    temp_id = None
-                if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
-                    self.to_screen("[%s] %s: has already been recorded in archive" % (
-                        ie_key, temp_id))
-                    break
+                temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url)
+            except (AssertionError, IndexError, AttributeError):
+                temp_id = None
+            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
+                self.to_screen("[%s] %s: has already been recorded in archive" % (
+                    ie_key, temp_id))
+                break
-                ie_result = ie.extract(url)
-                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
-                    break
-                if isinstance(ie_result, list):
-                    # Backwards compatibility: old IE result format
-                    ie_result = {
-                        '_type': 'compat_list',
-                        'entries': ie_result,
-                    }
-                if info_dict:
-                    if info_dict.get('id'):
-                        ie_result['id'] = info_dict['id']
-                    if info_dict.get('title'):
-                        ie_result['title'] = info_dict['title']
-                self.add_default_extra_info(ie_result, ie, url)
-                if process:
-                    return self.process_ie_result(ie_result, download, extra_info)
-                else:
-                    return ie_result
+            return self.__extract_info(url, ie, download, extra_info, process, info_dict)
+        else:
+            self.report_error('no suitable InfoExtractor for URL %s' % url)

+    def __handle_extraction_exceptions(func):
+        def wrapper(self, *args, **kwargs):
+            try:
+                return func(self, *args, **kwargs)
             except GeoRestrictedError as e:
                 msg = e.msg
                 if e.countries:
@@ -865,20 +854,38 @@ class YoutubeDL(object):
                         map(ISO3166Utils.short2full, e.countries))
                 msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
                 self.report_error(msg)
-                break
             except ExtractorError as e:  # An error we somewhat expected
                 self.report_error(compat_str(e), e.format_traceback())
-                break
             except MaxDownloadsReached:
                 raise
             except Exception as e:
                 if self.params.get('ignoreerrors', False):
                     self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
-                    break
                 else:
                     raise
+        return wrapper
+
+    @__handle_extraction_exceptions
+    def __extract_info(self, url, ie, download, extra_info, process, info_dict):
+        ie_result = ie.extract(url)
+        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
+            return
+        if isinstance(ie_result, list):
+            # Backwards compatibility: old IE result format
+            ie_result = {
+                '_type': 'compat_list',
+                'entries': ie_result,
+            }
+        if info_dict:
+            if info_dict.get('id'):
+                ie_result['id'] = info_dict['id']
+            if info_dict.get('title'):
+                ie_result['title'] = info_dict['title']
+        self.add_default_extra_info(ie_result, ie, url)
+        if process:
+            return self.process_ie_result(ie_result, download, extra_info)
         else:
-            self.report_error('no suitable InfoExtractor for URL %s' % url)
+            return ie_result

     def add_default_extra_info(self, ie_result, ie, url):
         self.add_extra_info(ie_result, {
@@ -1057,9 +1064,8 @@ class YoutubeDL(object):
                     self.to_screen('[download] ' + reason)
                     continue

-                entry_result = self.process_ie_result(entry,
-                                                      download=download,
-                                                      extra_info=extra)
+                entry_result = self.__process_iterable_entry(entry, download, extra)
                 # TODO: skip failed (empty) entries?
                 playlist_results.append(entry_result)

             ie_result['entries'] = playlist_results
             self.to_screen('[download] Finished downloading playlist: %s' % playlist)
@@ -1088,6 +1094,11 @@ class YoutubeDL(object):
         else:
             raise Exception('Invalid result type: %s' % result_type)

+    @__handle_extraction_exceptions
+    def __process_iterable_entry(self, entry, download, extra_info):
+        return self.process_ie_result(
+            entry, download=download, extra_info=extra_info)
+
     def _build_format_filter(self, filter_spec):
         " Returns a function to filter the formats according to the filter_spec "
@@ -2345,7 +2345,7 @@ except ImportError:  # Python <3.4

     # HTMLParseError has been deprecated in Python 3.3 and removed in
     # Python 3.5. Introducing dummy exception for Python >3.5 for compatible
-    # and uniform cross-version exceptiong handling
+    # and uniform cross-version exception handling
     class compat_HTMLParseError(Exception):
         pass
@@ -97,12 +97,15 @@ class FragmentFD(FileDownloader):

     def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
         fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
-        success = ctx['dl'].download(fragment_filename, {
+        fragment_info_dict = {
             'url': frag_url,
             'http_headers': headers or info_dict.get('http_headers'),
-        })
+        }
+        success = ctx['dl'].download(fragment_filename, fragment_info_dict)
         if not success:
             return False, None
+        if fragment_info_dict.get('filetime'):
+            ctx['fragment_filetime'] = fragment_info_dict.get('filetime')
         down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
         ctx['fragment_filename_sanitized'] = frag_sanitized
         frag_content = down.read()
@@ -258,6 +261,13 @@ class FragmentFD(FileDownloader):
             downloaded_bytes = ctx['complete_frags_downloaded_bytes']
         else:
             self.try_rename(ctx['tmpfilename'], ctx['filename'])
+            if self.params.get('updatetime', True):
+                filetime = ctx.get('fragment_filetime')
+                if filetime:
+                    try:
+                        os.utime(ctx['filename'], (time.time(), filetime))
+                    except Exception:
+                        pass
             downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))

         self._hook_progress({
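
> Note: the change above records the server-reported `filetime` of the last fragment and stamps it onto the assembled file. What the added `os.utime()` call does, in isolation (a sketch; `apply_filetime` is a made-up helper name):

```python
import os
import time


def apply_filetime(filename, filetime):
    # Keep atime at "now", set mtime to the remembered Unix timestamp;
    # best effort only, exactly like the downloader's try/except.
    try:
        os.utime(filename, (time.time(), filetime))
    except Exception:
        pass

# e.g. apply_filetime('video.mp4', 1606154898)
```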
@@ -109,7 +109,9 @@ class HttpFD(FileDownloader):
             try:
                 ctx.data = self.ydl.urlopen(request)
             except (compat_urllib_error.URLError, ) as err:
-                if isinstance(err.reason, socket.timeout):
+                # reason may not be available, e.g. for urllib2.HTTPError on python 2.6
+                reason = getattr(err, 'reason', None)
+                if isinstance(reason, socket.timeout):
                     raise RetryDownload(err)
                 raise err

         # When trying to resume, Content-Range HTTP header of response has to be checked
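
> Note: a quick sketch of why `getattr(err, 'reason', None)` is safer than `err.reason` here — some error objects caught by this handler may simply lack the attribute (the class below is an invented stand-in):

```python
import socket


class FakeHTTPError(Exception):
    pass  # stand-in for an error object that carries no .reason attribute


err = FakeHTTPError()
reason = getattr(err, 'reason', None)
print(isinstance(reason, socket.timeout))  # False -- and no AttributeError raised
```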
@@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
         video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
         if video_element is None or video_element.text is None:
             raise ExtractorError(
-                'Video %s video does not exist' % video_id, expected=True)
+                'Video %s does not exist' % video_id, expected=True)

         video_url = video_element.text.strip()
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from .vimeo import VimeoIE
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+    update_url_query,
+)
+
+
+class AmaraIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
+    _TESTS = [{
+        # Youtube
+        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
+        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
+        'info_dict': {
+            'id': 'h6ZuVdvYnfE',
+            'ext': 'mp4',
+            'title': 'Why jury trials are becoming less common',
+            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20160813',
+            'uploader': 'PBS NewsHour',
+            'uploader_id': 'PBSNewsHour',
+            'timestamp': 1549639570,
+        }
+    }, {
+        # Vimeo
+        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
+        'md5': '99392c75fa05d432a8f11df03612195e',
+        'info_dict': {
+            'id': '18622084',
+            'ext': 'mov',
+            'title': 'Vimeo at CES 2011!',
+            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'timestamp': 1294763658,
+            'upload_date': '20110111',
+            'uploader': 'Sam Morrill',
+            'uploader_id': 'sammorrill'
+        }
+    }, {
+        # Direct Link
+        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
+        'md5': 'd3970f08512738ee60c5807311ff5d3f',
+        'info_dict': {
+            'id': 's8KL7I3jLmh6',
+            'ext': 'mp4',
+            'title': 'The danger of a single story',
+            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'subtitles': dict,
+            'upload_date': '20091007',
+            'timestamp': 1254942511,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        meta = self._download_json(
+            'https://amara.org/api/videos/%s/' % video_id,
+            video_id, query={'format': 'json'})
+        title = meta['title']
+        video_url = meta['all_urls'][0]
+        subtitles = {}
+        for language in (meta.get('languages') or []):
+            subtitles_uri = language.get('subtitles_uri')
+            if not (subtitles_uri and language.get('published')):
+                continue
+            subtitle = subtitles.setdefault(language.get('code') or 'en', [])
+            for f in ('json', 'srt', 'vtt'):
+                subtitle.append({
+                    'ext': f,
+                    'url': update_url_query(subtitles_uri, {'format': f}),
+                })
+        info = {
+            'url': video_url,
+            'id': video_id,
+            'subtitles': subtitles,
+            'title': title,
+            'description': meta.get('description'),
+            'thumbnail': meta.get('thumbnail'),
+            'duration': int_or_none(meta.get('duration')),
+            'timestamp': parse_iso8601(meta.get('created')),
+        }
+
+        for ie in (YoutubeIE, VimeoIE):
+            if ie.suitable(video_url):
+                info.update({
+                    '_type': 'url_transparent',
+                    'ie_key': ie.ie_key(),
+                })
+                break
+
+        return info
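
> Note: for a YouTube-backed Amara page the extractor hands the actual download off via `url_transparent` while layering Amara's own metadata (subtitles, timestamp) on top. Roughly the shape of the returned dict — values are illustrative, taken loosely from the first test case above:

```python
info = {
    '_type': 'url_transparent',
    'ie_key': 'Youtube',
    'url': 'https://www.youtube.com/watch?v=h6ZuVdvYnfE',   # meta['all_urls'][0]
    'id': 'jVx79ZKGK1ky',                                   # Amara's own id
    'title': 'Why jury trials are becoming less common',
    'subtitles': {'en': [{'ext': 'vtt', 'url': '...subtitles_uri...?format=vtt'}]},
}
```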
@@ -4,23 +4,57 @@ from __future__ import unicode_literals
 import re

 from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+    compat_str,
+    compat_urlparse,
+)
 from ..utils import (
     ExtractorError,
     int_or_none,
     qualities,
     try_get,
     unified_strdate,
     url_or_none,
 )

-# There are different sources of video in arte.tv, the extraction process
-# is different for each one. The videos usually expire in 7 days, so we can't
-# add tests.

 class ArteTVBaseIE(InfoExtractor):
-    def _extract_from_json_url(self, json_url, video_id, lang, title=None):
-        info = self._download_json(json_url, video_id)
+    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+    _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+                        )
+                        /(?P<id>\d{6}-\d{3}-[AF])
+                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+    _TESTS = [{
+        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'info_dict': {
+            'id': '088501-000-A',
+            'ext': 'mp4',
+            'title': 'Mexico: Stealing Petrol to Survive',
+            'upload_date': '20190628',
+        },
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+        'only_matching': True,
+    }, {
+        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        lang = mobj.group('lang') or mobj.group('lang_2')
+        info = self._download_json(
+            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
         player_info = info['videoJsonPlayer']

         vsr = try_get(player_info, lambda x: x['VSR'], dict)
@@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
         if not upload_date_str:
             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

-        title = (player_info.get('VTI') or title or player_info['VID']).strip()
+        title = (player_info.get('VTI') or player_info['VID']).strip()
         subtitle = player_info.get('VSU', '').strip()
         if subtitle:
             title += ' - %s' % subtitle

-        info_dict = {
-            'id': player_info['VID'],
-            'title': title,
-            'description': player_info.get('VDE'),
-            'upload_date': unified_strdate(upload_date_str),
-            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
-        }
         qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])

         LANGS = {
@@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):
         formats = []
         for format_id, format_dict in vsr.items():
             f = dict(format_dict)
+            format_url = url_or_none(f.get('url'))
+            streamer = f.get('streamer')
+            if not format_url and not streamer:
+                continue
             versionCode = f.get('versionCode')
             l = re.escape(langcode)

@@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):
             else:
                 lang_pref = -1

+            media_type = f.get('mediaType')
+            if media_type == 'hls':
+                m3u8_formats = self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id=format_id, fatal=False)
+                for m3u8_format in m3u8_formats:
+                    m3u8_format['language_preference'] = lang_pref
+                formats.extend(m3u8_formats)
+                continue
+
             format = {
                 'format_id': format_id,
                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):
                 'quality': qfunc(f.get('quality')),
             }

-            if f.get('mediaType') == 'rtmp':
+            if media_type == 'rtmp':
                 format['url'] = f['streamer']
                 format['play_path'] = 'mp4:' + f['url']
                 format['ext'] = 'flv'
@@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):
             formats.append(format)

         self._check_formats(formats, video_id)
         self._sort_formats(formats)

-        info_dict['formats'] = formats
-        return info_dict
+        return {
+            'id': player_info.get('VID') or video_id,
+            'title': title,
+            'description': player_info.get('VDE'),
+            'upload_date': unified_strdate(upload_date_str),
+            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+            'formats': formats,
+        }

-class ArteTVPlus7IE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:+7'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'

+class ArteTVEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
     _TESTS = [{
-        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
         'info_dict': {
-            'id': '088501-000-A',
+            'id': '100605-013-A',
             'ext': 'mp4',
-            'title': 'Mexico: Stealing Petrol to Survive',
-            'upload_date': '20190628',
+            'title': 'United we Stream November Lockdown Edition #13',
+            'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
+            'upload_date': '20201116',
         },
+    }, {
+        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
-        lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(
-            'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
-            video_id, lang)
-
-
-class ArteTVEmbedIE(ArteTVPlus7IE):
-    IE_NAME = 'arte.tv:embed'
-    _VALID_URL = r'''(?x)
-        https://www\.arte\.tv
-        /player/v3/index\.php\?json_url=
-        (?P<json_url>
-            https?://api\.arte\.tv/api/player/v1/config/
-            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
-        )
-    '''
-    _TESTS = []
+    @staticmethod
+    def _extract_urls(webpage):
+        return [url for _, url in re.findall(
+            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
+            webpage)]

     def _real_extract(self, url):
-        json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
-        return self._extract_from_json_url(json_url, video_id, lang)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        json_url = qs['json_url'][0]
+        video_id = ArteTVIE._match_id(json_url)
+        return self.url_result(
+            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
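
> Note: the rewritten embed extractor no longer parses the player URL with a monolithic regex — it just pulls `json_url` out of the query string and delegates to `ArteTVIE`. A quick standalone check of that parsing (Python 3 stdlib names for brevity; the extractor itself goes through the `compat_urlparse` shims):

```python
from urllib.parse import parse_qs, urlparse

url = ('https://www.arte.tv/player/v5/index.php?json_url='
       'https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A')
json_url = parse_qs(urlparse(url).query)['json_url'][0]
print(json_url)  # https://api.arte.tv/api/player/v2/config/de/100605-013-A
```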
 class ArteTVPlaylistIE(ArteTVBaseIE):
-    IE_NAME = 'arte.tv:playlist'
-    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
+    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
     _TESTS = [{
         'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
         'info_dict': {
@@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
             'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
         },
         'playlist_mincount': 6,
+    }, {
+        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
+        'only_matching': True,
     }]

     def _real_extract(self, url):
         lang, playlist_id = re.match(self._VALID_URL, url).groups()
         collection = self._download_json(
-            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
-            % (lang, playlist_id), playlist_id)
+            '%s/collectionData/%s/%s?source=videos'
+            % (self._API_BASE, lang, playlist_id), playlist_id)
+        entries = []
+        for video in collection['videos']:
+            if not isinstance(video, dict):
+                continue
+            video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
+            if not video_url:
+                continue
+            video_id = video.get('programId')
+            entries.append({
+                '_type': 'url_transparent',
+                'url': video_url,
+                'id': video_id,
+                'title': video.get('title'),
+                'alt_title': video.get('subtitle'),
+                'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
+                'duration': int_or_none(video.get('durationSeconds')),
+                'view_count': int_or_none(video.get('views')),
+                'ie_key': ArteTVIE.ie_key(),
+            })
         title = collection.get('title')
         description = collection.get('shortDescription') or collection.get('teaserText')
-        entries = [
-            self._extract_from_json_url(
-                video['jsonUrl'], video.get('programId') or playlist_id, lang)
-            for video in collection['videos'] if video.get('jsonUrl')]
         return self.playlist_result(entries, playlist_id, title, description)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals

 import random
@@ -5,10 +6,7 @@ import re
 import time

 from .common import InfoExtractor
-from ..compat import (
-    compat_str,
-    compat_urlparse,
-)
+from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     float_or_none,
@@ -17,71 +15,32 @@ from ..utils import (
     parse_filesize,
     str_or_none,
     try_get,
-    unescapeHTML,
-    update_url_query,
     unified_strdate,
     unified_timestamp,
     url_or_none,
+    urljoin,
 )


-class BandcampBaseIE(InfoExtractor):
-    """Provide base functions for Bandcamp extractors"""
-
-    def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
-        json_string = self._html_search_regex(
-            r' data-%s="([^"]*)' % suffix,
-            webpage, '%s json' % suffix, default='{}')
-        return self._parse_json(json_string, video_id)
-
-    def _parse_json_track(self, json):
-        formats = []
-        file_ = json.get('file')
-        if isinstance(file_, dict):
-            for format_id, format_url in file_.items():
-                if not url_or_none(format_url):
-                    continue
-                ext, abr_str = format_id.split('-', 1)
-                formats.append({
-                    'format_id': format_id,
-                    'url': self._proto_relative_url(format_url, 'http:'),
-                    'ext': ext,
-                    'vcodec': 'none',
-                    'acodec': ext,
-                    'abr': int_or_none(abr_str),
-                })
-        return {
-            'duration': float_or_none(json.get('duration')),
-            'id': str_or_none(json.get('track_id') or json.get('id')),
-            'title': json.get('title'),
-            'title_link': json.get('title_link'),
-            'number': int_or_none(json.get('track_num')),
-            'formats': formats
-        }
-
-
-class BandcampIE(BandcampBaseIE):
-    IE_NAME = "Bandcamp:track"
-    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+class BandcampIE(InfoExtractor):
+    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
     _TESTS = [{
         'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
         'md5': 'c557841d5e50261777a6585648adf439',
         'info_dict': {
             'id': '1812978515',
             'ext': 'mp3',
-            'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+            'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
             'duration': 9.8485,
-            'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
-            'timestamp': 1354224127,
+            'uploader': 'youtube-dl "\'/\\ä↭',
             'upload_date': '20121129',
+            'timestamp': 1354224127,
         },
         '_skip': 'There is a limit of 200 free downloads / month for the test song'
     }, {
+        # free download
         'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
         'md5': '5d92af55811e47f38962a54c30b07ef0',
         'info_dict': {
             'id': '2650410135',
             'ext': 'aiff',
@@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
         },
     }]

+    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
+        return self._parse_json(self._html_search_regex(
+            r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+            attr + ' data', group=2), video_id, fatal=fatal)
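
> Note: the new `_extract_data_attr` helper replaces the old `BandcampBaseIE` plumbing — Bandcamp pages embed their metadata as JSON in `data-tralbum` / `data-embed` / `data-blob` attributes. A standalone sketch of the same parsing on invented sample markup (the real helper additionally HTML-unescapes via `_html_search_regex`):

```python
import json
import re

page = '<script data-tralbum="{&quot;id&quot;: 1812978515}"></script>'
unescaped = page.replace('&quot;', '"')  # _html_search_regex handles this step
m = re.search(r'data-tralbum=(["\'])({.+?})\1', unescaped)
print(json.loads(m.group(2)))  # {'id': 1812978515}
```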
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group('title')
-        url_track_title = title
+        title = self._match_id(url)
         webpage = self._download_webpage(url, title)
-        thumbnail = self._html_search_meta('og:image', webpage, default=None)
-
-        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
-        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
-
-        json_tracks = json_tralbum.get('trackinfo')
-        if not json_tracks:
-            raise ExtractorError('Could not extract track')
-
-        track = self._parse_json_track(json_tracks[0])
-        artist = json_tralbum.get('artist')
-        album_title = json_embed.get('album_title')
-
-        json_album = json_tralbum.get('packages')
-        if json_album:
-            json_album = json_album[0]
-            album_publish_date = json_album.get('album_publish_date')
-            album_release_date = json_album.get('album_release_date')
-        else:
-            album_publish_date = None
-            album_release_date = json_tralbum.get('album_release_date')
-
-        timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
-        release_date = unified_strdate(album_release_date)
-
-        download_link = self._search_regex(
-            r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
-            'download link', default=None, group='url')
+        tralbum = self._extract_data_attr(webpage, title)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        track_id = None
+        track = None
+        track_number = None
+        duration = None
+
+        formats = []
+        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
+        if track_info:
+            file_ = track_info.get('file')
+            if isinstance(file_, dict):
+                for format_id, format_url in file_.items():
+                    if not url_or_none(format_url):
+                        continue
+                    ext, abr_str = format_id.split('-', 1)
+                    formats.append({
+                        'format_id': format_id,
+                        'url': self._proto_relative_url(format_url, 'http:'),
+                        'ext': ext,
+                        'vcodec': 'none',
+                        'acodec': ext,
+                        'abr': int_or_none(abr_str),
+                    })
+            track = track_info.get('title')
+            track_id = str_or_none(
+                track_info.get('track_id') or track_info.get('id'))
+            track_number = int_or_none(track_info.get('track_num'))
+            duration = float_or_none(track_info.get('duration'))
+
+        embed = self._extract_data_attr(webpage, title, 'embed', False)
+        current = tralbum.get('current') or {}
+        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
+        timestamp = unified_timestamp(
+            current.get('publish_date') or tralbum.get('album_publish_date'))
+
+        download_link = tralbum.get('freeDownloadPage')
         if download_link:
-            track_id = self._search_regex(
-                r'\?id=(?P<id>\d+)&',
-                download_link, 'track id')
+            track_id = compat_str(tralbum['id'])

             download_webpage = self._download_webpage(
                 download_link, track_id, 'Downloading free downloads page')

-            blob = self._parse_json(
-                self._search_regex(
-                    r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
-                    'blob', group='blob'),
-                track_id, transform_source=unescapeHTML)
+            blob = self._extract_data_attr(download_webpage, track_id, 'blob')

             info = try_get(
                 blob, (lambda x: x['digital_items'][0],
@@ -173,6 +139,8 @@ class BandcampIE(BandcampBaseIE):
             if info:
                 downloads = info.get('downloads')
                 if isinstance(downloads, dict):
+                    if not track:
+                        track = info.get('title')
                     if not artist:
                         artist = info.get('artist')
                     if not thumbnail:
@@ -206,7 +174,7 @@ class BandcampIE(BandcampBaseIE):
                     retry_url = url_or_none(stat.get('retry_url'))
                     if not retry_url:
                         continue
-                    track['formats'].append({
+                    formats.append({
                         'url': self._proto_relative_url(retry_url, 'http:'),
                         'ext': download_formats.get(format_id),
                         'format_id': format_id,
@@ -215,30 +183,34 @@ class BandcampIE(BandcampBaseIE):
                         'vcodec': 'none',
                     })

-        self._sort_formats(track['formats'])
+        self._sort_formats(formats)

-        title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
+        title = '%s - %s' % (artist, track) if artist else track
+
+        if not duration:
+            duration = float_or_none(self._html_search_meta(
+                'duration', webpage, default=None))

         return {
-            'album': album_title,
-            'artist': artist,
-            'duration': track['duration'],
-            'formats': track['formats'],
-            'id': track['id'],
-            'release_date': release_date,
+            'id': track_id,
+            'title': title,
             'thumbnail': thumbnail,
+            'uploader': artist,
             'timestamp': timestamp,
-            'title': title,
-            'track': track['title'],
-            'track_id': track['id'],
-            'track_number': track['number'],
-            'uploader': artist
+            'release_date': unified_strdate(tralbum.get('album_release_date')),
+            'duration': duration,
+            'track': track,
+            'track_number': track_number,
+            'track_id': track_id,
+            'artist': artist,
+            'album': embed.get('album_title'),
+            'formats': formats,
         }


-class BandcampAlbumIE(BandcampBaseIE):
+class BandcampAlbumIE(BandcampIE):
     IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'

     _TESTS = [{
         'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
             'info_dict': {
                 'id': '1353101989',
                 'ext': 'mp3',
-                'title': 'Intro',
+                'title': 'Blazo - Intro',
+                'timestamp': 1311756226,
+                'upload_date': '20110727',
+                'uploader': 'Blazo',
             }
         },
         {
@@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
             'info_dict': {
                 'id': '38097443',
                 'ext': 'mp3',
-                'title': 'Kero One - Keep It Alive (Blazo remix)',
+                'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
+                'timestamp': 1311757238,
+                'upload_date': '20110727',
+                'uploader': 'Blazo',
             }
         },
     ],
@@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
             'title': '"Entropy" EP',
             'uploader_id': 'jstrecords',
             'id': 'entropy-ep',
+            'description': 'md5:0ff22959c943622972596062f2f366a5',
         },
         'playlist_mincount': 3,
     }, {
@@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
             'id': 'we-are-the-plague',
             'title': 'WE ARE THE PLAGUE',
             'uploader_id': 'insulters',
+            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
         },
         'playlist_count': 2,
     }]
@@ -312,41 +292,34 @@ class BandcampAlbumIE(BandcampBaseIE):
                 else super(BandcampAlbumIE, cls).suitable(url))

     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        uploader_id = mobj.group('subdomain')
-        album_id = mobj.group('album_id')
+        uploader_id, album_id = re.match(self._VALID_URL, url).groups()
         playlist_id = album_id or uploader_id
         webpage = self._download_webpage(url, playlist_id)
-
-        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
-        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
-
-        json_tracks = json_tralbum.get('trackinfo')
-        if not json_tracks:
-            raise ExtractorError('Could not extract album tracks')
-
-        album_title = json_embed.get('album_title')
-
+        tralbum = self._extract_data_attr(webpage, playlist_id)
+        track_info = tralbum.get('trackinfo')
+        if not track_info:
+            raise ExtractorError('The page doesn\'t contain any tracks')
         # Only tracks with duration info have songs
-        tracks = [self._parse_json_track(track) for track in json_tracks]
         entries = [
             self.url_result(
-                compat_urlparse.urljoin(url, track['title_link']),
-                ie=BandcampIE.ie_key(), video_id=track['id'],
-                video_title=track['title'])
-            for track in tracks
-            if track.get('duration')]
+                urljoin(url, t['title_link']), BandcampIE.ie_key(),
+                str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
+            for t in track_info
+            if t.get('duration')]
+
+        current = tralbum.get('current') or {}

         return {
             '_type': 'playlist',
             'uploader_id': uploader_id,
             'id': playlist_id,
-            'title': album_title,
-            'entries': entries
+            'title': current.get('title'),
+            'description': current.get('about'),
+            'entries': entries,
         }


-class BandcampWeeklyIE(InfoExtractor):
+class BandcampWeeklyIE(BandcampIE):
     IE_NAME = 'Bandcamp:weekly'
     _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
     _TESTS = [{
@@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
             'release_date': '20170404',
             'series': 'Bandcamp Weekly',
             'episode': 'Magic Moments',
-            'episode_number': 208,
             'episode_id': '224',
-        }
+        },
+        'params': {
+            'format': 'opus-lo',
+        },
     }, {
         'url': 'https://bandcamp.com/?blah/blah@&show=228',
         'only_matching': True
     }]

     def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        blob = self._parse_json(
-            self._search_regex(
-                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
-                'blob', group='blob'),
-            video_id, transform_source=unescapeHTML)
+        show_id = self._match_id(url)
+        webpage = self._download_webpage(url, show_id)

-        show = blob['bcw_show']
+        blob = self._extract_data_attr(webpage, show_id, 'blob')

-        # This is desired because any invalid show id redirects to `bandcamp.com`
-        # which happens to expose the latest Bandcamp Weekly episode.
-        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+        show = blob['bcw_data'][show_id]

         formats = []
         for format_id, format_url in show['audio_stream'].items():
@@ -408,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):
         if subtitle:
             title += ' - %s' % subtitle

-        episode_number = None
-        seq = blob.get('bcw_seq')
-
-        if seq and isinstance(seq, list):
-            try:
-                episode_number = next(
-                    int_or_none(e.get('episode_number'))
-                    for e in seq
-                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
-            except StopIteration:
-                pass
-
         return {
-            'id': video_id,
+            'id': show_id,
             'title': title,
             'description': show.get('desc') or show.get('short_desc'),
             'duration': float_or_none(show.get('audio_duration')),
@@ -429,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):
             'release_date': unified_strdate(show.get('published_date')),
             'series': 'Bandcamp Weekly',
             'episode': show.get('subtitle'),
-            'episode_number': episode_number,
-            'episode_id': compat_str(video_id),
+            'episode_id': show_id,
             'formats': formats
         }
@@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE):
         group_id = self._search_regex(
             r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
             webpage, 'group id', default=None)
-        if playlist_id:
+        if group_id:
             return self.url_result(
                 'https://www.bbc.co.uk/programmes/%s' % group_id,
                 ie=BBCCoUkIE.ie_key())
@@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE):
             self._search_regex(
                 r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
                 'bbcthree config', default='{}'),
-            playlist_id, transform_source=js_to_json, fatal=False)
-        if bbc3_config:
+            playlist_id, transform_source=js_to_json, fatal=False) or {}
+        payload = bbc3_config.get('payload') or {}
+        if payload:
+            clip = payload.get('currentClip') or {}
+            clip_vpid = clip.get('vpid')
+            clip_title = clip.get('title')
+            if clip_vpid and clip_title:
+                formats, subtitles = self._download_media_selector(clip_vpid)
+                self._sort_formats(formats)
+                return {
+                    'id': clip_vpid,
+                    'title': clip_title,
+                    'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
+                    'description': clip.get('description'),
+                    'duration': parse_duration(clip.get('duration')),
+                    'formats': formats,
+                    'subtitles': subtitles,
+                }
             bbc3_playlist = try_get(
-                bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
+                payload, lambda x: x['content']['bbcMedia']['playlist'],
                 dict)
             if bbc3_playlist:
                 playlist_title = bbc3_playlist.get('title') or playlist_title
@@ -1118,6 +1134,39 @@ class BBCIE(BBCCoUkIE):
             return self.playlist_result(
                 entries, playlist_id, playlist_title, playlist_description)

+        initial_data = self._parse_json(self._search_regex(
+            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+            'preload state', default='{}'), playlist_id, fatal=False)
+        if initial_data:
+            def parse_media(media):
+                if not media:
+                    return
+                for item in (try_get(media, lambda x: x['media']['items'], list) or []):
+                    item_id = item.get('id')
+                    item_title = item.get('title')
+                    if not (item_id and item_title):
+                        continue
+                    formats, subtitles = self._download_media_selector(item_id)
+                    self._sort_formats(formats)
+                    entries.append({
+                        'id': item_id,
+                        'title': item_title,
+                        'thumbnail': item.get('holdingImageUrl'),
+                        'formats': formats,
+                        'subtitles': subtitles,
+                    })
+            for resp in (initial_data.get('data') or {}).values():
+                name = resp.get('name')
+                if name == 'media-experience':
+                    parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
+                elif name == 'article':
+                    for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+                        if block.get('type') != 'media':
+                            continue
+                        parse_media(block.get('model'))
+            return self.playlist_result(
+                entries, playlist_id, playlist_title, playlist_description)
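
> Note: the new block walks the `window.__INITIAL_DATA__` state that modern BBC pages embed. The same traversal on an invented minimal payload, to make the expected shape concrete (real pages carry much more state around these keys):

```python
initial_data = {
    'data': {
        'some-response-key': {
            'name': 'article',
            'data': {'blocks': [
                {'type': 'media', 'model': {'media': {'items': [
                    {'id': 'p08v2kz9', 'title': 'Clip title'},
                ]}}},
            ]},
        },
    },
}

for resp in (initial_data.get('data') or {}).values():
    if resp.get('name') == 'article':
        for block in resp['data']['blocks']:
            if block.get('type') == 'media':
                for item in block['model']['media']['items']:
                    print(item['id'], item['title'])  # p08v2kz9 Clip title
```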

         def extract_all(pattern):
             return list(filter(None, map(
                 lambda s: self._parse_json(s, playlist_id, fatal=False),
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    parse_iso8601,
+    # try_get,
+    update_url_query,
+)
+
+
+class BoxIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
+        'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
+        'info_dict': {
+            'id': '510727257538',
+            'ext': 'mp4',
+            'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4',
+            'uploader': 'MLS Video',
+            'timestamp': 1566320259,
+            'upload_date': '20190820',
+            'uploader_id': '235196876',
+        }
+    }
+
+    def _real_extract(self, url):
+        shared_name, file_id = re.match(self._VALID_URL, url).groups()
+        webpage = self._download_webpage(url, file_id)
+        request_token = self._parse_json(self._search_regex(
+            r'Box\.config\s*=\s*({.+?});', webpage,
+            'Box config'), file_id)['requestToken']
+        access_token = self._download_json(
+            'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
+            'Downloading token JSON metadata',
+            data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
+                'Content-Type': 'application/json',
+                'X-Request-Token': request_token,
+                'X-Box-EndUser-API': 'sharedName=' + shared_name,
+            })[file_id]['read']
+        shared_link = 'https://app.box.com/s/' + shared_name
+        f = self._download_json(
+            'https://api.box.com/2.0/files/' + file_id, file_id,
+            'Downloading file JSON metadata', headers={
+                'Authorization': 'Bearer ' + access_token,
+                'BoxApi': 'shared_link=' + shared_link,
+                'X-Rep-Hints': '[dash]',  # TODO: extract `hls` formats
+            }, query={
+                'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size'
+            })
+        title = f['name']
+
+        query = {
+            'access_token': access_token,
+            'shared_link': shared_link
+        }
+
+        formats = []
+
+        # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []):
+        #     entry_url_template = try_get(
+        #         entry, lambda x: x['content']['url_template'])
+        #     if not entry_url_template:
+        #         continue
+        #     representation = entry.get('representation')
+        #     if representation == 'dash':
+        #         TODO: append query to every fragment URL
+        #         formats.extend(self._extract_mpd_formats(
+        #             entry_url_template.replace('{+asset_path}', 'manifest.mpd'),
+        #             file_id, query=query))
+
+        authenticated_download_url = f.get('authenticated_download_url')
+        if authenticated_download_url and f.get('is_download_available'):
+            formats.append({
+                'ext': f.get('extension') or determine_ext(title),
+                'filesize': f.get('size'),
+                'format_id': 'download',
+                'url': update_url_query(authenticated_download_url, query),
+            })
+
+        self._sort_formats(formats)
+
+        creator = f.get('created_by') or {}
+
+        return {
+            'id': file_id,
+            'title': title,
+            'formats': formats,
+            'description': f.get('description') or None,
+            'uploader': creator.get('name'),
+            'timestamp': parse_iso8601(f.get('created_at')),
+            'uploader_id': creator.get('id'),
+        }
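
> Note: the interesting part of the new Box extractor is the two-step token exchange: a page-scraped `requestToken` is traded for a scoped read token, which then authorizes the `files` API call. The first step in plain stdlib form — a sketch only, reusing exactly the endpoint and headers from the diff (it performs a live POST, so treat it as illustrative):

```python
import json
from urllib.request import Request, urlopen


def get_read_token(shared_name, file_id, request_token):
    # Mirrors the extractor's token request; request_token comes from
    # the Box.config blob scraped off the shared-link page.
    req = Request(
        'https://app.box.com/app-api/enduserapp/elements/tokens',
        data=json.dumps({'fileIDs': [file_id]}).encode(),
        headers={
            'Content-Type': 'application/json',
            'X-Request-Token': request_token,
            'X-Box-EndUser-API': 'sharedName=' + shared_name,
        })
    return json.loads(urlopen(req).read())[file_id]['read']
```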
@@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor):
     ]

     @classmethod
-    def _build_brighcove_url(cls, object_str):
+    def _build_brightcove_url(cls, object_str):
         """
         Build a Brightcove url from a xml string containing
         <object class="BrightcoveExperience">{params}</object>
@@ -217,7 +217,7 @@ class BrightcoveLegacyIE(InfoExtractor):
         return cls._make_brightcove_url(params)

     @classmethod
-    def _build_brighcove_url_from_js(cls, object_js):
+    def _build_brightcove_url_from_js(cls, object_js):
         # The layout of JS is as follows:
         # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
         #   // build Brightcove <object /> XML
@@ -272,12 +272,12 @@ class BrightcoveLegacyIE(InfoExtractor):
             ).+?>\s*</object>''',
             webpage)
         if matches:
-            return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+            return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))

         matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
         if matches:
             return list(filter(None, [
-                cls._build_brighcove_url_from_js(custom_bc)
+                cls._build_brightcove_url_from_js(custom_bc)
                 for custom_bc in matches]))
         return [src for _, src in re.findall(
             r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
@@ -5,10 +5,16 @@ import codecs | |||
import re | |||
from .common import InfoExtractor | |||
from ..compat import ( | |||
compat_chr, | |||
compat_ord, | |||
compat_urllib_parse_unquote, | |||
) | |||
from ..utils import ( | |||
ExtractorError, | |||
float_or_none, | |||
int_or_none, | |||
merge_dicts, | |||
multipart_encode, | |||
parse_duration, | |||
random_birthday, | |||
@@ -107,8 +113,9 @@ class CDAIE(InfoExtractor): | |||
r'Odsłony:(?:\s| )*([0-9]+)', webpage, | |||
'view_count', default=None) | |||
average_rating = self._search_regex( | |||
r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', | |||
webpage, 'rating', fatal=False, group='rating_value') | |||
(r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', | |||
r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, | |||
group='rating_value') | |||
info_dict = { | |||
'id': video_id, | |||
@@ -123,6 +130,24 @@ class CDAIE(InfoExtractor): | |||
'age_limit': 18 if need_confirm_age else 0, | |||
} | |||
# Source: https://www.cda.pl/js/player.js?t=1606154898 | |||
def decrypt_file(a): | |||
for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): | |||
a = a.replace(p, '') | |||
a = compat_urllib_parse_unquote(a) | |||
b = [] | |||
for c in a: | |||
f = compat_ord(c) | |||
b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) | |||
a = ''.join(b) | |||
a = a.replace('.cda.mp4', '') | |||
for p in ('.2cda.pl', '.3cda.pl'): | |||
a = a.replace(p, '.cda.pl') | |||
if '/upstream' in a: | |||
a = a.replace('/upstream', '.mp4/upstream') | |||
return 'https://' + a | |||
return 'https://' + a + '.mp4' | |||
def extract_format(page, version): | |||
json_str = self._html_search_regex( | |||
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, | |||
@@ -141,6 +166,8 @@ class CDAIE(InfoExtractor): | |||
video['file'] = codecs.decode(video['file'], 'rot_13') | |||
if video['file'].endswith('adc.mp4'): | |||
    video['file'] = video['file'].replace('adc.mp4', '.mp4') | |||
elif not video['file'].startswith('http'): | |||
    video['file'] = decrypt_file(video['file']) | |||
f = { | |||
    'url': video['file'], | |||
} | |||
@@ -179,4 +206,6 @@ class CDAIE(InfoExtractor): | |||
self._sort_formats(formats) | |||
return info_dict | |||
info = self._search_json_ld(webpage, video_id, default={}) | |||
return merge_dicts(info_dict, info) |
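As a quick standalone check of the rot_13 branch in extract_format above (the encoded string here is invented, not a real CDA response):
import codecs
assert codecs.decode('uggcf://rknzcyr.pbz/ivqrb.zc4', 'rot_13') == 'https://example.com/video.mp4'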
@@ -1,6 +1,7 @@ | |||
# coding: utf-8 | |||
from __future__ import unicode_literals | |||
import re | |||
from .common import InfoExtractor | |||
from ..utils import smuggle_url | |||
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor): | |||
class CNBCVideoIE(InfoExtractor): | |||
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' | |||
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)' | |||
_TEST = { | |||
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html', | |||
'info_dict': { | |||
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor): | |||
} | |||
def _real_extract(self, url): | |||
display_id = self._match_id(url) | |||
webpage = self._download_webpage(url, display_id) | |||
video_id = self._search_regex( | |||
r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, | |||
'video id') | |||
path, display_id = re.match(self._VALID_URL, url).groups() | |||
video_id = self._download_json( | |||
    'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ | |||
        'query': '''{ | |||
  page(path: "%s") { | |||
    vcpsId | |||
  } | |||
}''' % path, | |||
    })['data']['page']['vcpsId'] | |||
return self.url_result( | |||
'http://video.cnbc.com/gallery/?video=%s' % video_id, | |||
'http://video.cnbc.com/gallery/?video=%d' % video_id, | |||
CNBCIE.ie_key()) |
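A minimal standalone sketch of the same GraphQL lookup, assuming only what the change itself shows (endpoint, query shape, and response path); the helper name is hypothetical:
import json
import urllib.parse
import urllib.request

def cnbc_vcps_id(path):
    # Same query the extractor sends; the API returns the numeric vcpsId.
    query = '{ page(path: "%s") { vcpsId } }' % path
    url = ('https://webql-redesign.cnbcfm.com/graphql?'
           + urllib.parse.urlencode({'query': query}))
    with urllib.request.urlopen(url) as resp:
        return json.load(resp)['data']['page']['vcpsId']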
@@ -1456,9 +1456,10 @@ class InfoExtractor(object): | |||
try: | |||
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) | |||
return True | |||
except ExtractorError: | |||
except ExtractorError as e: | |||
self.to_screen( | |||
'%s: %s URL is invalid, skipping' % (video_id, item)) | |||
'%s: %s URL is invalid, skipping: %s' | |||
% (video_id, item, error_to_compat_str(e.cause))) | |||
return False | |||
def http_scheme(self): | |||
@@ -1663,7 +1664,7 @@ class InfoExtractor(object): | |||
# just the media without quality renditions. | |||
# Fortunately, a master playlist can easily be distinguished from a media | |||
# playlist based on the availability of particular tags. As of [1, 4.3.3, 4.3.4] | |||
# master playlist tags MUST NOT appear in a media playist and vice versa. | |||
# master playlist tags MUST NOT appear in a media playlist and vice versa. | |||
# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every | |||
# media playlist and MUST NOT appear in master playlist thus we can | |||
# clearly detect media playlist with this criterion. | |||
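A minimal sketch of that criterion (following the spec cited as [1] above):
def is_media_playlist(m3u8_doc):
    # EXT-X-TARGETDURATION is REQUIRED in media playlists and MUST NOT
    # appear in master playlists, so its presence identifies a media playlist.
    return '#EXT-X-TARGETDURATION' in m3u8_doc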
@@ -2596,6 +2597,7 @@ class InfoExtractor(object): | |||
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): | |||
formats = [] | |||
hdcore_sign = 'hdcore=3.7.0' | |||
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') | |||
hds_host = hosts.get('hds') | |||
@@ -2608,6 +2610,7 @@ class InfoExtractor(object): | |||
for entry in f4m_formats: | |||
entry.update({'extra_param_to_segment_url': hdcore_sign}) | |||
formats.extend(f4m_formats) | |||
m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') | |||
hls_host = hosts.get('hls') | |||
if hls_host: | |||
@@ -2615,6 +2618,31 @@ class InfoExtractor(object): | |||
formats.extend(self._extract_m3u8_formats( | |||
m3u8_url, video_id, 'mp4', 'm3u8_native', | |||
m3u8_id='hls', fatal=False)) | |||
http_host = hosts.get('http') | |||
if http_host and 'hdnea=' not in manifest_url: | |||
    REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' | |||
    qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') | |||
    qualities_length = len(qualities) | |||
    if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): | |||
        i = 0 | |||
        http_formats = [] | |||
        for f in formats: | |||
            if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': | |||
                for protocol in ('http', 'https'): | |||
                    http_f = f.copy() | |||
                    del http_f['manifest_url'] | |||
                    http_url = re.sub( | |||
                        REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) | |||
                    http_f.update({ | |||
                        'format_id': http_f['format_id'].replace('hls-', protocol + '-'), | |||
                        'url': http_url, | |||
                        'protocol': protocol, | |||
                    }) | |||
                    http_formats.append(http_f) | |||
                i += 1 | |||
        formats.extend(http_formats) | |||
return formats | |||
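To illustrate the rewrite above, here is the substitution applied to a made-up Akamai-style URL (host names and paths are invented):
import re

REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
m3u8_url = 'https://example-vh.akamaihd.net/i/videos/clip_,360,720,.mp4.csmil/master.m3u8'
qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')  # ['360', '720']
http_url = re.sub(REPL_REGEX, r'https://cdn.example.com/\g<1>360\3', m3u8_url)
# http_url == 'https://cdn.example.com/videos/clip_360.mp4'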
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): | |||
@@ -16,6 +16,8 @@ from ..utils import ( | |||
mimetype2ext, | |||
orderedSet, | |||
parse_iso8601, | |||
strip_or_none, | |||
try_get, | |||
) | |||
@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor): | |||
'uploader': 'gq', | |||
'upload_date': '20170321', | |||
'timestamp': 1490126427, | |||
'description': 'How much grimmer would things be if these people were competent?', | |||
}, | |||
}, { | |||
# JS embed | |||
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor): | |||
'title': '3D printed TSA Travel Sentry keys really do open TSA locks', | |||
'uploader': 'arstechnica', | |||
'upload_date': '20150916', | |||
'timestamp': 1442434955, | |||
'timestamp': 1442434920, | |||
} | |||
}, { | |||
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', | |||
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor): | |||
}) | |||
self._sort_formats(formats) | |||
subtitles = {} | |||
for t, caption in video_info.get('captions', {}).items(): | |||
    caption_url = caption.get('src') | |||
    if not (t in ('vtt', 'srt', 'tml') and caption_url): | |||
        continue | |||
    subtitles.setdefault('en', []).append({'url': caption_url}) | |||
return { | |||
'id': video_id, | |||
'formats': formats, | |||
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor): | |||
'season': video_info.get('season_title'), | |||
'timestamp': parse_iso8601(video_info.get('premiere_date')), | |||
'categories': video_info.get('categories'), | |||
'subtitles': subtitles, | |||
} | |||
def _real_extract(self, url): | |||
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor): | |||
if url_type == 'series': | |||
return self._extract_series(url, webpage) | |||
else: | |||
params = self._extract_video_params(webpage, display_id) | |||
info = self._search_json_ld( | |||
webpage, display_id, fatal=False) | |||
video = try_get(self._parse_json(self._search_regex( | |||
r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, | |||
'preload state', '{}'), display_id), | |||
lambda x: x['transformed']['video']) | |||
if video: | |||
params = {'videoId': video['id']} | |||
info = {'description': strip_or_none(video.get('description'))} | |||
else: | |||
params = self._extract_video_params(webpage, display_id) | |||
info = self._search_json_ld( | |||
webpage, display_id, fatal=False) | |||
info.update(self._extract_video(params)) | |||
return info |
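A standalone sketch of the __PRELOADED_STATE__ handling added above; the embedded JSON is invented to show the shape the code expects:
import json
import re

webpage = ('<script>window.__PRELOADED_STATE__ = '
           '{"transformed": {"video": {"id": "abc123", '
           '"description": "Example description"}}};</script>')
m = re.search(r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage)
state = json.loads(m.group(1)) if m else {}
video = (state.get('transformed') or {}).get('video')  # what try_get does above
if video:
    params = {'videoId': video['id']}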
@@ -7,7 +7,7 @@ from .dplay import DPlayIE | |||
class DiscoveryNetworksDeIE(DPlayIE): | |||
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' | |||
_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' | |||
_TESTS = [{ | |||
'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', | |||
@@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE): | |||
}, { | |||
'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', | |||
'only_matching': True, | |||
}, { | |||
'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', | |||
'only_matching': True, | |||
}] | |||
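The widened pattern can be exercised directly; a quick sketch against the new URL shape (the named groups come from the regex above):
import re

_VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)'
m = re.match(_VALID_URL, 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/')
assert m.group('domain', 'programme', 'alternate_id') == ('tlc.de', 'breaking-amish', 'die-welt-da-drauen')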
def _real_extract(self, url): | |||
@@ -60,7 +60,7 @@ class EuropaIE(InfoExtractor): | |||
title = get_item('title', preferred_langs) or video_id | |||
description = get_item('description', preferred_langs) | |||
thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') | |||
thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail') | |||
upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date')) | |||
duration = parse_duration(xpath_text(playlist, './info/duration', 'duration')) | |||
view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) | |||
@@ -85,7 +85,7 @@ class EuropaIE(InfoExtractor): | |||
'id': video_id, | |||
'title': title, | |||
'description': description, | |||
'thumbnail': thumbnmail, | |||
'thumbnail': thumbnail, | |||
'upload_date': upload_date, | |||
'duration': duration, | |||
'view_count': view_count, | |||
@@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE | |||
from .airmozilla import AirMozillaIE | |||
from .aljazeera import AlJazeeraIE | |||
from .alphaporno import AlphaPornoIE | |||
from .amara import AmaraIE | |||
from .alura import ( | |||
AluraIE, | |||
AluraCourseIE | |||
@@ -62,7 +63,7 @@ from .ard import ( | |||
ARDMediathekIE, | |||
) | |||
from .arte import ( | |||
ArteTVPlus7IE, | |||
ArteTVIE, | |||
ArteTVEmbedIE, | |||
ArteTVPlaylistIE, | |||
) | |||
@@ -129,6 +130,7 @@ from .blinkx import BlinkxIE | |||
from .bloomberg import BloombergIE | |||
from .bokecc import BokeCCIE | |||
from .bostonglobe import BostonGlobeIE | |||
from .box import BoxIE | |||
from .bpb import BpbIE | |||
from .br import ( | |||
BRIE, | |||
@@ -546,6 +548,7 @@ from .laola1tv import ( | |||
EHFTVIE, | |||
ITTFIE, | |||
) | |||
from .lbry import LBRYIE | |||
from .lci import LCIIE | |||
from .lcp import ( | |||
LcpPlayIE, | |||
@@ -621,6 +624,7 @@ from .markiza import ( | |||
from .massengeschmacktv import MassengeschmackTVIE | |||
from .matchtv import MatchTVIE | |||
from .mdr import MDRIE | |||
from .medaltv import MedalTVIE | |||
from .mediaset import MediasetIE | |||
from .mediasite import ( | |||
MediasiteIE, | |||
@@ -803,6 +807,7 @@ from .ntvru import NTVRuIE |