Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog
pull/245/head
pukkandan 2020-11-20 00:52:59 +05:30
parent 228385340e
commit 8bdd16b499
34 changed files with 1828 additions and 1695 deletions

View File

@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s
# find the correct sorting and add the required base classes so that sublcasses
# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []

View File

@ -59,9 +59,9 @@
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- **arte.tv:+7**
- **arte.tv:embed**
- **arte.tv:playlist**
- **ArteTV**
- **ArteTVEmbed**
- **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
@ -424,6 +424,7 @@
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
- **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@ -835,8 +836,6 @@
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
- **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
@ -1147,19 +1146,18 @@
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- **youtube:channel**: YouTube.com channels
- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- **youtube:search**: YouTube.com searches
- **youtube:search:date**: YouTube.com searches, newest videos first
- **youtube:search_url**: YouTube.com search URLs
- **youtube:show**: YouTube.com (multi-season) shows
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **Zapiks**
- **Zaq1**

View File

@ -31,15 +31,17 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('PL63F0C78739B09958')
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
@ -50,26 +52,22 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
def test_youtube_user_matching(self):
self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
# def test_youtube_user_matching(self):
# self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
# def test_youtube_search_matching(self):
# self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
# self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)

View File

@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
# Just drop ! prefix for now though this results in a wrong value
on = js_to_json('''{
a: !0,
b: !1,
c: !!0,
d: !!42.42,
e: !!![],
f: !"abc",
g: !"",
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'e': [],
'f': "abc",
'g': "",
'42': 42
})
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})
on = js_to_json('{ "0x40": "0x40" }')
self.assertEqual(json.loads(on), {'0x40': '0x40'})
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')

View File

@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s video does not exist' % video_id, expected=True)
'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()

View File

@ -4,23 +4,57 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
url_or_none,
)
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
_ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
_API_BASE = 'https://api.arte.tv/api/player/v1'
class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
title = (player_info.get('VTI') or title or player_info['VID']).strip()
title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
format_url = url_or_none(f.get('url'))
streamer = f.get('streamer')
if not format_url and not streamer:
continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):
else:
lang_pref = -1
media_type = f.get('mediaType')
if media_type == 'hls':
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):
'quality': qfunc(f.get('quality')),
}
if f.get('mediaType') == 'rtmp':
if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):
formats.append(format)
self._check_formats(formats, video_id)
self._sort_formats(formats)
info_dict['formats'] = formats
return info_dict
return {
'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
}
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
'id': '088501-000-A',
'id': '100605-013-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
'title': 'United we Stream November Lockdown Edition #13',
'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
},
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
video_id, lang)
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''
_TESTS = []
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]
def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(json_url, video_id, lang)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id)
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import random
@ -5,10 +6,7 @@ import re
import time
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@ -17,71 +15,32 @@ from ..utils import (
parse_filesize,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
urljoin,
)
class BandcampBaseIE(InfoExtractor):
"""Provide base functions for Bandcamp extractors"""
def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
json_string = self._html_search_regex(
r' data-%s="([^"]*)' % suffix,
webpage, '%s json' % suffix, default='{}')
return self._parse_json(json_string, video_id)
def _parse_json_track(self, json):
formats = []
file_ = json.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
return {
'duration': float_or_none(json.get('duration')),
'id': str_or_none(json.get('track_id') or json.get('id')),
'title': json.get('title'),
'title_link': json.get('title_link'),
'number': int_or_none(json.get('track_num')),
'formats': formats
}
class BandcampIE(BandcampBaseIE):
IE_NAME = "Bandcamp:track"
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
'timestamp': 1354224127,
'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
},
}]
def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
return self._parse_json(self._html_search_regex(
r'data-%s=(["\'])({.+?})\1' % attr, webpage,
attr + ' data', group=2), video_id, fatal=fatal)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
url_track_title = title
title = self._match_id(url)
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
tralbum = self._extract_data_attr(webpage, title)
thumbnail = self._og_search_thumbnail(webpage)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
track_id = None
track = None
track_number = None
duration = None
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract track')
formats = []
track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
track_id = str_or_none(
track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
track = self._parse_json_track(json_tracks[0])
artist = json_tralbum.get('artist')
album_title = json_embed.get('album_title')
embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))
json_album = json_tralbum.get('packages')
if json_album:
json_album = json_album[0]
album_publish_date = json_album.get('album_publish_date')
album_release_date = json_album.get('album_release_date')
else:
album_publish_date = None
album_release_date = json_tralbum.get('album_release_date')
timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
release_date = unified_strdate(album_release_date)
download_link = self._search_regex(
r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
'download link', default=None, group='url')
download_link = tralbum.get('freeDownloadPage')
if download_link:
track_id = self._search_regex(
r'\?id=(?P<id>\d+)&',
download_link, 'track id')
track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
track_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
@ -173,6 +139,8 @@ class BandcampIE(BandcampBaseIE):
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
@ -206,7 +174,7 @@ class BandcampIE(BandcampBaseIE):
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
track['formats'].append({
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
@ -215,30 +183,34 @@ class BandcampIE(BandcampBaseIE):
'vcodec': 'none',
})
self._sort_formats(track['formats'])
self._sort_formats(formats)
title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
title = '%s - %s' % (artist, track) if artist else track
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return {
'album': album_title,
'artist': artist,
'duration': track['duration'],
'formats': track['formats'],
'id': track['id'],
'release_date': release_date,
'thumbnail': thumbnail,
'timestamp': timestamp,
'id': track_id,
'title': title,
'track': track['title'],
'track_id': track['id'],
'track_number': track['number'],
'uploader': artist
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
'formats': formats,
}
class BandcampAlbumIE(BandcampBaseIE):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
'title': 'Intro',
'title': 'Blazo - Intro',
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
{
@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
],
@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@ -312,41 +292,34 @@ class BandcampAlbumIE(BandcampBaseIE):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('subdomain')
album_id = mobj.group('album_id')
uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract album tracks')
album_title = json_embed.get('album_title')
tralbum = self._extract_data_attr(webpage, playlist_id)
track_info = tralbum.get('trackinfo')
if not track_info:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
tracks = [self._parse_json_track(track) for track in json_tracks]
entries = [
self.url_result(
compat_urlparse.urljoin(url, track['title_link']),
ie=BandcampIE.ie_key(), video_id=track['id'],
video_title=track['title'])
for track in tracks
if track.get('duration')]
urljoin(url, t['title_link']), BandcampIE.ie_key(),
str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
for t in track_info
if t.get('duration')]
current = tralbum.get('current') or {}
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
'title': album_title,
'entries': entries
'title': current.get('title'),
'description': current.get('about'),
'entries': entries,
}
class BandcampWeeklyIE(InfoExtractor):
class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
'episode_number': 208,
'episode_id': '224',
}
},
'params': {
'format': 'opus-lo',
},
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
show_id = self._match_id(url)
webpage = self._download_webpage(url, show_id)
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
'blob', group='blob'),
video_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(webpage, show_id, 'blob')
show = blob['bcw_show']
# This is desired because any invalid show id redirects to `bandcamp.com`
# which happens to expose the latest Bandcamp Weekly episode.
show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
show = blob['bcw_data'][show_id]
formats = []
for format_id, format_url in show['audio_stream'].items():
@ -408,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):
if subtitle:
title += ' - %s' % subtitle
episode_number = None
seq = blob.get('bcw_seq')
if seq and isinstance(seq, list):
try:
episode_number = next(
int_or_none(e.get('episode_number'))
for e in seq
if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
except StopIteration:
pass
return {
'id': video_id,
'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
@ -429,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
'episode_number': episode_number,
'episode_id': compat_str(video_id),
'episode_id': show_id,
'formats': formats
}

View File

@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import smuggle_url
@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):
class CNBCVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
}
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
'video id')
path, display_id = re.match(self._VALID_URL, url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
page(path: "%s") {
vcpsId
}
}''' % path,
})['data']['page']['vcpsId']
return self.url_result(
'http://video.cnbc.com/gallery/?video=%s' % video_id,
'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())

View File

@ -1456,9 +1456,10 @@ class InfoExtractor(object):
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
except ExtractorError:
except ExtractorError as e:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, item))
'%s: %s URL is invalid, skipping: %s'
% (video_id, item, error_to_compat_str(e.cause)))
return False
def http_scheme(self):

View File

@ -16,6 +16,8 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
strip_or_none,
try_get,
)
@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
'timestamp': 1442434955,
'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
subtitles = {}
for t, caption in video_info.get('captions', {}).items():
caption_url = caption.get('src')
if not (t in ('vtt', 'srt', 'tml') and caption_url):
continue
subtitles.setdefault('en', []).append({'url': caption_url})
return {
'id': video_id,
'formats': formats,
@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
'subtitles': subtitles,
}
def _real_extract(self, url):
@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
video = try_get(self._parse_json(self._search_regex(
r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', '{}'), display_id),
lambda x: x['transformed']['video'])
if video:
params = {'videoId': video['id']}
info = {'description': strip_or_none(video.get('description'))}
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info

View File

@ -62,7 +62,7 @@ from .ard import (
ARDMediathekIE,
)
from .arte import (
ArteTVPlus7IE,
ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
@ -542,6 +542,7 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
@ -1079,8 +1080,7 @@ from .spankbang import (
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
@ -1505,12 +1505,11 @@ from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
YoutubeChannelIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeLiveIE,
YoutubeTabIE,
YoutubePlaylistIE,
YoutubePlaylistsIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
@ -1519,7 +1518,7 @@ from .youtube import (
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeYtUserIE,
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE

View File

@ -17,6 +17,7 @@ from ..utils import (
parse_duration,
try_get,
url_or_none,
urljoin,
)
from .dailymotion import DailymotionIE
@ -128,18 +129,38 @@ class FranceTVIE(InfoExtractor):
is_live = None
formats = []
for video in info['videos']:
if video['statut'] != 'ONLINE':
videos = []
for video in (info.get('videos') or []):
if video.get('statut') != 'ONLINE':
continue
video_url = video['url']
if not video.get('url'):
continue
videos.append(video)
if not videos:
for device_type in ['desktop', 'mobile']:
fallback_info = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
video_id, 'Downloading fallback %s video JSON' % device_type, query={
'device_type': device_type,
'browser': 'chrome',
}, fatal=False)
if fallback_info and fallback_info.get('video'):
videos.append(fallback_info['video'])
formats = []
for video in videos:
video_url = video.get('url')
if not video_url:
continue
if is_live is None:
is_live = (try_get(
video, lambda x: x['plages_ouverture'][0]['direct'],
bool) is True) or '/live.francetv.fr/' in video_url
format_id = video['format']
video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
or video.get('is_live') is True
or '/live.francetv.fr/' in video_url)
format_id = video.get('format')
ext = determine_ext(video_url)
if ext == 'f4m':
if georestricted:
@ -154,6 +175,9 @@ class FranceTVIE(InfoExtractor):
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@ -166,6 +190,7 @@ class FranceTVIE(InfoExtractor):
'url': video_url,
'format_id': format_id,
})
self._sort_formats(formats)
title = info['titre']
@ -185,10 +210,10 @@ class FranceTVIE(InfoExtractor):
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'description': clean_html(info.get('synopsis')),
'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,

View File

@ -91,6 +91,7 @@ from .piksel import PikselIE
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
@ -2760,11 +2761,9 @@ class GenericIE(InfoExtractor):
return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player
mobj = re.search(
r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
arte_urls = ArteTVEmbedIE._extract_urls(webpage)
if arte_urls:
return self.playlist_from_matches(arte_urls, video_id, video_title)
# Look for embedded francetv player
mobj = re.search(

View File

@ -150,7 +150,7 @@ class IqiyiSDKInterpreter(object):
elif function in other_functions:
other_functions[function]()
else:
raise ExtractorError('Unknown funcion %s' % function)
raise ExtractorError('Unknown function %s' % function)
return sdk.target

View File

@ -0,0 +1,88 @@
# coding: utf-8
from __future__ import unicode_literals
import json
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
try_get,
)
class LBRYIE(InfoExtractor):
IE_NAME = 'lbry.tv'
_VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
_TESTS = [{
# Video
'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
'info_dict': {
'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
'ext': 'mp4',
'title': 'First day in LBRY? Start HERE!',
'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
'timestamp': 1595694354,
'upload_date': '20200725',
}
}, {
# Audio
'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
'info_dict': {
'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
'ext': 'mp3',
'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
'description': 'md5:661ac4f1db09f31728931d7b88807a61',
'timestamp': 1591312601,
'upload_date': '20200604',
}
}, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True,
}]
def _call_api_proxy(self, method, display_id, params):
return self._download_json(
'https://api.lbry.tv/api/v1/proxy', display_id,
headers={'Content-Type': 'application/json-rpc'},
data=json.dumps({
'method': method,
'params': params,
}).encode())['result']
def _real_extract(self, url):
display_id = self._match_id(url).replace(':', '#')
uri = 'lbry://' + display_id
result = self._call_api_proxy(
'resolve', display_id, {'urls': [uri]})[uri]
result_value = result['value']
if result_value.get('stream_type') not in ('video', 'audio'):
raise ExtractorError('Unsupported URL', expected=True)
streaming_url = self._call_api_proxy(
'get', display_id, {'uri': uri})['streaming_url']
source = result_value.get('source') or {}
media = result_value.get('video') or result_value.get('audio') or {}
signing_channel = result_value.get('signing_channel') or {}
return {
'id': result['claim_id'],
'title': result_value['title'],
'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
'description': result_value.get('description'),
'license': result_value.get('license'),
'timestamp': int_or_none(result.get('timestamp')),
'tags': result_value.get('tags'),
'width': int_or_none(media.get('width')),
'height': int_or_none(media.get('height')),
'duration': int_or_none(media.get('duration')),
'channel': signing_channel.get('name'),