Merge branch 'master' into master

pull/211/head
Tom-Oliver Heidel 2020-11-30 02:32:58 +01:00 committed by GitHub
commit 9693a34773
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
75 changed files with 3796 additions and 2302 deletions

View File

@ -217,6 +217,8 @@ I will add some memorable short links to the binaries so you can download them e
--download-archive FILE Download only videos not listed in the
archive file. Record the IDs of all
downloaded videos in it.
--break-on-existing Stop the download process after attempting
to download a file that's in the archive.
--include-ads Download advertisements as well
(experimental)

View File

@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s
# find the correct sorting and add the required base classes so that sublcasses
# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []

View File

@ -59,9 +59,9 @@
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- **arte.tv:+7**
- **arte.tv:embed**
- **arte.tv:playlist**
- **ArteTV**
- **ArteTVEmbed**
- **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
@ -104,12 +104,14 @@
- **BIQLE**
- **BitChute**
- **BitChuteChannel**
- **bitwave.tv**
- **BleacherReport**
- **BleacherReportCMS**
- **blinkx**
- **Bloomberg**
- **BokeCC**
- **BostonGlobe**
- **Box**
- **Bpb**: Bundeszentrale für politische Bildung
- **BR**: Bayerischer Rundfunk
- **BravoTV**
@ -157,6 +159,7 @@
- **Chilloutzone**
- **chirbit**
- **chirbit:profile**
- **cielotv.it**
- **Cinchcast**
- **Cinemax**
- **CiscoLiveSearch**
@ -424,6 +427,7 @@
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
- **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@ -474,6 +478,7 @@
- **massengeschmack.tv**
- **MatchTV**
- **MDR**: MDR.DE and KiKA
- **MedalTV**
- **media.ccc.de**
- **media.ccc.de:lists**
- **Medialaan**
@ -617,6 +622,7 @@
- **Nuvid**
- **NYTimes**
- **NYTimesArticle**
- **NYTimesCooking**
- **NZZ**
- **ocw.mit.edu**
- **OdaTV**
@ -669,6 +675,8 @@
- **PicartoVod**
- **Piksel**
- **Pinkbike**
- **Pinterest**
- **PinterestCollection**
- **Pladform**
- **Platzi**
- **PlatziCourse**
@ -765,6 +773,7 @@
- **RTVNH**
- **RTVS**
- **RUHD**
- **RumbleEmbed**
- **rutube**: Rutube videos
- **rutube:channel**: Rutube channels
- **rutube:embed**: Rutube embedded videos
@ -835,12 +844,14 @@
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
- **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
- **SportDeutschland**
- **Spreaker**
- **SpreakerPage**
- **SpreakerShow**
- **SpreakerShowPage**
- **SpringboardPlatform**
- **Sprout**
- **sr:mediathek**: Saarländischer Rundfunk
@ -944,6 +955,7 @@
- **TV2DKBornholmPlay**
- **TV4**: tv4.se and tv4play.se
- **TV5MondePlus**: TV5MONDE+
- **tv8.it**
- **TVA**
- **TVANouvelles**
- **TVANouvellesArticle**
@ -1058,7 +1070,7 @@
- **vk:wallpost**
- **vlive**
- **vlive:channel**
- **vlive:playlist**
- **vlive:post**
- **Vodlocker**
- **VODPl**
- **VODPlatform**
@ -1147,20 +1159,17 @@
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- **youtube:channel**: YouTube.com channels
- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication)
- **youtube:history**: YouTube watch history, ":ythistory" for short (requires authentication)
- **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- **youtube:search**: YouTube.com searches
- **youtube:search:date**: YouTube.com searches, newest videos first
- **youtube:search**: YouTube.com searches, "ytsearch" keyword
- **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword
- **youtube:search_url**: YouTube.com search URLs
- **youtube:show**: YouTube.com (multi-season) shows
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)
- **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: YouTube watch later list, ":ytwatchlater" for short (requires authentication)
- **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword
- **Zapiks**
- **Zaq1**
- **Zattoo**

View File

@ -37,7 +37,7 @@
"writeinfojson": true,
"writesubtitles": false,
"allsubtitles": false,
"listssubtitles": false,
"listsubtitles": false,
"socket_timeout": 20,
"fixup": "never"
}

View File

@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(downloaded['extractor'], 'testex')
self.assertEqual(downloaded['extractor_key'], 'TestEx')
# Test case for https://github.com/ytdl-org/youtube-dl/issues/27064
def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self):
    """Playlist entries from a generator must honour ``ignoreerrors``.

    The playlist extractor yields three ``url_transparent`` entries
    lazily. Video 0 raises during extraction, video 1 lacks the requested
    ``extra`` format, and only video 2 can be downloaded. The failing
    entries must come back as ``None`` without aborting the playlist.
    """
    class _YDL(YDL):
        def __init__(self, *args, **kwargs):
            super(_YDL, self).__init__(*args, **kwargs)

        def trouble(self, s, tb=None):
            # Swallow reported errors so the run can continue.
            pass

    ydl = _YDL({'format': 'extra', 'ignoreerrors': True})

    class VideoIE(InfoExtractor):
        _VALID_URL = r'video:(?P<id>\d+)'

        def _real_extract(self, url):
            video_id = self._match_id(url)
            if video_id == '0':
                raise ExtractorError('foo')
            available_formats = [{
                'format_id': 'default',
                'url': 'url:',
            }]
            if video_id == '2':
                # Only this video offers the format selected above.
                available_formats.append({
                    'format_id': 'extra',
                    'url': TEST_URL,
                })
            return {
                'id': video_id,
                'title': 'Video %s' % video_id,
                'formats': available_formats,
            }

    class PlaylistIE(InfoExtractor):
        _VALID_URL = r'playlist:'

        def _entries(self):
            # Deliberately a generator: the case under test is lazy entries.
            for index in range(3):
                video_id = compat_str(index)
                yield {
                    '_type': 'url_transparent',
                    'ie_key': VideoIE.ie_key(),
                    'id': video_id,
                    'url': 'video:%s' % video_id,
                    'title': 'Video Transparent %s' % video_id,
                }

        def _real_extract(self, url):
            return self.playlist_result(self._entries())

    for extractor_cls in (VideoIE, PlaylistIE):
        ydl.add_info_extractor(extractor_cls(ydl))

    playlist_info = ydl.extract_info('playlist:')
    entries = playlist_info['entries']
    self.assertEqual(len(entries), 3)
    self.assertTrue(entries[0] is None)
    self.assertTrue(entries[1] is None)
    self.assertEqual(len(ydl.downloaded_info_dicts), 1)

    downloaded = ydl.downloaded_info_dicts[0]
    self.assertEqual(entries[2], downloaded)
    expected_fields = (
        ('url', TEST_URL),
        ('title', 'Video Transparent 2'),
        ('id', '2'),
        ('extractor', 'Video'),
        ('extractor_key', 'Video'),
    )
    for field, expected in expected_fields:
        self.assertEqual(downloaded[field], expected)
if __name__ == '__main__':
unittest.main()

View File

@ -31,45 +31,47 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('PL63F0C78739B09958')
assertTab('https://www.youtube.com/AsapSCIENCE')
assertTab('https://www.youtube.com/embedded')
assertTab('https://www.youtube.com/feed') # Own channel's home page
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
# self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
def test_youtube_user_matching(self):
self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
# def test_youtube_user_matching(self):
# self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
# def test_youtube_search_matching(self):
# self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
# self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)

View File

@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
# Just drop ! prefix for now though this results in a wrong value
on = js_to_json('''{
a: !0,
b: !1,
c: !!0,
d: !!42.42,
e: !!![],
f: !"abc",
g: !"",
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'e': [],
'f': "abc",
'g': "",
'42': 42
})
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})
on = js_to_json('{ "0x40": "0x40" }')
self.assertEqual(json.loads(on), {'0x40': '0x40'})
on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')

View File

@ -210,6 +210,8 @@ class YoutubeDL(object):
download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
break_on_existing: Stop the download process after attempting to download a file that's
in the archive.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
@ -821,31 +823,30 @@ class YoutubeDL(object):
if not ie.suitable(url):
continue
ie = self.get_info_extractor(ie.ie_key())
ie_key = ie.ie_key()
ie = self.get_info_extractor(ie_key)
if not ie.working():
self.report_warning('The program functionality for this site has been marked as broken, '
'and will probably not work.')
try:
ie_result = ie.extract(url)
if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
break
if isinstance(ie_result, list):
# Backwards compatibility: old IE result format
ie_result = {
'_type': 'compat_list',
'entries': ie_result,
}
if info_dict:
if info_dict.get('id'):
ie_result['id'] = info_dict['id']
if info_dict.get('title'):
ie_result['title'] = info_dict['title']
self.add_default_extra_info(ie_result, ie, url)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
return ie_result
temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url)
except (AssertionError, IndexError, AttributeError):
temp_id = None
if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
self.to_screen("[%s] %s: has already been recorded in archive" % (
ie_key, temp_id))
break
return self.__extract_info(url, ie, download, extra_info, process, info_dict)
else:
self.report_error('no suitable InfoExtractor for URL %s' % url)
def __handle_extraction_exceptions(func):
def wrapper(self, *args, **kwargs):
try:
return func(self, *args, **kwargs)
except GeoRestrictedError as e:
msg = e.msg
if e.countries:
@ -853,20 +854,38 @@ class YoutubeDL(object):
map(ISO3166Utils.short2full, e.countries))
msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
self.report_error(msg)
break
except ExtractorError as e: # An error we somewhat expected
self.report_error(compat_str(e), e.format_traceback())
break
except MaxDownloadsReached:
raise
except Exception as e:
if self.params.get('ignoreerrors', False):
self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
break
else:
raise
return wrapper
@__handle_extraction_exceptions
def __extract_info(self, url, ie, download, extra_info, process, info_dict):
ie_result = ie.extract(url)
if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
return
if isinstance(ie_result, list):
# Backwards compatibility: old IE result format
ie_result = {
'_type': 'compat_list',
'entries': ie_result,
}
if info_dict:
if info_dict.get('id'):
ie_result['id'] = info_dict['id']
if info_dict.get('title'):
ie_result['title'] = info_dict['title']
self.add_default_extra_info(ie_result, ie, url)
if process:
return self.process_ie_result(ie_result, download, extra_info)
else:
self.report_error('no suitable InfoExtractor for URL %s' % url)
return ie_result
def add_default_extra_info(self, ie_result, ie, url):
self.add_extra_info(ie_result, {
@ -1038,12 +1057,15 @@ class YoutubeDL(object):
reason = self._match_entry(entry, incomplete=True)
if reason is not None:
self.to_screen('[download] ' + reason)
continue
if reason.endswith('has already been recorded in the archive') and self.params.get('break_on_existing'):
print('[download] tried downloading a file that\'s already in the archive, stopping since --break-on-existing is set.')
break
else:
self.to_screen('[download] ' + reason)
continue
entry_result = self.process_ie_result(entry,
download=download,
extra_info=extra)
entry_result = self.__process_iterable_entry(entry, download, extra)
# TODO: skip failed (empty) entries?
playlist_results.append(entry_result)
ie_result['entries'] = playlist_results
self.to_screen('[download] Finished downloading playlist: %s' % playlist)
@ -1072,6 +1094,11 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
@__handle_extraction_exceptions
def __process_iterable_entry(self, entry, download, extra_info):
    """Process a single playlist entry via ``process_ie_result``.

    The call is wrapped by ``__handle_extraction_exceptions`` so that a
    failure in one entry goes through the same error handling as a
    top-level extraction.
    """
    entry_result = self.process_ie_result(
        entry, download=download, extra_info=extra_info)
    return entry_result
def _build_format_filter(self, filter_spec):
" Returns a function to filter the formats according to the filter_spec "

View File

@ -405,6 +405,7 @@ def _real_main(argv=None):
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
'download_archive': download_archive_fn,
'break_on_existing': opts.break_on_existing,
'cookiefile': opts.cookiefile,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,

View File

@ -2345,7 +2345,7 @@ except ImportError: # Python <3.4
# HTMLParseError has been deprecated in Python 3.3 and removed in
# Python 3.5. Introducing dummy exception for Python >=3.5 for compatible
# and uniform cross-version exceptiong handling
# and uniform cross-version exception handling
class compat_HTMLParseError(Exception):
pass

View File

@ -97,12 +97,15 @@ class FragmentFD(FileDownloader):
def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
success = ctx['dl'].download(fragment_filename, {
fragment_info_dict = {
'url': frag_url,
'http_headers': headers or info_dict.get('http_headers'),
})
}
success = ctx['dl'].download(fragment_filename, fragment_info_dict)
if not success:
return False, None
if fragment_info_dict.get('filetime'):
ctx['fragment_filetime'] = fragment_info_dict.get('filetime')
down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
frag_content = down.read()
@ -258,6 +261,13 @@ class FragmentFD(FileDownloader):
downloaded_bytes = ctx['complete_frags_downloaded_bytes']
else:
self.try_rename(ctx['tmpfilename'], ctx['filename'])
if self.params.get('updatetime', True):
filetime = ctx.get('fragment_filetime')
if filetime:
try:
os.utime(ctx['filename'], (time.time(), filetime))
except Exception:
pass
downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))
self._hook_progress({

View File

@ -109,7 +109,9 @@ class HttpFD(FileDownloader):
try:
ctx.data = self.ydl.urlopen(request)
except (compat_urllib_error.URLError, ) as err:
if isinstance(err.reason, socket.timeout):
# reason may not be available, e.g. for urllib2.HTTPError on Python 2.6
reason = getattr(err, 'reason', None)
if isinstance(reason, socket.timeout):
raise RetryDownload(err)
raise err
# When trying to resume, Content-Range HTTP header of response has to be checked

View File

@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s video does not exist' % video_id, expected=True)
'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()

View File

@ -0,0 +1,103 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
from .youtube import YoutubeIE
from .vimeo import VimeoIE
from ..utils import (
int_or_none,
parse_iso8601,
update_url_query,
)
class AmaraIE(InfoExtractor):
    """Extractor for amara.org video pages.

    Metadata and subtitle tracks come from the Amara JSON API. When the
    underlying media URL belongs to YouTube or Vimeo, the result is
    returned as ``url_transparent`` so that extractor resolves the media.
    """
    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
    _TESTS = [{
        # Youtube
        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
        'info_dict': {
            'id': 'h6ZuVdvYnfE',
            'ext': 'mp4',
            'title': 'Why jury trials are becoming less common',
            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20160813',
            'uploader': 'PBS NewsHour',
            'uploader_id': 'PBSNewsHour',
            'timestamp': 1549639570,
        }
    }, {
        # Vimeo
        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
        'md5': '99392c75fa05d432a8f11df03612195e',
        'info_dict': {
            'id': '18622084',
            'ext': 'mov',
            'title': 'Vimeo at CES 2011!',
            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'timestamp': 1294763658,
            'upload_date': '20110111',
            'uploader': 'Sam Morrill',
            'uploader_id': 'sammorrill'
        }
    }, {
        # Direct Link
        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
        'md5': 'd3970f08512738ee60c5807311ff5d3f',
        'info_dict': {
            'id': 's8KL7I3jLmh6',
            'ext': 'mp4',
            'title': 'The danger of a single story',
            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
            'thumbnail': r're:^https?://.*\.jpg$',
            'subtitles': dict,
            'upload_date': '20091007',
            'timestamp': 1254942511,
        }
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        api_data = self._download_json(
            'https://amara.org/api/videos/%s/' % video_id,
            video_id, query={'format': 'json'})
        title = api_data['title']
        media_url = api_data['all_urls'][0]

        # One subtitle list per language code, one entry per export format.
        subtitles = {}
        for lang_entry in api_data.get('languages') or []:
            subs_uri = lang_entry.get('subtitles_uri')
            if subs_uri and lang_entry.get('published'):
                track_list = subtitles.setdefault(
                    lang_entry.get('code') or 'en', [])
                track_list.extend([
                    {
                        'ext': fmt,
                        'url': update_url_query(subs_uri, {'format': fmt}),
                    }
                    for fmt in ('json', 'srt', 'vtt')])

        info = {
            'url': media_url,
            'id': video_id,
            'subtitles': subtitles,
            'title': title,
            'description': api_data.get('description'),
            'thumbnail': api_data.get('thumbnail'),
            'duration': int_or_none(api_data.get('duration')),
            'timestamp': parse_iso8601(api_data.get('created')),
        }

        # Hand over to the first hosting extractor that accepts the URL.
        # Otherwise the URL is treated as a direct media link.
        hosting_ie = next(
            (ie for ie in (YoutubeIE, VimeoIE) if ie.suitable(media_url)),
            None)
        if hosting_ie is not None:
            info['_type'] = 'url_transparent'
            info['ie_key'] = hosting_ie.ie_key()

        return info

View File

@ -4,23 +4,57 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
url_or_none,
)
# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.
class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
_ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
_API_BASE = 'https://api.arte.tv/api/player/v1'
class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')
info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
title = (player_info.get('VTI') or title or player_info['VID']).strip()
title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
format_url = url_or_none(f.get('url'))
streamer = f.get('streamer')
if not format_url and not streamer:
continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):
else:
lang_pref = -1
media_type = f.get('mediaType')
if media_type == 'hls':
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):
'quality': qfunc(f.get('quality')),
}
if f.get('mediaType') == 'rtmp':
if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):
formats.append(format)
self._check_formats(formats, video_id)
self._sort_formats(formats)
info_dict['formats'] = formats
return info_dict
return {
'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
}
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
'id': '088501-000-A',
'id': '100605-013-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
'title': 'United we Stream November Lockdown Edition #13',
'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
},
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]
def _real_extract(self, url):
lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
video_id, lang)
class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''
_TESTS = []
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]
def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(json_url, video_id, lang)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id)
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)

View File

@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals
import random
@ -5,10 +6,7 @@ import re
import time
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@ -17,71 +15,32 @@ from ..utils import (
parse_filesize,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
urljoin,
)
class BandcampBaseIE(InfoExtractor):
"""Provide base functions for Bandcamp extractors"""
def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
json_string = self._html_search_regex(
r' data-%s="([^"]*)' % suffix,
webpage, '%s json' % suffix, default='{}')
return self._parse_json(json_string, video_id)
def _parse_json_track(self, json):
formats = []
file_ = json.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
return {
'duration': float_or_none(json.get('duration')),
'id': str_or_none(json.get('track_id') or json.get('id')),
'title': json.get('title'),
'title_link': json.get('title_link'),
'number': int_or_none(json.get('track_num')),
'formats': formats
}
class BandcampIE(BandcampBaseIE):
IE_NAME = "Bandcamp:track"
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
'timestamp': 1354224127,
'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
},
}]
def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
return self._parse_json(self._html_search_regex(
r'data-%s=(["\'])({.+?})\1' % attr, webpage,
attr + ' data', group=2), video_id, fatal=fatal)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
url_track_title = title
title = self._match_id(url)
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
tralbum = self._extract_data_attr(webpage, title)
thumbnail = self._og_search_thumbnail(webpage)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)
track_id = None
track = None
track_number = None
duration = None
json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract track')
formats = []
track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
track_id = str_or_none(
track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
track = self._parse_json_track(json_tracks[0])
artist = json_tralbum.get('artist')
album_title = json_embed.get('album_title')
embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))
json_album = json_tralbum.get('packages')
if json_album:
json_album = json_album[0]
album_publish_date = json_album.get('album_publish_date')
album_release_date = json_album.get('album_release_date')
else:
album_publish_date = None
album_release_date = json_tralbum.get('album_release_date')
timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
release_date = unified_strdate(album_release_date)
download_link = self._search_regex(
r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
'download link', default=None, group='url')
download_link = tralbum.get('freeDownloadPage')
if download_link:
track_id = self._search_regex(
r'\?id=(?P<id>\d+)&',
download_link, 'track id')
track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
track_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
@ -173,6 +139,8 @@ class BandcampIE(BandcampBaseIE):
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
@ -206,7 +174,7 @@ class BandcampIE(BandcampBaseIE):
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
track['formats'].append({
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
@ -215,30 +183,34 @@ class BandcampIE(BandcampBaseIE):
'vcodec': 'none',
})
self._sort_formats(track['formats'])
self._sort_formats(formats)
title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
title = '%s - %s' % (artist, track) if artist else track
if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))
return {
'album': album_title,
'artist': artist,
'duration': track['duration'],
'formats': track['formats'],
'id': track['id'],
'release_date': release_date,
'thumbnail': thumbnail,
'timestamp': timestamp,
'id': track_id,
'title': title,
'track': track['title'],
'track_id': track['id'],
'track_number': track['number'],
'uploader': artist
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
'formats': formats,
}
class BandcampAlbumIE(BandcampBaseIE):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
'title': 'Intro',
'title': 'Blazo - Intro',
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
{
@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
],
@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@ -312,41 +292,34 @@ class BandcampAlbumIE(BandcampBaseIE):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('subdomain')
album_id = mobj.group('album_id')
uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)