diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 397d40503..0979252c9 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1794,14 +1794,18 @@ class YoutubeDL(object): if 'display_id' not in info_dict and 'id' in info_dict: info_dict['display_id'] = info_dict['id'] - if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: - # Working around out-of-range timestamp values (e.g. negative ones on Windows, - # see http://bugs.python.org/issue1646728) - try: - upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp']) - info_dict['upload_date'] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass + for ts_key, date_key in ( + ('timestamp', 'upload_date'), + ('release_timestamp', 'release_date'), + ): + if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: + # Working around out-of-range timestamp values (e.g. negative ones on Windows, + # see http://bugs.python.org/issue1646728) + try: + upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) + info_dict[date_key] = upload_date.strftime('%Y%m%d') + except (ValueError, OverflowError, OSError): + pass # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. diff --git a/yt_dlp/extractor/applepodcasts.py b/yt_dlp/extractor/applepodcasts.py index 95758fece..6a74de758 100644 --- a/yt_dlp/extractor/applepodcasts.py +++ b/yt_dlp/extractor/applepodcasts.py @@ -42,6 +42,7 @@ class ApplePodcastsIE(InfoExtractor): ember_data = self._parse_json(self._search_regex( r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', webpage, 'ember data'), episode_id) + ember_data = ember_data.get(episode_id) or ember_data episode = ember_data['data']['attributes'] description = episode.get('description') or {} diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py index 69e673a26..006aab3b4 100644 --- a/yt_dlp/extractor/bandcamp.py +++ b/yt_dlp/extractor/bandcamp.py @@ -49,6 +49,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Ben Prunty', 'timestamp': 1396508491, 'upload_date': '20140403', + 'release_timestamp': 1396483200, 'release_date': '20140403', 'duration': 260.877, 'track': 'Lanius (Battle)', @@ -69,6 +70,7 @@ class BandcampIE(InfoExtractor): 'uploader': 'Mastodon', 'timestamp': 1322005399, 'upload_date': '20111122', + 'release_timestamp': 1076112000, 'release_date': '20040207', 'duration': 120.79, 'track': 'Hail to Fire', @@ -197,7 +199,7 @@ class BandcampIE(InfoExtractor): 'thumbnail': thumbnail, 'uploader': artist, 'timestamp': timestamp, - 'release_date': unified_strdate(tralbum.get('album_release_date')), + 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, 'track': track, 'track_number': track_number, diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c3e0a9262..6fcc4ac93 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -138,11 +138,6 @@ class BiliBiliIE(InfoExtractor): anime_id = mobj.group('anime_id') page_id = mobj.group('page') webpage = self._download_webpage(url, video_id) - headers = { - 'Referer': url, - 'Accept': '*/*' - } - headers.update(self.geo_verification_headers()) if 'anime/' not in url: cid = self._search_regex( @@ -160,8 +155,12 @@ class BiliBiliIE(InfoExtractor): if 'no_bangumi_tip' not in smuggled_data: self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run yt-dlp with %s' % ( video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) + headers = { + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': url + } + headers.update(self.geo_verification_headers()) - headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' js = self._download_json( 'http://bangumi.bilibili.com/web_api/get_source', video_id, data=urlencode_postdata({'episode_id': video_id}), @@ -170,6 +169,12 @@ class BiliBiliIE(InfoExtractor): self._report_error(js) cid = js['result']['cid'] + headers = { + 'Accept': 'application/json', + 'Referer': url + } + headers.update(self.geo_verification_headers()) + entries = [] RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index 503d2e6a6..38c8bbc80 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -27,7 +27,7 @@ class CBSBaseIE(ThePlatformFeedIE): class CBSIE(CBSBaseIE): - _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs\.com|paramountplus\.com)/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' + _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:(?:cbs|paramountplus)\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P[\w-]+)' _TESTS = [{ 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', @@ -53,7 +53,7 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }, { - 'url': 'https://www.paramountplus.com/shows/star-trek-discovery/video/l5ANMH9wM7kxwV1qr4u1xn88XOhYMlZX/star-trek-discovery-the-vulcan-hello/', + 'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/', 'only_matching': True, }] diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index b74a5dc01..65fcfcbf5 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -231,8 +231,9 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. + release_timestamp: UNIX timestamp of the moment the video was released. release_date: The date (YYYYMMDD) when the video was released. - timestamp: UNIX timestamp of the moment the video became available. + timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. diff --git a/yt_dlp/extractor/fujitv.py b/yt_dlp/extractor/fujitv.py index 39685e075..a02a94374 100644 --- a/yt_dlp/extractor/fujitv.py +++ b/yt_dlp/extractor/fujitv.py @@ -17,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) formats = self._extract_m3u8_formats( - self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id) + self._BASE_URL + 'abr/pc_html5/%s.m3u8' % video_id, video_id, 'mp4') for f in formats: wh = self._BITRATE_MAP.get(f.get('tbr')) if wh: diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py index 051d94873..865cda761 100644 --- a/yt_dlp/extractor/lbry.py +++ b/yt_dlp/extractor/lbry.py @@ -6,8 +6,10 @@ import json from .common import InfoExtractor from ..compat import ( + compat_parse_qs, compat_str, compat_urllib_parse_unquote, + compat_urllib_parse_urlparse, ) from ..utils import ( determine_ext, @@ -62,6 +64,7 @@ class LBRYBaseIE(InfoExtractor): 'description': stream_value.get('description'), 'license': stream_value.get('license'), 'timestamp': int_or_none(stream.get('timestamp')), + 'release_timestamp': int_or_none(stream_value.get('release_time')), 'tags': stream_value.get('tags'), 'duration': int_or_none(media.get('duration')), 'channel': try_get(signing_channel, lambda x: x['value']['title']), @@ -94,6 +97,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', 'timestamp': 1595694354, 'upload_date': '20200725', + 'release_timestamp': 1595340697, + 'release_date': '20200721', 'width': 1280, 'height': 720, } @@ -108,6 +113,8 @@ class LBRYIE(LBRYBaseIE): 'description': 'md5:661ac4f1db09f31728931d7b88807a61', 'timestamp': 1591312601, 'upload_date': '20200604', + 'release_timestamp': 1591312421, + 'release_date': '20200604', 'tags': list, 'duration': 2570, 'channel': 'The LBRY Foundation', @@ -189,17 +196,18 @@ class LBRYChannelIE(LBRYBaseIE): }] _PAGE_SIZE = 50 - def _fetch_page(self, claim_id, url, page): + def _fetch_page(self, claim_id, url, params, page): page += 1 + page_params = { + 'channel_ids': [claim_id], + 'claim_type': 'stream', + 'no_totals': True, + 'page': page, + 'page_size': self._PAGE_SIZE, + } + page_params.update(params) result = self._call_api_proxy( - 'claim_search', claim_id, { - 'channel_ids': [claim_id], - 'claim_type': 'stream', - 'no_totals': True, - 'page': page, - 'page_size': self._PAGE_SIZE, - 'stream_types': self._SUPPORTED_STREAM_TYPES, - }, 'page %d' % page) + 'claim_search', claim_id, page_params, 'page %d' % page) for item in (result.get('items') or []): stream_claim_name = item.get('name') stream_claim_id = item.get('claim_id') @@ -220,8 +228,31 @@ class LBRYChannelIE(LBRYBaseIE): result = self._resolve_url( 'lbry://' + display_id, display_id, 'channel') claim_id = result['claim_id'] + qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + content = qs.get('content', [None])[0] + params = { + 'fee_amount': qs.get('fee_amount', ['>=0'])[0], + 'order_by': { + 'new': ['release_time'], + 'top': ['effective_amount'], + 'trending': ['trending_group', 'trending_mixed'], + }[qs.get('order', ['new'])[0]], + 'stream_types': [content] if content in ['audio', 'video'] else self._SUPPORTED_STREAM_TYPES, + } + duration = qs.get('duration', [None])[0] + if duration: + params['duration'] = { + 'long': '>=1200', + 'short': '<=240', + }[duration] + language = qs.get('language', ['all'])[0] + if language != 'all': + languages = [language] + if language == 'en': + languages.append('none') + params['any_languages'] = languages entries = OnDemandPagedList( - functools.partial(self._fetch_page, claim_id, url), + functools.partial(self._fetch_page, claim_id, url, params), self._PAGE_SIZE) result_value = result.get('value') or {} return self.playlist_result( diff --git a/yt_dlp/extractor/peertube.py b/yt_dlp/extractor/peertube.py index 32ff51653..d9b13adc2 100644 --- a/yt_dlp/extractor/peertube.py +++ b/yt_dlp/extractor/peertube.py @@ -599,11 +599,13 @@ class PeerTubeIE(InfoExtractor): else: age_limit = None + webpage_url = 'https://%s/videos/watch/%s' % (host, video_id) + return { 'id': video_id, 'title': title, 'description': description, - 'thumbnail': urljoin(url, video.get('thumbnailPath')), + 'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')), 'timestamp': unified_timestamp(video.get('publishedAt')), 'uploader': account_data('displayName', compat_str), 'uploader_id': str_or_none(account_data('id', int)), @@ -621,5 +623,6 @@ class PeerTubeIE(InfoExtractor): 'tags': try_get(video, lambda x: x['tags'], list), 'categories': categories, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, + 'webpage_url': webpage_url, } diff --git a/yt_dlp/extractor/pinterest.py b/yt_dlp/extractor/pinterest.py index 15c11a755..09aeea340 100644 --- a/yt_dlp/extractor/pinterest.py +++ b/yt_dlp/extractor/pinterest.py @@ -31,6 +31,7 @@ class PinterestBaseIE(InfoExtractor): title = (data.get('title') or data.get('grid_title') or video_id).strip() + urls = [] formats = [] duration = None if extract_formats: @@ -38,8 +39,9 @@ class PinterestBaseIE(InfoExtractor): if not isinstance(format_dict, dict): continue format_url = url_or_none(format_dict.get('url')) - if not format_url: + if not format_url or format_url in urls: continue + urls.append(format_url) duration = float_or_none(format_dict.get('duration'), scale=1000) ext = determine_ext(format_url) if 'hls' in format_id.lower() or ext == 'm3u8': diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index b7631e4e1..2a7818e41 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -167,6 +167,7 @@ class PornHubIE(PornHubBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'Video has been flagged for verification in accordance with our trust and safety policy', }, { # subtitles 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7', @@ -265,7 +266,8 @@ class PornHubIE(PornHubBaseIE): webpage = dl_webpage('pc') error_msg = self._html_search_regex( - r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + (r'(?s)]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P.+?)', + r'(?s)]+class=["\']noVideo["\'][^>]*>(?P.+?)'), webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) @@ -394,6 +396,21 @@ class PornHubIE(PornHubBaseIE): upload_date = None formats = [] + + def add_format(format_url, height=None): + tbr = None + mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', format_url) + if mobj: + if not height: + height = int(mobj.group('height')) + tbr = int(mobj.group('tbr')) + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + for video_url, height in video_urls: if not upload_date: upload_date = self._search_regex( @@ -410,18 +427,19 @@ class PornHubIE(PornHubBaseIE): video_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) continue - tbr = None - mobj = re.search(r'(?P\d+)[pP]?_(?P\d+)[kK]', video_url) - if mobj: - if not height: - height = int(mobj.group('height')) - tbr = int(mobj.group('tbr')) - formats.append({ - 'url': video_url, - 'format_id': '%dp' % height if height else None, - 'height': height, - 'tbr': tbr, - }) + if '/video/get_media' in video_url: + medias = self._download_json(video_url, video_id, fatal=False) + if isinstance(medias, list): + for media in medias: + if not isinstance(media, dict): + continue + video_url = url_or_none(media.get('videoUrl')) + if not video_url: + continue + height = int_or_none(media.get('quality')) + add_format(video_url, height) + continue + add_format(video_url) self._sort_formats(formats) video_uploader = self._html_search_regex( diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py index ce9db0629..d2fb754cf 100644 --- a/yt_dlp/extractor/rtve.py +++ b/yt_dlp/extractor/rtve.py @@ -2,8 +2,9 @@ from __future__ import unicode_literals import base64 +import io import re -import time +import sys from .common import InfoExtractor from ..compat import ( @@ -14,56 +15,13 @@ from ..utils import ( determine_ext, ExtractorError, float_or_none, + qualities, remove_end, remove_start, - sanitized_Request, std_headers, ) - -def _decrypt_url(png): - encrypted_data = compat_b64decode(png) - text_index = encrypted_data.find(b'tEXt') - text_chunk = encrypted_data[text_index - 4:] - length = compat_struct_unpack('!I', text_chunk[:4])[0] - # Use bytearray to get integers when iterating in both python 2.x and 3.x - data = bytearray(text_chunk[8:8 + length]) - data = [chr(b) for b in data if b != 0] - hash_index = data.index('#') - alphabet_data = data[:hash_index] - url_data = data[hash_index + 1:] - if url_data[0] == 'H' and url_data[3] == '%': - # remove useless HQ%% at the start - url_data = url_data[4:] - - alphabet = [] - e = 0 - d = 0 - for l in alphabet_data: - if d == 0: - alphabet.append(l) - d = e = (e + 1) % 4 - else: - d -= 1 - url = '' - f = 0 - e = 3 - b = 1 - for letter in url_data: - if f == 0: - l = int(letter) * 10 - f = 1 - else: - if e == 0: - l += int(letter) - url += alphabet[l] - e = (b + 3) % 4 - f = 0 - b += 1 - else: - e -= 1 - - return url +_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) class RTVEALaCartaIE(InfoExtractor): @@ -79,28 +37,31 @@ class RTVEALaCartaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Balonmano - Swiss Cup masculina. Final: España-Suecia', 'duration': 5024.566, + 'series': 'Balonmano', }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'note': 'Live stream', 'url': 'http://www.rtve.es/alacarta/videos/television/24h-live/1694255/', 'info_dict': { 'id': '1694255', - 'ext': 'flv', - 'title': 'TODO', + 'ext': 'mp4', + 'title': 're:^24H LIVE [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': 'live stream', }, - 'skip': 'The f4m manifest can\'t be used yet', }, { 'url': 'http://www.rtve.es/alacarta/videos/servir-y-proteger/servir-proteger-capitulo-104/4236788/', - 'md5': 'e55e162379ad587e9640eda4f7353c0f', + 'md5': 'd850f3c8731ea53952ebab489cf81cbf', 'info_dict': { 'id': '4236788', 'ext': 'mp4', - 'title': 'Servir y proteger - Capítulo 104 ', + 'title': 'Servir y proteger - Capítulo 104', 'duration': 3222.0, }, - 'params': { - 'skip_download': True, # requires ffmpeg - }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }, { 'url': 'http://www.rtve.es/m/alacarta/videos/cuentame-como-paso/cuentame-como-paso-t16-ultimo-minuto-nuestra-vida-capitulo-276/2969138/?media=tve', 'only_matching': True, @@ -111,58 +72,102 @@ class RTVEALaCartaIE(InfoExtractor): def _real_initialize(self): user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8') - manager_info = self._download_json( + self._manager = self._download_json( 'http://www.rtve.es/odin/loki/' + user_agent_b64, - None, 'Fetching manager info') - self._manager = manager_info['manager'] + None, 'Fetching manager info')['manager'] + + @staticmethod + def _decrypt_url(png): + encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) + while True: + length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + chunk_type = encrypted_data.read(4) + if chunk_type == b'IEND': + break + data = encrypted_data.read(length) + if chunk_type == b'tEXt': + alphabet_data, text = data.split(b'\0') + quality, url_data = text.split(b'%%') + alphabet = [] + e = 0 + d = 0 + for l in _bytes_to_chr(alphabet_data): + if d == 0: + alphabet.append(l) + d = e = (e + 1) % 4 + else: + d -= 1 + url = '' + f = 0 + e = 3 + b = 1 + for letter in _bytes_to_chr(url_data): + if f == 0: + l = int(letter) * 10 + f = 1 + else: + if e == 0: + l += int(letter) + url += alphabet[l] + e = (b + 3) % 4 + f = 0 + b += 1 + else: + e -= 1 + + yield quality.decode(), url + encrypted_data.read(4) # CRC + + def _extract_png_formats(self, video_id): + png = self._download_webpage( + 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id), + video_id, 'Downloading url information', query={'q': 'v2'}) + q = qualities(['Media', 'Alta', 'HQ', 'HD_READY', 'HD_FULL']) + formats = [] + for quality, video_url in self._decrypt_url(png): + ext = determine_ext(video_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, 'dash', fatal=False)) + else: + formats.append({ + 'format_id': quality, + 'quality': q(quality), + 'url': video_url, + }) + self._sort_formats(formats) + return formats def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) info = self._download_json( 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, video_id)['page']['items'][0] if info['state'] == 'DESPU': raise ExtractorError('The video is no longer available', expected=True) - title = info['title'] - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/%s/videos/%s.png' % (self._manager, video_id) - png_request = sanitized_Request(png_url) - png_request.add_header('Referer', url) - png = self._download_webpage(png_request, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - ext = determine_ext(video_url) - - formats = [] - if not video_url.endswith('.f4m') and ext != 'm3u8': - if '?' not in video_url: - video_url = video_url.replace('resources/', 'auth/resources/') - video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') - - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id='hds', fatal=False)) - else: - formats.append({ - 'url': video_url, - }) - self._sort_formats(formats) + title = info['title'].strip() + formats = self._extract_png_formats(video_id) subtitles = None - if info.get('sbtFile') is not None: - subtitles = self.extract_subtitles(video_id, info['sbtFile']) + sbt_file = info.get('sbtFile') + if sbt_file: + subtitles = self.extract_subtitles(video_id, sbt_file) + + is_live = info.get('live') is True return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'thumbnail': info.get('image'), - 'page_url': url, 'subtitles': subtitles, - 'duration': float_or_none(info.get('duration'), scale=1000), + 'duration': float_or_none(info.get('duration'), 1000), + 'is_live': is_live, + 'series': info.get('programTitle'), } def _get_subtitles(self, video_id, sub_file): @@ -174,48 +179,26 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEInfantilIE(InfoExtractor): +class RTVEInfantilIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' - _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/(?P[^/]*)/video/(?P[^/]*)/(?P[0-9]+)/' + _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P[0-9]+)/' _TESTS = [{ 'url': 'http://www.rtve.es/infantil/serie/cleo/video/maneras-vivir/3040283/', - 'md5': '915319587b33720b8e0357caaa6617e6', + 'md5': '5747454717aedf9f9fdf212d1bcfc48d', 'info_dict': { 'id': '3040283', 'ext': 'mp4', 'title': 'Maneras de vivir', - 'thumbnail': 'http://www.rtve.es/resources/jpg/6/5/1426182947956.JPG', + 'thumbnail': r're:https?://.+/1426182947956\.JPG', 'duration': 357.958, }, + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to download m3u8 information'], }] - def _real_extract(self, url): - video_id = self._match_id(url) - info = self._download_json( - 'http://www.rtve.es/api/videos/%s/config/alacarta_videos.json' % video_id, - video_id)['page']['items'][0] - webpage = self._download_webpage(url, video_id) - vidplayer_id = self._search_regex( - r' id="vidplayer([0-9]+)"', webpage, 'internal video ID') - - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/default/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - video_url = _decrypt_url(png) - - return { - 'id': video_id, - 'ext': 'mp4', - 'title': info['title'], - 'url': video_url, - 'thumbnail': info.get('image'), - 'duration': float_or_none(info.get('duration'), scale=1000), - } - - -class RTVELiveIE(InfoExtractor): +class RTVELiveIE(RTVEALaCartaIE): IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P[a-zA-Z0-9-]+)' @@ -225,7 +208,7 @@ class RTVELiveIE(InfoExtractor): 'info_dict': { 'id': 'la-1', 'ext': 'mp4', - 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2}Z[0-9]{6}$', + 'title': 're:^La 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', }, 'params': { 'skip_download': 'live stream', @@ -234,29 +217,22 @@ class RTVELiveIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - start_time = time.gmtime() video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) title = remove_end(self._og_search_title(webpage), ' en directo en RTVE.es') title = remove_start(title, 'Estoy viendo ') - title += ' ' + time.strftime('%Y-%m-%dZ%H%M%S', start_time) vidplayer_id = self._search_regex( (r'playerId=player([0-9]+)', r'class=["\'].*?\blive_mod\b.*?["\'][^>]+data-assetid=["\'](\d+)', r'data-id=["\'](\d+)'), webpage, 'internal video ID') - png_url = 'http://www.rtve.es/ztnr/movil/thumbnail/amonet/videos/%s.png' % vidplayer_id - png = self._download_webpage(png_url, video_id, 'Downloading url information') - m3u8_url = _decrypt_url(png) - formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': title, - 'formats': formats, + 'title': self._live_title(title), + 'formats': self._extract_png_formats(vidplayer_id), 'is_live': True, } diff --git a/yt_dlp/extractor/shahid.py b/yt_dlp/extractor/shahid.py index c1d6aba2c..5768199bc 100644 --- a/yt_dlp/extractor/shahid.py +++ b/yt_dlp/extractor/shahid.py @@ -51,13 +51,16 @@ class ShahidIE(ShahidBaseIE): _NETRC_MACHINE = 'shahid' _VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?Pepisode|clip|movie)-(?P\d+)' _TESTS = [{ - 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AC%D9%84%D8%B3-%D8%A7%D9%84%D8%B4%D8%A8%D8%A7%D8%A8-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-275286', + 'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924', 'info_dict': { - 'id': '275286', + 'id': '816924', 'ext': 'mp4', - 'title': 'مجلس الشباب الموسم 1 كليب 1', - 'timestamp': 1506988800, - 'upload_date': '20171003', + 'title': 'متحف الدحيح الموسم 1 كليب 1', + 'timestamp': 1602806400, + 'upload_date': '20201016', + 'description': 'برومو', + 'duration': 22, + 'categories': ['كوميديا'], }, 'params': { # m3u8 download @@ -109,12 +112,15 @@ class ShahidIE(ShahidBaseIE): page_type = 'episode' playout = self._call_api( - 'playout/url/' + video_id, video_id)['playout'] + 'playout/new/url/' + video_id, video_id)['playout'] if not self._downloader.params.get('allow_unplayable_formats') and playout.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) - formats = self._extract_m3u8_formats(playout['url'], video_id, 'mp4') + formats = self._extract_m3u8_formats(re.sub( + # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html + r'aws\.manifestfilter=[\w:;,-]+&?', + '', playout['url']), video_id, 'mp4') self._sort_formats(formats) # video = self._call_api( diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index 95e6d2890..9aedaa04a 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -6,9 +6,9 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?Psouthpark\.cc\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?Psouthpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P.+?)(\?|#|$))' - _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss' + _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', @@ -23,8 +23,20 @@ class SouthParkIE(MTVServicesInfoExtractor): }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', 'only_matching': True, + }, { + 'url': 'https://www.southparkstudios.com/episodes/h4o269/south-park-stunning-and-brave-season-19-ep-1', + 'only_matching': True, }] + def _get_feed_query(self, uri): + return { + 'accountOverride': 'intl.mtvi.com', + 'arcEp': 'shared.southpark.global', + 'ep': '90877963', + 'imageEp': 'shared.southpark.global', + 'mgid': uri, + } + class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' diff --git a/yt_dlp/extractor/sportdeutschland.py b/yt_dlp/extractor/sportdeutschland.py index 378fc7568..3e497a939 100644 --- a/yt_dlp/extractor/sportdeutschland.py +++ b/yt_dlp/extractor/sportdeutschland.py @@ -1,82 +1,105 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urllib_parse_urlparse, +) from ..utils import ( + clean_html, + float_or_none, + int_or_none, parse_iso8601, - sanitized_Request, + strip_or_none, + try_get, ) class SportDeutschlandIE(InfoExtractor): - _VALID_URL = r'https?://sportdeutschland\.tv/(?P[^/?#]+)/(?P[^?#/]+)(?:$|[?#])' + _VALID_URL = r'https?://sportdeutschland\.tv/(?P(?:[^/]+/)?[^?#/&]+)' _TESTS = [{ 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', 'info_dict': { - 'id': 're-live-deutsche-meisterschaften-2020-halbfinals', + 'id': '5318cac0275701382770543d7edaf0a0', 'ext': 'mp4', - 'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals', - 'categories': ['Badminton-Deutschland'], - 'view_count': int, - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - 'timestamp': int, - 'upload_date': '20200201', - 'description': 're:.*', # meaningless description for THIS video + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals - Teil 1', + 'duration': 16106.36, }, + 'params': { + 'noplaylist': True, + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0', + 'info_dict': { + 'id': 'c6e2fdd01f63013854c47054d2ab776f', + 'title': 'Re-live: Deutsche Meisterschaften 2020 - Halbfinals', + 'description': 'md5:5263ff4c31c04bb780c9f91130b48530', + 'duration': 31397, + }, + 'playlist_count': 2, + }, { + 'url': 'https://sportdeutschland.tv/freeride-world-tour-2021-fieberbrunn-oesterreich', + 'only_matching': True, }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sport_id = mobj.group('sport') - - api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % ( - sport_id, video_id) - req = sanitized_Request(api_url, headers={ - 'Accept': 'application/vnd.vidibus.v2.html+json', - 'Referer': url, - }) - data = self._download_json(req, video_id) - + display_id = self._match_id(url) + data = self._download_json( + 'https://backend.sportdeutschland.tv/api/permalinks/' + display_id, + display_id, query={'access_token': 'true'}) asset = data['asset'] - categories = [data['section']['title']] - - formats = [] - smil_url = asset['video'] - if '.smil' in smil_url: - m3u8_url = smil_url.replace('.smil', '.m3u8') - formats.extend( - self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4')) - - smil_doc = self._download_xml( - smil_url, video_id, note='Downloading SMIL metadata') - base_url_el = smil_doc.find('./head/meta') - if base_url_el: - base_url = base_url_el.attrib['base'] - formats.extend([{ - 'format_id': 'rmtp', - 'url': base_url if base_url_el else n.attrib['src'], - 'play_path': n.attrib['src'], - 'ext': 'flv', - 'preference': -100, - 'format_note': 'Seems to fail at example stream', - } for n in smil_doc.findall('./body/video')]) - else: - formats.append({'url': smil_url}) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': asset['title'], - 'thumbnail': asset.get('image'), - 'description': asset.get('teaser'), - 'duration': asset.get('duration'), - 'categories': categories, - 'view_count': asset.get('views'), - 'rtmp_live': asset.get('live'), - 'timestamp': parse_iso8601(asset.get('date')), + title = (asset.get('title') or asset['label']).strip() + asset_id = asset.get('id') or asset.get('uuid') + info = { + 'id': asset_id, + 'title': title, + 'description': clean_html(asset.get('body') or asset.get('description')) or asset.get('teaser'), + 'duration': int_or_none(asset.get('seconds')), } + videos = asset.get('videos') or [] + if len(videos) > 1: + playlist_id = compat_parse_qs(compat_urllib_parse_urlparse(url).query).get('playlistId', [None])[0] + if playlist_id: + if self._downloader.params.get('noplaylist'): + videos = [videos[int(playlist_id)]] + self.to_screen('Downloading just a single video because of --no-playlist') + else: + self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % asset_id) + + def entries(): + for i, video in enumerate(videos, 1): + video_id = video.get('uuid') + video_url = video.get('url') + if not (video_id and video_url): + continue + formats = self._extract_m3u8_formats( + video_url.replace('.smil', '.m3u8'), video_id, 'mp4', fatal=False) + if not formats: + continue + yield { + 'id': video_id, + 'formats': formats, + 'title': title + ' - ' + (video.get('label') or 'Teil %d' % i), + 'duration': float_or_none(video.get('duration')), + } + info.update({ + '_type': 'multi_video', + 'entries': entries(), + }) + else: + formats = self._extract_m3u8_formats( + videos[0]['url'].replace('.smil', '.m3u8'), asset_id, 'mp4') + section_title = strip_or_none(try_get(data, lambda x: x['section']['title'])) + info.update({ + 'formats': formats, + 'display_id': asset.get('permalink'), + 'thumbnail': try_get(asset, lambda x: x['images'][0]), + 'categories': [section_title] if section_title else None, + 'view_count': int_or_none(asset.get('views')), + 'is_live': asset.get('is_live') is True, + 'timestamp': parse_iso8601(asset.get('date') or asset.get('published_at')), + }) + return info diff --git a/yt_dlp/extractor/tver.py b/yt_dlp/extractor/tver.py index 931d4d650..a54f49319 100644 --- a/yt_dlp/extractor/tver.py +++ b/yt_dlp/extractor/tver.py @@ -9,6 +9,7 @@ from ..utils import ( int_or_none, remove_start, smuggle_url, + strip_or_none, try_get, ) @@ -25,6 +26,10 @@ class TVerIE(InfoExtractor): }, { 'url': 'https://tver.jp/episode/79622438', 'only_matching': True, + }, { + # subtitle = ' ' + 'url': 'https://tver.jp/corner/f0068870', + 'only_matching': True, }] _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' @@ -47,8 +52,12 @@ class TVerIE(InfoExtractor): } if service == 'cx': + title = main['title'] + subtitle = strip_or_none(main.get('subtitle')) + if subtitle: + title += ' - ' + subtitle info.update({ - 'title': main.get('subtitle') or main['title'], + 'title': title, 'url': 'https://i.fod.fujitv.co.jp/plus7/web/%s/%s.html' % (p_id[:4], p_id), 'ie_key': 'FujiTVFODPlus7', }) diff --git a/yt_dlp/extractor/voxmedia.py b/yt_dlp/extractor/voxmedia.py index b318e15d4..661208125 100644 --- a/yt_dlp/extractor/voxmedia.py +++ b/yt_dlp/extractor/voxmedia.py @@ -7,6 +7,8 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + try_get, + unified_timestamp, ) @@ -19,14 +21,17 @@ class VoxMediaVolumeIE(OnceIE): setup = self._parse_json(self._search_regex( r'setup\s*=\s*({.+});', webpage, 'setup'), video_id) - video_data = setup.get('video') or {} + player_setup = setup.get('player_setup') or setup + video_data = player_setup.get('video') or {} + formatted_metadata = video_data.get('formatted_metadata') or {} info = { 'id': video_id, - 'title': video_data.get('title_short'), + 'title': player_setup.get('title') or video_data.get('title_short'), 'description': video_data.get('description_long') or video_data.get('description_short'), - 'thumbnail': video_data.get('brightcove_thumbnail') + 'thumbnail': formatted_metadata.get('thumbnail') or video_data.get('brightcove_thumbnail'), + 'timestamp': unified_timestamp(formatted_metadata.get('video_publish_date')), } - asset = setup.get('asset') or setup.get('params') or {} + asset = try_get(setup, lambda x: x['embed_assets']['chorus'], dict) or {} formats = [] hls_url = asset.get('hls_url') @@ -47,6 +52,7 @@ class VoxMediaVolumeIE(OnceIE): if formats: self._sort_formats(formats) info['formats'] = formats + info['duration'] = int_or_none(asset.get('duration')) return info for provider_video_type in ('ooyala', 'youtube', 'brightcove'): @@ -84,7 +90,7 @@ class VoxMediaIE(InfoExtractor): }, { # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/10/21/7025853/google-nexus-6-hands-on-photos-video-android-phablet', - 'md5': '4c8f4a0937752b437c3ebc0ed24802b5', + 'md5': 'fd19aa0cf3a0eea515d4fd5c8c0e9d68', 'info_dict': { 'id': 'Gy8Md3Eky38', 'ext': 'mp4', @@ -93,6 +99,7 @@ class VoxMediaIE(InfoExtractor): 'uploader_id': 'TheVerge', 'upload_date': '20141021', 'uploader': 'The Verge', + 'timestamp': 1413907200, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -100,13 +107,13 @@ class VoxMediaIE(InfoExtractor): # Volume embed, Youtube 'url': 'http://www.vox.com/2016/3/31/11336640/mississippi-lgbt-religious-freedom-bill', 'info_dict': { - 'id': 'YCjDnX-Xzhg', + 'id': '22986359b', 'ext': 'mp4', 'title': "Mississippi's laws are so bad that its anti-LGBTQ law isn't needed to allow discrimination", 'description': 'md5:fc1317922057de31cd74bce91eb1c66c', - 'uploader_id': 'voxdotcom', 'upload_date': '20150915', - 'uploader': 'Vox', + 'timestamp': 1442332800, + 'duration': 285, }, 'add_ie': ['Youtube'], 'skip': 'similar to the previous test', @@ -160,6 +167,9 @@ class VoxMediaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Post-Post-PC CEO: The Full Code Conference Video of Microsoft\'s Satya Nadella', 'description': 'The longtime veteran was chosen earlier this year as the software giant\'s third leader in its history.', + 'timestamp': 1402938000, + 'upload_date': '20140616', + 'duration': 4114, }, 'add_ie': ['VoxMediaVolume'], }]