Update to 2021.01.03

This commit is contained in:
pukkandan 2021-01-04 20:41:05 +05:30
parent efe3d655aa
commit 9434b7f182
11 changed files with 639 additions and 419 deletions

View File

@ -57,6 +57,7 @@
- **appletrailers**
- **appletrailers:section**
- **archive.org**: archive.org videos
- **ArcPublishing**
- **ARD**
- **ARD:mediathek**
- **ARDBetaMediathek**
@ -620,6 +621,7 @@
- **Npr**
- **NRK**
- **NRKPlaylist**
- **NRKRadioPodkast**
- **NRKSkole**: NRK Skole
- **NRKTV**: NRK TV and NRK Radio
- **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte
@ -822,11 +824,14 @@
- **Shared**: shared.sx
- **ShowRoomLive**
- **Sina**
- **sky.it**
- **sky:news**
- **sky:sports**
- **sky:sports:news**
- **skyacademy.it**
- **SkylineWebcams**
- **SkyNews**
- **skynewsarabia:article**
- **skynewsarabia:video**
- **SkySports**
- **Slideshare**
- **SlidesLive**
- **Slutload**

View File

@ -2,21 +2,47 @@
from __future__ import unicode_literals
import re
import functools
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
clean_html,
float_or_none,
int_or_none,
try_get,
unified_timestamp,
OnDemandPagedList,
parse_iso8601,
)
class ACastIE(InfoExtractor):
class ACastBaseIE(InfoExtractor):
    """Shared helpers for the acast extractors.

    Provides access to the feeder API and the mapping from API episode/show
    objects to info dicts.
    """

    def _extract_episode(self, episode, show_info):
        """Build the info dict for one episode object.

        ``episode`` must contain ``title``, ``id`` and ``url``; every other
        field is optional. The entries of ``show_info`` are merged in last,
        so they override episode-level keys of the same name.
        """
        episode_title = episode['title']
        entry = {
            'id': episode['id'],
            'display_id': episode.get('episodeUrl'),
            'url': episode['url'],
            'title': episode_title,
        }
        # Prefer the full description and fall back to the summary.
        raw_description = episode.get('description') or episode.get('summary')
        entry['description'] = clean_html(raw_description)
        entry['thumbnail'] = episode.get('image')
        entry['timestamp'] = parse_iso8601(episode.get('publishDate'))
        numeric_fields = (
            ('duration', 'duration'),
            ('filesize', 'contentLength'),
            ('season_number', 'season'),
        )
        for field, source_key in numeric_fields:
            entry[field] = int_or_none(episode.get(source_key))
        # The episode name is the same string as the title.
        entry['episode'] = episode_title
        entry['episode_number'] = int_or_none(episode.get('episode'))
        entry.update(show_info)
        return entry

    def _extract_show_info(self, show):
        """Return the show-level metadata (creator and series) of ``show``."""
        show_meta = {}
        show_meta['creator'] = show.get('author')
        show_meta['series'] = show.get('title')
        return show_meta

    def _call_api(self, path, video_id, query=None):
        """Download and return the JSON found at ``path`` under the feeder shows API."""
        api_url = 'https://feeder.acast.com/api/v1/shows/' + path
        return self._download_json(api_url, video_id, query=query)
class ACastIE(ACastBaseIE):
IE_NAME = 'acast'
_VALID_URL = r'''(?x)
https?://
@ -28,15 +54,15 @@ class ACastIE(InfoExtractor):
'''
_TESTS = [{
'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
'md5': '16d936099ec5ca2d5869e3a813ee8dc4',
'md5': 'f5598f3ad1e4776fed12ec1407153e4b',
'info_dict': {
'id': '2a92b283-1a75-4ad8-8396-499c641de0d9',
'ext': 'mp3',
'title': '2. Raggarmordet - Röster ur det förflutna',
'description': 'md5:4f81f6d8cf2e12ee21a321d8bca32db4',
'description': 'md5:a992ae67f4d98f1c0141598f7bebbf67',
'timestamp': 1477346700,
'upload_date': '20161024',
'duration': 2766.602563,
'duration': 2766,
'creator': 'Anton Berg & Martin Johnson',
'series': 'Spår',
'episode': '2. Raggarmordet - Röster ur det förflutna',
@ -45,7 +71,7 @@ class ACastIE(InfoExtractor):
'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
'only_matching': True,
}, {
'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
'url': 'https://play.acast.com/s/rattegangspodden/s04e09styckmordetihelenelund-del2-2',
'only_matching': True,
}, {
'url': 'https://play.acast.com/s/sparpodcast/2a92b283-1a75-4ad8-8396-499c641de0d9',
@ -54,40 +80,14 @@ class ACastIE(InfoExtractor):
def _real_extract(self, url):
channel, display_id = re.match(self._VALID_URL, url).groups()
s = self._download_json(
'https://feeder.acast.com/api/v1/shows/%s/episodes/%s' % (channel, display_id),
display_id)
media_url = s['url']
if re.search(r'[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}', display_id):
episode_url = s.get('episodeUrl')
if episode_url:
display_id = episode_url
else:
channel, display_id = re.match(self._VALID_URL, s['link']).groups()
cast_data = self._download_json(
'https://play-api.acast.com/splash/%s/%s' % (channel, display_id),
display_id)['result']
e = cast_data['episode']
title = e.get('name') or s['title']
return {
'id': compat_str(e['id']),
'display_id': display_id,
'url': media_url,
'title': title,
'description': e.get('summary') or clean_html(e.get('description') or s.get('description')),
'thumbnail': e.get('image'),
'timestamp': unified_timestamp(e.get('publishingDate') or s.get('publishDate')),
'duration': float_or_none(e.get('duration') or s.get('duration')),
'filesize': int_or_none(e.get('contentLength')),
'creator': try_get(cast_data, lambda x: x['show']['author'], compat_str),
'series': try_get(cast_data, lambda x: x['show']['name'], compat_str),
'season_number': int_or_none(e.get('seasonNumber')),
'episode': title,
'episode_number': int_or_none(e.get('episodeNumber')),
}
episode = self._call_api(
'%s/episodes/%s' % (channel, display_id),
display_id, {'showInfo': 'true'})
return self._extract_episode(
episode, self._extract_show_info(episode.get('show') or {}))
class ACastChannelIE(InfoExtractor):
class ACastChannelIE(ACastBaseIE):
IE_NAME = 'acast:channel'
_VALID_URL = r'''(?x)
https?://
@ -102,34 +102,24 @@ class ACastChannelIE(InfoExtractor):
'info_dict': {
'id': '4efc5294-5385-4847-98bd-519799ce5786',
'title': 'Today in Focus',
'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
'description': 'md5:c09ce28c91002ce4ffce71d6504abaae',
},
'playlist_mincount': 35,
'playlist_mincount': 200,
}, {
'url': 'http://play.acast.com/s/ft-banking-weekly',
'only_matching': True,
}]
_API_BASE_URL = 'https://play.acast.com/api/'
_PAGE_SIZE = 10
@classmethod
def suitable(cls, url):
return False if ACastIE.suitable(url) else super(ACastChannelIE, cls).suitable(url)
def _fetch_page(self, channel_slug, page):
casts = self._download_json(
self._API_BASE_URL + 'channels/%s/acasts?page=%s' % (channel_slug, page),
channel_slug, note='Download page %d of channel data' % page)
for cast in casts:
yield self.url_result(
'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
'ACast', cast['id'])
def _real_extract(self, url):
channel_slug = self._match_id(url)
channel_data = self._download_json(
self._API_BASE_URL + 'channels/%s' % channel_slug, channel_slug)
entries = OnDemandPagedList(functools.partial(
self._fetch_page, channel_slug), self._PAGE_SIZE)
return self.playlist_result(entries, compat_str(
channel_data['id']), channel_data['name'], channel_data.get('description'))
show_slug = self._match_id(url)
show = self._call_api(show_slug, show_slug)
show_info = self._extract_show_info(show)
entries = []
for episode in (show.get('episodes') or []):
entries.append(self._extract_episode(episode, show_info))
return self.playlist_result(
entries, show.get('id'), show.get('title'), show.get('description'))

View File

@ -0,0 +1,174 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
int_or_none,
parse_iso8601,
try_get,
)
class ArcPublishingIE(InfoExtractor):
    """Extractor for videos played through the Arc Publishing PoWa player.

    Handles internal ``arcpublishing:<org>:<uuid>`` URLs. ``_extract_urls``
    builds such URLs from PoWa ``<div>`` embeds found in a web page.
    """

    _UUID_REGEX = r'[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12}'
    _VALID_URL = r'arcpublishing:(?P<org>[a-z]+):(?P<id>%s)' % _UUID_REGEX
    _TESTS = [{
        # https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/
        'url': 'arcpublishing:adn:8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
        'only_matching': True,
    }, {
        # https://www.bostonglobe.com/video/2020/12/30/metro/footage-released-showing-officer-talking-about-striking-protesters-with-car/
        'url': 'arcpublishing:bostonglobe:232b7ae6-7d73-432d-bc0a-85dbf0119ab1',
        'only_matching': True,
    }, {
        # https://www.actionnewsjax.com/video/live-stream/
        'url': 'arcpublishing:cmg:cfb1cf1b-3ab5-4d1b-86c5-a5515d311f2a',
        'only_matching': True,
    }, {
        # https://elcomercio.pe/videos/deportes/deporte-total-futbol-peruano-seleccion-peruana-la-valorizacion-de-los-peruanos-en-el-exterior-tras-un-2020-atipico-nnav-vr-video-noticia/
        'url': 'arcpublishing:elcomercio:27a7e1f8-2ec7-4177-874f-a4feed2885b3',
        'only_matching': True,
    }, {
        # https://www.clickondetroit.com/video/community/2020/05/15/events-surrounding-woodward-dream-cruise-being-canceled/
        'url': 'arcpublishing:gmg:c8793fb2-8d44-4242-881e-2db31da2d9fe',
        'only_matching': True,
    }, {
        # https://www.wabi.tv/video/2020/12/30/trenton-company-making-equipment-pfizer-covid-vaccine/
        'url': 'arcpublishing:gray:0b0ba30e-032a-4598-8810-901d70e6033e',
        'only_matching': True,
    }, {
        # https://www.lateja.cr/el-mundo/video-china-aprueba-con-condiciones-su-primera/dfcbfa57-527f-45ff-a69b-35fe71054143/video/
        'url': 'arcpublishing:gruponacion:dfcbfa57-527f-45ff-a69b-35fe71054143',
        'only_matching': True,
    }, {
        # https://www.fifthdomain.com/video/2018/03/09/is-america-vulnerable-to-a-cyber-attack/
        'url': 'arcpublishing:mco:aa0ca6fe-1127-46d4-b32c-be0d6fdb8055',
        'only_matching': True,
    }, {
        # https://www.vl.no/kultur/2020/12/09/en-melding-fra-en-lytter-endret-julelista-til-lewi-bergrud/
        'url': 'arcpublishing:mentormedier:47a12084-650b-4011-bfd0-3699b6947b2d',
        'only_matching': True,
    }, {
        # https://www.14news.com/2020/12/30/whiskey-theft-caught-camera-henderson-liquor-store/
        'url': 'arcpublishing:raycom:b89f61f8-79fa-4c09-8255-e64237119bf7',
        'only_matching': True,
    }, {
        # https://www.theglobeandmail.com/world/video-ethiopian-woman-who-became-symbol-of-integration-in-italy-killed-on/
        'url': 'arcpublishing:tgam:411b34c1-8701-4036-9831-26964711664b',
        'only_matching': True,
    }, {
        # https://www.pilotonline.com/460f2931-8130-4719-8ea1-ffcb2d7cb685-132.html
        'url': 'arcpublishing:tronc:460f2931-8130-4719-8ea1-ffcb2d7cb685',
        'only_matching': True,
    }]
    # (organisations, API host template) pairs; the template is filled with
    # the organisation name. Organisations not listed here use the fallback
    # template in _real_extract.
    _POWA_DEFAULTS = [
        (['cmg', 'prisa'], '%s-config-prod.api.cdn.arcpublishing.com/video'),
        ([
            'adn', 'advancelocal', 'answers', 'bonnier', 'bostonglobe', 'demo',
            'gmg', 'gruponacion', 'infobae', 'mco', 'nzme', 'pmn', 'raycom',
            'spectator', 'tbt', 'tgam', 'tronc', 'wapo', 'wweek',
        ], 'video-api-cdn.%s.arcpublishing.com/api'),
    ]

    @staticmethod
    def _extract_urls(webpage):
        """Return ``arcpublishing:<org>:<uuid>`` URLs for the PoWa embeds in ``webpage``.

        Only ``<div>`` elements with a ``powa`` class and a UUID-shaped
        ``data-uuid`` are considered; those lacking ``data-org`` are skipped.
        """
        entries = []
        # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview
        for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage):
            powa = extract_attributes(powa_el) or {}
            org = powa.get('data-org')
            uuid = powa.get('data-uuid')
            if org and uuid:
                entries.append('arcpublishing:%s:%s' % (org, uuid))
        return entries

    def _real_extract(self, url):
        """Fetch the video metadata for ``url`` and return its info dict."""
        org, uuid = re.match(self._VALID_URL, url).groups()
        # Pick the API host template for this organisation; the for/else
        # falls back to the generic template when no entry matches.
        for orgs, tmpl in self._POWA_DEFAULTS:
            if org in orgs:
                base_api_tmpl = tmpl
                break
        else:
            base_api_tmpl = '%s-prod-cdn.video-api.arcpublishing.com/api'
        # 'wapo' selects the template above but is renamed for the host.
        if org == 'wapo':
            org = 'washpost'
        video = self._download_json(
            'https://%s/v1/ansvideos/findByUuid' % (base_api_tmpl % org),
            uuid, query={'uuid': uuid})[0]
        title = video['headlines']['basic']
        is_live = video.get('status') == 'live'

        urls = []
        formats = []
        # `or []` (not a .get default) so that a null "streams" value in the
        # JSON is treated like a missing one instead of raising TypeError.
        for s in video.get('streams') or []:
            s_url = s.get('url')
            # Skip streams without a URL and URLs already processed.
            if not s_url or s_url in urls:
                continue
            urls.append(s_url)
            stream_type = s.get('stream_type')
            if stream_type == 'smil':
                smil_formats = self._extract_smil_formats(
                    s_url, uuid, fatal=False)
                for f in smil_formats:
                    if f['url'].endswith('/cfx/st'):
                        f['app'] = 'cfx/st'
                        if not f['play_path'].startswith('mp4:'):
                            f['play_path'] = 'mp4:' + f['play_path']
                        if isinstance(f['tbr'], float):
                            # Replace the float tbr with a vbr scaled by 1000
                            # and name the format after it.
                            f['vbr'] = f['tbr'] * 1000
                            del f['tbr']
                            f['format_id'] = 'rtmp-%d' % f['vbr']
                formats.extend(smil_formats)
            elif stream_type in ('ts', 'hls'):
                m3u8_formats = self._extract_m3u8_formats(
                    s_url, uuid, 'mp4', 'm3u8' if is_live else 'm3u8_native',
                    m3u8_id='hls', fatal=False)
                # Drop the manifest when none of its formats carries audio
                # (this is also true for an empty list).
                if all(f.get('acodec') == 'none' for f in m3u8_formats):
                    continue
                for f in m3u8_formats:
                    # Rank formats without audio, then formats without video,
                    # below everything else.
                    if f.get('acodec') == 'none':
                        f['preference'] = -40
                    elif f.get('vcodec') == 'none':
                        f['preference'] = -50
                    height = f.get('height')
                    if not height:
                        continue
                    # The number following the height in the URL is taken as
                    # the video bitrate.
                    vbr = self._search_regex(
                        r'[_x]%d[_-](\d+)' % height, f['url'], 'vbr', default=None)
                    if vbr:
                        f['vbr'] = int(vbr)
                formats.extend(m3u8_formats)
            else:
                # Any other stream type is used as a direct format.
                vbr = int_or_none(s.get('bitrate'))
                formats.append({
                    'format_id': '%s-%d' % (stream_type, vbr) if vbr else stream_type,
                    'vbr': vbr,
                    'width': int_or_none(s.get('width')),
                    'height': int_or_none(s.get('height')),
                    'filesize': int_or_none(s.get('filesize')),
                    'url': s_url,
                    'preference': -1,
                })
        self._sort_formats(
            formats, ('preference', 'width', 'height', 'vbr', 'filesize', 'tbr', 'ext', 'format_id'))

        subtitles = {}
        # Every subtitle URL is filed under 'en'.
        for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []):
            subtitle_url = subtitle.get('url')
            if subtitle_url:
                subtitles.setdefault('en', []).append({'url': subtitle_url})

        return {
            'id': uuid,
            'title': self._live_title(title) if is_live else title,
            'thumbnail': try_get(video, lambda x: x['promo_image']['url']),
            'description': try_get(video, lambda x: x['subheadlines']['basic']),
            'formats': formats,
            'duration': int_or_none(video.get('duration'), 100),
            'timestamp': parse_iso8601(video.get('created_date')),
            'subtitles': subtitles,
            'is_live': is_live,
        }

View File

@ -60,6 +60,7 @@ from .appletrailers import (
AppleTrailersSectionIE,
)
from .archiveorg import ArchiveOrgIE
from .arcpublishing import ArcPublishingIE
from .arkena import ArkenaIE
from .ard import (
ARDBetaMediathekIE,
@ -816,6 +817,7 @@ from .nrk import (
NRKSkoleIE,
NRKTVIE,
NRKTVDirekteIE,
NRKRadioPodkastIE,
NRKTVEpisodeIE,
NRKTVEpisodesIE,
NRKTVSeasonIE,
@ -1087,6 +1089,7 @@ from .skynewsarabia import (
from .sky import (
SkyNewsIE,
SkySportsIE,
SkySportsNewsIE,
)
from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE

View File

@ -127,6 +127,7 @@ from .kinja import KinjaEmbedIE
from .gedi import GediEmbedsIE
from .rcs import RCSEmbedsIE
from .bitchute import BitChuteIE
from .arcpublishing import ArcPublishingIE
class GenericIE(InfoExtractor):
@ -2206,6 +2207,20 @@ class GenericIE(InfoExtractor):
'uploader': 'OTT Videos',
},
},
{
# ArcPublishing PoWa video player
'url': 'https://www.adn.com/politics/2020/11/02/video-senate-candidates-campaign-in-anchorage-on-eve-of-election-day/',
'md5': 'b03b2fac8680e1e5a7cc81a5c27e71b3',
'info_dict': {
'id': '8c99cb6e-b29c-4bc9-9173-7bf9979225ab',
'ext': 'mp4',
'title': 'Senate candidates wave to voters on Anchorage streets',
'description': 'md5:91f51a6511f090617353dc720318b20e',
'timestamp': 1604378735,
'upload_date': '20201103',
'duration': 1581,
},
},
]
def report_following_redirect(self, new_url):
@ -2572,6 +2587,10 @@ class GenericIE(InfoExtractor):
if tp_urls:
return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform')
arc_urls = ArcPublishingIE._extract_urls(webpage)
if arc_urls:
return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key())
# Look for embedded rtl.nl player
matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',

View File

@ -6,16 +6,13 @@ import random
import re
from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse_unquote,
)
from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
parse_age_limit,
parse_duration,
str_or_none,
try_get,
urljoin,
url_or_none,
@ -63,7 +60,8 @@ class NRKBaseIE(InfoExtractor):
return self._download_json(
urljoin('http://psapi.nrk.no/', path),
video_id, note or 'Downloading %s JSON' % item,
fatal=fatal, query=query)
fatal=fatal, query=query,
headers={'Accept-Encoding': 'gzip, deflate, br'})
class NRKIE(NRKBaseIE):
@ -116,9 +114,39 @@ class NRKIE(NRKBaseIE):
}, {
'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999',
'only_matching': True,
}, {
# podcast
'url': 'nrk:l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
'only_matching': True,
}, {
'url': 'nrk:podcast/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
'only_matching': True,
}, {
# clip
'url': 'nrk:150533',
'only_matching': True,
}, {
'url': 'nrk:clip/150533',
'only_matching': True,
}, {
# program
'url': 'nrk:MDDP12000117',
'only_matching': True,
}, {
'url': 'nrk:program/ENRK10100318',
'only_matching': True,
}, {
# direkte
'url': 'nrk:nrk1',
'only_matching': True,
}, {
'url': 'nrk:channel/nrk1',
'only_matching': True,
}]
def _extract_from_playback(self, video_id):
def _real_extract(self, url):
video_id = self._match_id(url).split('/')[-1]
path_templ = 'playback/%s/' + video_id
def call_playback_api(item, query=None):
@ -126,6 +154,8 @@ class NRKIE(NRKBaseIE):
# known values for preferredCdn: akamai, iponly, minicdn and telenor
manifest = call_playback_api('manifest', {'preferredCdn': 'akamai'})
video_id = try_get(manifest, lambda x: x['id'], compat_str) or video_id
if manifest.get('playability') == 'nonPlayable':
self._raise_error(manifest['nonPlayable'])
@ -140,8 +170,15 @@ class NRKIE(NRKBaseIE):
format_url = url_or_none(asset.get('url'))
if not format_url:
continue
if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8':
asset_format = (asset.get('format') or '').lower()
if asset_format == 'hls' or determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_nrk_formats(format_url, video_id))
elif asset_format == 'mp3':
formats.append({
'url': format_url,
'format_id': asset_format,
'vcodec': 'none',
})
self._sort_formats(formats)
data = call_playback_api('metadata')
@ -168,31 +205,94 @@ class NRKIE(NRKBaseIE):
'height': int_or_none(image.get('pixelHeight')),
})
return {
subtitles = {}
for sub in try_get(playable, lambda x: x['subtitles'], list) or []:
if not isinstance(sub, dict):
continue
sub_url = url_or_none(sub.get('webVtt'))
if not sub_url:
continue
sub_key = str_or_none(sub.get('language')) or 'nb'
sub_type = str_or_none(sub.get('type'))
if sub_type:
sub_key += '-%s' % sub_type
subtitles.setdefault(sub_key, []).append({
'url': sub_url,
})
legal_age = try_get(
data, lambda x: x['legalAge']['body']['rating']['code'], compat_str)
# https://en.wikipedia.org/wiki/Norwegian_Media_Authority
if legal_age == 'A':
age_limit = 0
elif legal_age.isdigit():
age_limit = int_or_none(legal_age)
else:
age_limit = None
is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series'
info = {
'id': video_id,
'title': title,
'alt_title': alt_title,
'description': description,
'duration': duration,
'thumbnails': thumbnails,
'age_limit': age_limit,
'formats': formats,
'subtitles': subtitles,
}
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_from_playback(video_id)
if is_series:
series = season_id = season_number = episode = episode_number = None
programs = self._call_api(
'programs/%s' % video_id, video_id, 'programs', fatal=False)
if programs and isinstance(programs, dict):
series = str_or_none(programs.get('seriesTitle'))
season_id = str_or_none(programs.get('seasonId'))
season_number = int_or_none(programs.get('seasonNumber'))
episode = str_or_none(programs.get('episodeTitle'))
episode_number = int_or_none(programs.get('episodeNumber'))
if not series:
series = title
if alt_title:
title += ' - %s' % alt_title
if not season_number:
season_number = int_or_none(self._search_regex(
r'Sesong\s+(\d+)', description or '', 'season number',
default=None))
if not episode:
episode = alt_title if is_series else None
if not episode_number:
episode_number = int_or_none(self._search_regex(
r'^(\d+)\.', episode or '', 'episode number',
default=None))
if not episode_number:
episode_number = int_or_none(self._search_regex(
r'\((\d+)\s*:\s*\d+\)', description or '',
'episode number', default=None))
info.update({
'title': title,
'series': series,
'season_id': season_id,
'season_number': season_number,
'episode': episode,
'episode_number': episode_number,
})
return info
class NRKTVIE(NRKBaseIE):
class NRKTVIE(InfoExtractor):
IE_DESC = 'NRK TV and NRK Radio'
_EPISODE_RE = r'(?P<id>[a-zA-Z]{4}\d{8})'
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/(?:[^/]+/)*%s' % _EPISODE_RE
_API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
_TESTS = [{
'url': 'https://tv.nrk.no/program/MDDP12000117',
'md5': 'c4a5960f1b00b40d47db65c1064e0ab1',
'info_dict': {
'id': 'MDDP12000117AA',
'id': 'MDDP12000117',
'ext': 'mp4',
'title': 'Alarm Trolltunga',
'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce',
@ -203,24 +303,27 @@ class NRKTVIE(NRKBaseIE):
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
'md5': '8d40dab61cea8ab0114e090b029a0565',
'info_dict': {
'id': 'MUHH48000314AA',
'id': 'MUHH48000314',
'ext': 'mp4',
'title': '20 spørsmål 23.05.2014',
'title': '20 spørsmål - 23. mai 2014',
'alt_title': '23. mai 2014',
'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
'duration': 1741,
'series': '20 spørsmål',
'episode': '23.05.2014',
'episode': '23. mai 2014',
'age_limit': 0,
},
}, {
'url': 'https://tv.nrk.no/program/mdfp15000514',
'info_dict': {
'id': 'MDFP15000514CA',
'id': 'MDFP15000514',
'ext': 'mp4',
'title': 'Grunnlovsjubiléet - Stor ståhei for ingenting 24.05.2014',
'title': 'Kunnskapskanalen - Grunnlovsjubiléet - Stor ståhei for ingenting',
'description': 'md5:89290c5ccde1b3a24bb8050ab67fe1db',
'duration': 4605.08,
'series': 'Kunnskapskanalen',
'episode': '24.05.2014',
'episode': 'Grunnlovsjubiléet - Stor ståhei for ingenting',
'age_limit': 0,
},
'params': {
'skip_download': True,
@ -229,10 +332,11 @@ class NRKTVIE(NRKBaseIE):
# single playlist video
'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015#del=2',
'info_dict': {
'id': 'MSPO40010515AH',
'id': 'MSPO40010515',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
'description': 'md5:c03aba1e917561eface5214020551b7a',
'age_limit': 0,
},
'params': {
'skip_download': True,
@ -242,24 +346,27 @@ class NRKTVIE(NRKBaseIE):
}, {
'url': 'https://tv.nrk.no/serie/tour-de-ski/MSPO40010515/06-01-2015',
'info_dict': {
'id': 'MSPO40010515AH',
'id': 'MSPO40010515',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
'description': 'md5:c03aba1e917561eface5214020551b7a',
'age_limit': 0,
},
'expected_warnings': ['Failed to download m3u8 information'],
'skip': 'Ikke tilgjengelig utenfor Norge',
}, {
'url': 'https://tv.nrk.no/serie/anno/KMTE50001317/sesong-3/episode-13',
'info_dict': {
'id': 'KMTE50001317AA',
'id': 'KMTE50001317',
'ext': 'mp4',
'title': 'Anno 13:30',
'title': 'Anno - 13. episode',
'description': 'md5:11d9613661a8dbe6f9bef54e3a4cbbfa',
'duration': 2340,
'series': 'Anno',
'episode': '13:30',
'episode': '13. episode',
'season_number': 3,
'episode_number': 13,
'age_limit': 0,
},
'params': {
'skip_download': True,
@ -267,13 +374,14 @@ class NRKTVIE(NRKBaseIE):
}, {
'url': 'https://tv.nrk.no/serie/nytt-paa-nytt/MUHH46000317/27-01-2017',
'info_dict': {
'id': 'MUHH46000317AA',
'id': 'MUHH46000317',
'ext': 'mp4',
'title': 'Nytt på Nytt 27.01.2017',
'description': 'md5:5358d6388fba0ea6f0b6d11c48b9eb4b',
'duration': 1796,
'series': 'Nytt på nytt',
'episode': '27.01.2017',
'age_limit': 0,
},
'params': {
'skip_download': True,
@ -290,180 +398,26 @@ class NRKTVIE(NRKBaseIE):
'only_matching': True,
}]
_api_host = None
def _extract_from_mediaelement(self, video_id):
api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
for api_host in api_hosts:
data = self._download_json(
'http://%s/mediaelement/%s' % (api_host, video_id),
video_id, 'Downloading mediaelement JSON',
fatal=api_host == api_hosts[-1])
if not data:
continue
self._api_host = api_host
break
title = data.get('fullTitle') or data.get('mainTitle') or data['title']
video_id = data.get('id') or video_id
urls = []
entries = []
conviva = data.get('convivaStatistics') or {}
live = (data.get('mediaElementType') == 'Live'
or data.get('isLive') is True or conviva.get('isLive'))
def make_title(t):
return self._live_title(t) if live else t
media_assets = data.get('mediaAssets')
if media_assets and isinstance(media_assets, list):
def video_id_and_title(idx):
return ((video_id, title) if len(media_assets) == 1
else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
for num, asset in enumerate(media_assets, 1):
asset_url = asset.get('url')
if not asset_url or asset_url in urls:
continue
urls.append(asset_url)
formats = self._extract_nrk_formats(asset_url, video_id)
if not formats:
continue
self._sort_formats(formats)
entry_id, entry_title = video_id_and_title(num)
duration = parse_duration(asset.get('duration'))
subtitles = {}
for subtitle in ('webVtt', 'timedText'):
subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
if subtitle_url:
subtitles.setdefault('no', []).append({
'url': compat_urllib_parse_unquote(subtitle_url)
})
entries.append({
'id': asset.get('carrierId') or entry_id,
'title': make_title(entry_title),
'duration': duration,
'subtitles': subtitles,
'formats': formats,
'is_live': live,
})
if not entries:
media_url = data.get('mediaUrl')
if media_url and media_url not in urls:
formats = self._extract_nrk_formats(media_url, video_id)
if formats:
self._sort_formats(formats)
duration = parse_duration(data.get('duration'))
entries = [{
'id': video_id,
'title': make_title(title),
'duration': duration,
'formats': formats,
'is_live': live,
}]
if not entries:
self._raise_error(data)
series = conviva.get('seriesName') or data.get('seriesTitle')
episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
season_number = None
episode_number = None
if data.get('mediaElementType') == 'Episode':
_season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \
data.get('relativeOriginUrl', '')
EPISODENUM_RE = [
r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.',
r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})',
]
season_number = int_or_none(self._search_regex(
EPISODENUM_RE, _season_episode, 'season number',
default=None, group='season'))
episode_number = int_or_none(self._search_regex(
EPISODENUM_RE, _season_episode, 'episode number',
default=None, group='episode'))
thumbnails = None
images = data.get('images')
if images and isinstance(images, dict):
web_images = images.get('webImages')
if isinstance(web_images, list):
thumbnails = [{
'url': image['imageUrl'],
'width': int_or_none(image.get('width')),
'height': int_or_none(image.get('height')),
} for image in web_images if image.get('imageUrl')]
description = data.get('description')
category = data.get('mediaAnalytics', {}).get('category')
common_info = {
'description': description,
'series': series,
'episode': episode,
'season_number': season_number,
'episode_number': episode_number,
'categories': [category] if category else None,
'age_limit': parse_age_limit(data.get('legalAge')),
'thumbnails': thumbnails,
}
vcodec = 'none' if data.get('mediaType') == 'Audio' else None
for entry in entries:
entry.update(common_info)
for f in entry['formats']:
f['vcodec'] = vcodec
points = data.get('shortIndexPoints')
if isinstance(points, list):
chapters = []
for next_num, point in enumerate(points, start=1):
if not isinstance(point, dict):
continue
start_time = parse_duration(point.get('startPoint'))
if start_time is None:
continue
end_time = parse_duration(
data.get('duration')
if next_num == len(points)
else points[next_num].get('startPoint'))
if end_time is None:
continue
chapters.append({
'start_time': start_time,
'end_time': end_time,
'title': point.get('title'),
})
if chapters and len(entries) == 1:
entries[0]['chapters'] = chapters
return self.playlist_result(entries, video_id, title, description)
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_from_mediaelement(video_id)
return self.url_result(
'nrk:%s' % video_id, ie=NRKIE.ie_key(), video_id=video_id)
class NRKTVEpisodeIE(InfoExtractor):
_VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
_VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/(?P<season_number>\d+)/episode/(?P<episode_number>\d+))'
_TESTS = [{
'url': 'https://tv.nrk.no/serie/hellums-kro/sesong/1/episode/2',
'info_dict': {
'id': 'MUHH36005220BA',
'id': 'MUHH36005220',
'ext': 'mp4',
'title': 'Kro, krig og kjærlighet 2:6',
'description': 'md5:b32a7dc0b1ed27c8064f58b97bda4350',
'duration': 1563,
'title': 'Hellums kro - 2. Kro, krig og kjærlighet',
'description': 'md5:ad92ddffc04cea8ce14b415deef81787',
'duration': 1563.92,
'series': 'Hellums kro',
'season_number': 1,
'episode_number': 2,
'episode': '2:6',
'episode': '2. Kro, krig og kjærlighet',
'age_limit': 6,
},
'params': {
@ -472,15 +426,16 @@ class NRKTVEpisodeIE(InfoExtractor):
}, {
'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
'info_dict': {
'id': 'MSUI14000816AA',
'id': 'MSUI14000816',
'ext': 'mp4',
'title': 'Backstage 8:30',
'title': 'Backstage - 8. episode',
'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
'duration': 1320,
'series': 'Backstage',
'season_number': 1,
'episode_number': 8,
'episode': '8:30',
'episode': '8. episode',
'age_limit': 0,
},
'params': {
'skip_download': True,
@ -489,7 +444,7 @@ class NRKTVEpisodeIE(InfoExtractor):
}]
def _real_extract(self, url):
display_id = self._match_id(url)
display_id, season_number, episode_number = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
@ -501,10 +456,12 @@ class NRKTVEpisodeIE(InfoExtractor):
assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
info.update({
'_type': 'url_transparent',
'_type': 'url',
'id': nrk_id,
'url': 'nrk:%s' % nrk_id,
'ie_key': NRKIE.ie_key(),
'season_number': int(season_number),
'episode_number': int(episode_number),
})
return info
@ -518,8 +475,6 @@ class NRKTVSerieBaseIE(NRKBaseIE):
nrk_id = episode.get('prfId') or episode.get('episodeId')
if not nrk_id or not isinstance(nrk_id, compat_str):
continue
if not re.match(NRKTVIE._EPISODE_RE, nrk_id):
continue
entries.append(self.url_result(
'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
return entries
@ -531,6 +486,10 @@ class NRKTVSerieBaseIE(NRKBaseIE):
if embedded.get(asset_key):
return asset_key
# Map the kind segment of a URL to a catalog API path segment: 'podcast' for
# the 'podcast'/'podkast' spellings, 'series' for any other value.
@staticmethod
def _catalog_name(serie_kind):
return 'podcast' if serie_kind in ('podcast', 'podkast') else 'series'
def _entries(self, data, display_id):
for page_num in itertools.count(1):
embedded = data.get('_embedded') or data
@ -564,7 +523,16 @@ class NRKTVSerieBaseIE(NRKBaseIE):
class NRKTVSeasonIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://(?P<domain>tv|radio)\.nrk\.no/serie/(?P<serie>[^/]+)/(?:sesong/)?(?P<id>\d+)'
_VALID_URL = r'''(?x)
https?://
(?P<domain>tv|radio)\.nrk\.no/
(?P<serie_kind>serie|pod[ck]ast)/
(?P<serie>[^/]+)/
(?:
(?:sesong/)?(?P<id>\d+)|
sesong/(?P<id_2>[^/?#&]+)
)
'''
_TESTS = [{
'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
'info_dict': {
@ -600,19 +568,34 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE):
# 180 entries, single page
'url': 'https://tv.nrk.no/serie/spangas/sesong/1',
'only_matching': True,
}, {
'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/diagnose-kverulant',
'info_dict': {
'id': 'hele_historien/diagnose-kverulant',
'title': 'Diagnose kverulant',
},
'playlist_mincount': 3,
}, {
'url': 'https://radio.nrk.no/podkast/loerdagsraadet/sesong/202101',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url) or NRKRadioPodkastIE.suitable(url)
else super(NRKTVSeasonIE, cls).suitable(url))
def _real_extract(self, url):
domain, serie, season_id = re.match(self._VALID_URL, url).groups()
mobj = re.match(self._VALID_URL, url)
domain = mobj.group('domain')
serie_kind = mobj.group('serie_kind')
serie = mobj.group('serie')
season_id = mobj.group('id') or mobj.group('id_2')
display_id = '%s/%s' % (serie, season_id)
data = self._call_api(
'%s/catalog/series/%s/seasons/%s' % (domain, serie, season_id),
'%s/catalog/%s/%s/seasons/%s'
% (domain, self._catalog_name(serie_kind), serie, season_id),
display_id, 'season', query={'pageSize': 50})
title = try_get(data, lambda x: x['titles']['title'], compat_str) or display_id
@ -622,7 +605,7 @@ class NRKTVSeasonIE(NRKTVSerieBaseIE):
class NRKTVSeriesIE(NRKTVSerieBaseIE):
_VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/serie/(?P<id>[^/]+)'
_VALID_URL = r'https?://(?P<domain>(?:tv|radio)\.nrk|(?:tv\.)?nrksuper)\.no/(?P<serie_kind>serie|pod[ck]ast)/(?P<id>[^/]+)'
_TESTS = [{
# new layout, instalments
'url': 'https://tv.nrk.no/serie/groenn-glede',
@ -682,23 +665,33 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
}, {
'url': 'https://nrksuper.no/serie/labyrint',
'only_matching': True,
}, {
'url': 'https://radio.nrk.no/podkast/ulrikkes_univers',
'info_dict': {
'id': 'ulrikkes_univers',
},
'playlist_mincount': 10,
}, {
'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/nrkno-poddkast-26588-134079-05042018030000',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
    """Return whether the series extractor should handle ``url``.

    The series pattern only anchors the start of the path, so any URL that
    NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE or NRKTVSeasonIE accepts is
    rejected here and left to that extractor.
    """
    other_extractors = (
        NRKTVIE, NRKTVEpisodeIE, NRKRadioPodkastIE, NRKTVSeasonIE)
    if any(ie.suitable(url) for ie in other_extractors):
        return False
    return super(NRKTVSeriesIE, cls).suitable(url)
def _real_extract(self, url):
site, series_id = re.match(self._VALID_URL, url).groups()
site, serie_kind, series_id = re.match(self._VALID_URL, url).groups()
is_radio = site == 'radio.nrk'
domain = 'radio' if is_radio else 'tv'
size_prefix = 'p' if is_radio else 'embeddedInstalmentsP'
series = self._call_api(
'%s/catalog/series/%s' % (domain, series_id),
'%s/catalog/%s/%s'
% (domain, self._catalog_name(serie_kind), series_id),
series_id, 'serie', query={size_prefix + 'ageSize': 50})
titles = try_get(series, [
lambda x: x['titles'],
@ -713,12 +706,14 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE):
embedded_seasons = embedded.get('seasons') or []
if len(linked_seasons) > len(embedded_seasons):
for season in linked_seasons:
season_name = season.get('name')
if season_name and isinstance(season_name, compat_str):
season_url = urljoin(url, season.get('href'))
if not season_url:
season_name = season.get('name')
if season_name and isinstance(season_name, compat_str):
season_url = 'https://%s.nrk.no/serie/%s/sesong/%s' % (domain, series_id, season_name)
if season_url:
entries.append(self.url_result(
'https://%s.nrk.no/serie/%s/sesong/%s'
% (domain, series_id, season_name),
ie=NRKTVSeasonIE.ie_key(),
season_url, ie=NRKTVSeasonIE.ie_key(),
video_title=season.get('title')))
else:
for season in embedded_seasons:
@ -743,6 +738,38 @@ class NRKTVDirekteIE(NRKTVIE):
}]
class NRKRadioPodkastIE(InfoExtractor):
    # Matches radio.nrk.no podcast URLs whose last matched path component is
    # an ``l_<uuid>`` episode id and hands that id over to NRKIE.
    _VALID_URL = r'https?://radio\.nrk\.no/pod[ck]ast/(?:[^/]+/)+(?P<id>l_[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'

    _TESTS = [{
        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
        'md5': '8d40dab61cea8ab0114e090b029a0565',
        'info_dict': {
            'id': 'MUHH48000314AA',
            'ext': 'mp4',
            'title': '20 spørsmål 23.05.2014',
            'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
            'duration': 1741,
            'series': '20 spørsmål',
            'episode': '23.05.2014',
        },
    }, {
        'url': 'https://radio.nrk.no/podcast/ulrikkes_univers/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
        'only_matching': True,
    }, {
        'url': 'https://radio.nrk.no/podkast/ulrikkes_univers/sesong/1/l_96f4f1b0-de54-4e6a-b4f1-b0de54fe6af8',
        'only_matching': True,
    }, {
        'url': 'https://radio.nrk.no/podkast/hele_historien/sesong/bortfoert-i-bergen/l_774d1a2c-7aa7-4965-8d1a-2c7aa7d9652c',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Redirect the matched episode id to NRKIE via an ``nrk:`` URL."""
        episode_id = self._match_id(url)
        nrk_url = 'nrk:%s' % episode_id
        return self.url_result(
            nrk_url, ie=NRKIE.ie_key(), video_id=episode_id)
class NRKPlaylistBaseIE(InfoExtractor):
def _extract_description(self, webpage):
pass

View File

@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
extract_attributes,
@ -11,36 +13,59 @@ from ..utils import (
class SkyBaseIE(InfoExtractor):
    # Shared logic for Sky pages: locate an ``sdc-article-video`` /
    # ``sdc-site-video`` element and turn it into a ``url_transparent``
    # result for the Ooyala or Brightcove extractor.
    BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
    _SDC_EL_REGEX = r'(?s)(<div[^>]+data-(?:component-name|fn)="sdc-(?:articl|sit)e-video"[^>]*>)'

    def _process_ooyala_element(self, webpage, sdc_el, url):
        """Build a ``url_transparent`` result from one sdc video element.

        :param webpage: full page HTML, searched for the matching Ooyala
            media element when the provider is ``ooyala``.
        :param sdc_el: HTML of the opening tag matched by ``_SDC_EL_REGEX``.
        :param url: page URL, used to resolve a relative token-fetch URL.
        :returns: dict with ``_type``, ``id``, ``url`` and ``ie_key``.
        :raises ExtractorError: if ``data-provider`` is neither ``ooyala``
            nor ``brightcove``.
        """
        sdc = extract_attributes(sdc_el)
        provider = sdc.get('data-provider')
        if provider == 'ooyala':
            video_id = sdc['data-sdc-video-id']
            video_url = 'ooyala:%s' % video_id
            ie_key = 'Ooyala'
            ooyala_el = self._search_regex(
                r'(<div[^>]+class="[^"]*\bsdc-article-video__media-ooyala\b[^"]*"[^>]+data-video-id="%s"[^>]*>)' % video_id,
                webpage, 'video data', fatal=False)
            if ooyala_el:
                ooyala_attrs = extract_attributes(ooyala_el) or {}
                if ooyala_attrs.get('data-token-required') == 'true':
                    token_fetch_url = (self._parse_json(ooyala_attrs.get(
                        'data-token-fetch-options', '{}'),
                        video_id, fatal=False) or {}).get('url')
                    if token_fetch_url:
                        embed_token = self._download_json(urljoin(
                            url, token_fetch_url), video_id, fatal=False)
                        if embed_token:
                            # Pass the token along inside the URL so the
                            # receiving extractor can pick it up.
                            video_url = smuggle_url(
                                video_url, {'embed_token': embed_token})
        elif provider == 'brightcove':
            video_id = sdc['data-video-id']
            # Hard-coded fallbacks are used when the element does not carry
            # its own account/player ids.
            account_id = sdc.get('data-account-id') or '6058004172001'
            player_id = sdc.get('data-player-id') or 'RC9PQUaJ6'
            video_url = self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id)
            ie_key = 'BrightcoveNew'
        else:
            # Without this branch video_id/video_url/ie_key would be unbound
            # below and the method would die with an UnboundLocalError.
            from ..utils import ExtractorError
            raise ExtractorError(
                'Unsupported video provider: %s' % provider)

        return {
            '_type': 'url_transparent',
            'id': video_id,
            'url': video_url,
            'ie_key': ie_key,
        }

    def _real_extract(self, url):
        """Extract the first sdc video element of the page at ``url``."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        info = self._process_ooyala_element(webpage, self._search_regex(
            self._SDC_EL_REGEX, webpage, 'sdc element'), url)
        # Title and description come from the page's Open Graph metadata.
        info.update({
            'title': self._og_search_title(webpage),
            'description': strip_or_none(self._og_search_description(webpage)),
        })
        return info
class SkySportsIE(SkyBaseIE):
IE_NAME = 'sky:sports'
_VALID_URL = r'https?://(?:www\.)?skysports\.com/watch/video/([^/]+/)*(?P<id>[0-9]+)'
_TESTS = [{
'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine',
@ -62,15 +87,45 @@ class SkySportsIE(SkyBaseIE):
class SkyNewsIE(SkyBaseIE):
    # news.sky.com video pages; extraction itself is inherited from
    # SkyBaseIE.
    IE_NAME = 'sky:news'
    _VALID_URL = r'https?://news\.sky\.com/video/[0-9a-z-]+-(?P<id>[0-9]+)'
    # Each key appears exactly once: in a dict literal a repeated key
    # silently overrides the earlier value.
    _TEST = {
        'url': 'https://news.sky.com/video/russian-plane-inspected-after-deadly-fire-11712962',
        'md5': '411e8893fd216c75eaf7e4c65d364115',
        'info_dict': {
            'id': 'ref:1ua21xaDE6lCtZDmbYfl8kwsKLooJbNM',
            'ext': 'mp4',
            'title': 'Russian plane inspected after deadly fire',
            'description': 'The Russian Investigative Committee has released video of the wreckage of a passenger plane which caught fire near Moscow.',
            'uploader_id': '6058004172001',
            'timestamp': 1567112345,
            'upload_date': '20190829',
        },
        'add_ie': ['BrightcoveNew'],
    }
class SkySportsNewsIE(SkyBaseIE):
    # skysports.com news articles: every sdc video element found in the
    # article becomes one playlist entry.
    IE_NAME = 'sky:sports:news'
    _VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.skysports.com/golf/news/12176/10871916/dustin-johnson-ready-to-conquer-players-championship-at-tpc-sawgrass',
        'info_dict': {
            'id': '10871916',
            'title': 'Dustin Johnson ready to conquer Players Championship at TPC Sawgrass',
            'description': 'Dustin Johnson is confident he can continue his dominant form in 2017 by adding the Players Championship to his list of victories.',
        },
        'playlist_count': 2,
    }

    def _real_extract(self, url):
        """Return a playlist with one entry per sdc video element."""
        article_id = self._match_id(url)
        webpage = self._download_webpage(url, article_id)

        entries = [
            self._process_ooyala_element(webpage, sdc_el, url)
            for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)]

        playlist_title = self._og_search_title(webpage)
        playlist_description = self._html_search_meta(
            ['og:description', 'description'], webpage)
        return self.playlist_result(
            entries, article_id, playlist_title, playlist_description)

View File

@ -4,25 +4,28 @@ import re
from .common import InfoExtractor
from ..utils import (
determine_ext,
clean_html,
ExtractorError,
int_or_none,
js_to_json,
unescapeHTML,
str_or_none,
try_get,
)
class StitcherIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
_VALID_URL = r'https?://(?:www\.)?stitcher\.com/(?:podcast|show)/(?:[^/]+/)+e(?:pisode)?/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
_TESTS = [{
'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
'md5': 'e9635098e0da10b21a0e2b85585530f6',
'info_dict': {
'id': '40789481',
'ext': 'mp3',
'title': 'Machine Learning Mastery and Cancer Clusters',
'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
'description': 'md5:547adb4081864be114ae3831b4c2b42f',
'duration': 1604,
'thumbnail': r're:^https?://.*\.jpg',
'upload_date': '20180126',
'timestamp': 1516989316,
},
}, {
'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
@ -38,6 +41,7 @@ class StitcherIE(InfoExtractor):
'params': {
'skip_download': True,
},
'skip': 'Page Not Found',
}, {
# escaped title
'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
@ -45,37 +49,39 @@ class StitcherIE(InfoExtractor):
}, {
'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
'only_matching': True,
}, {
'url': 'https://www.stitcher.com/show/threedom/episode/circles-on-a-stick-200212584',
'only_matching': True,
}]
def _real_extract(self, url):
    """Extract one Stitcher episode through the episode API.

    The numeric id from the URL is requested from
    ``api.prod.stitcher.com``; the first entry of ``data.episodes`` in the
    response supplies all metadata and the audio URL.

    :raises ExtractorError: if the response contains no episode. The API's
        own error message is used when one is present.
    """
    mobj = re.match(self._VALID_URL, url)
    # Named groups keep this independent of the group order in _VALID_URL.
    # display_id is None for URLs without a slug before the numeric id.
    display_id = mobj.group('display_id')
    audio_id = mobj.group('id')

    resp = self._download_json(
        'https://api.prod.stitcher.com/episode/' + audio_id,
        display_id or audio_id)
    episode = try_get(resp, lambda x: x['data']['episodes'][0], dict)
    if not episode:
        # Look the message up defensively so that a response without an
        # ``errors`` list does not surface as an unrelated KeyError.
        message = try_get(resp, lambda x: x['errors'][0]['message'])
        if message:
            raise ExtractorError(message, expected=True)
        raise ExtractorError('Unable to extract episode data')

    title = episode['title'].strip()
    audio_url = episode['audio_url']

    # A thumbnail URL is only built when a show id is present and
    # classic_id is not -1.
    thumbnail = None
    show_id = episode.get('show_id')
    if show_id and episode.get('classic_id') != -1:
        thumbnail = 'https://stitcher-classic.imgix.net/feedimages/%s.jpg' % show_id

    return {
        'id': audio_id,
        'display_id': display_id,
        'title': title,
        'description': clean_html(episode.get('html_description') or episode.get('description')),
        'duration': int_or_none(episode.get('duration')),
        'thumbnail': thumbnail,
        'url': audio_url,
        'vcodec': 'none',
        'timestamp': int_or_none(episode.get('date_created')),
        'season_number': int_or_none(episode.get('season')),
        'season_id': str_or_none(episode.get('season_id')),
    }

View File

@ -25,7 +25,6 @@ class VVVVIDIE(InfoExtractor):
'duration': 239,
'series': '"Perché dovrei guardarlo?" di Dario Moccia',
'season_id': '437',
'season_number': 1,
'episode': 'Ping Pong',
'episode_number': 1,
'episode_id': '3334',
@ -75,7 +74,6 @@ class VVVVIDIE(InfoExtractor):
def _extract_common_video_info(self, video_data):
return {
'thumbnail': video_data.get('thumbnail'),
'episode_number': int_or_none(video_data.get('number')),
'episode_id': str_or_none(video_data.get('id')),
}
@ -145,6 +143,17 @@ class VVVVIDIE(InfoExtractor):
return d
info = {}
def metadata_from_url(r_url):
    """Fill ``info`` with episode/season numbers parsed from ``r_url``.

    Does nothing when ``info`` already has content, when ``r_url`` is empty,
    or when the URL carries no ``_[S<n>]Ep<n>`` marker.
    """
    if info or not r_url:
        return
    mobj = re.search(r'_(?:S(\d+))?Ep(\d+)', r_url)
    if not mobj:
        return
    season, episode = mobj.groups()
    info['episode_number'] = int(episode)
    # The season part of the marker is optional.
    if season:
        info['season_number'] = int(season)
for quality in ('_sd', ''):
embed_code = video_data.get('embed_info' + quality)
if not embed_code:
@ -152,7 +161,6 @@ class VVVVIDIE(InfoExtractor):
embed_code = ds(embed_code)
video_type = video_data.get('video_type')
if video_type in ('video/rcs', 'video/kenc'):
embed_code = re.sub(r'https?://([^/]+)/z/', r'https://\1/i/', embed_code).replace('/manifest.f4m', '/master.m3u8')
if video_type == 'video/kenc':
kenc = self._download_json(
'https://www.vvvvid.it/kenc', video_id, query={
@ -163,15 +171,16 @@ class VVVVIDIE(InfoExtractor):
kenc_message = kenc.get('message')
if kenc_message:
embed_code += '?' + ds(kenc_message)
formats.extend(self._extract_m3u8_formats(
embed_code, video_id, 'mp4',
m3u8_id='hls', fatal=False))
formats.extend(self._extract_akamai_formats(embed_code, video_id))
else:
formats.extend(self._extract_wowza_formats(
'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id))
metadata_from_url(embed_code)
self._sort_formats(formats)
info = self._extract_common_video_info(video_data)
metadata_from_url(video_data.get('thumbnail'))
info.update(self._extract_common_video_info(video_data))
info.update({
'id': video_id,
'title': title,
@ -179,7 +188,6 @@ class VVVVIDIE(InfoExtractor):
'duration': int_or_none(video_data.get('length')),
'series': video_data.get('show_title'),
'season_id': season_id,
'season_number': video_data.get('season_number'),
'episode': title,
'view_count': int_or_none(video_data.get('views')),
'like_count': int_or_none(video_data.get('video_likes')),
@ -214,9 +222,10 @@ class VVVVIDShowIE(VVVVIDIE):
entries = []
for season in (seasons or []):
season_number = int_or_none(season.get('number'))
episodes = season.get('episodes') or []
for episode in episodes:
if episode.get('playable') is False:
continue
season_id = str_or_none(episode.get('season_id'))
video_id = str_or_none(episode.get('video_id'))
if not (season_id and video_id):
@ -228,7 +237,6 @@ class VVVVIDShowIE(VVVVIDIE):
'url': '/'.join([base_url, season_id, video_id]),
'title': episode.get('title'),
'description': episode.get('description'),
'season_number': season_number,
'season_id': season_id,
})
entries.append(info)

View File

@ -4,17 +4,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
strip_jsonp,
)
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
_EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_TEST = {
_TESTS = [{
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
'info_dict': {
@ -23,10 +19,15 @@ class WashingtonPostIE(InfoExtractor):
'title': 'Egypt finds belongings, debris from plane crash',
'description': 'md5:a17ceee432f215a5371388c1f680bd86',
'upload_date': '20160520',
'uploader': 'Reuters',
'timestamp': 1463778452,
'timestamp': 1463775187,
},
}
}, {
'url': 'https://www.washingtonpost.com/video/world/egypt-finds-belongings-debris-from-plane-crash/2016/05/20/480ba4ee-1ec7-11e6-82c2-a7dcb313287d_video.html',
'only_matching': True,
}, {
'url': 'https://www.washingtonpost.com/posttv/world/iraq-to-track-down-antiquities-after-islamic-state-museum-rampage/2015/02/28/7c57e916-bf86-11e4-9dfb-03366e719af8_video.html',
'only_matching': True,
}]
@classmethod
def _extract_urls(cls, webpage):
@ -35,73 +36,8 @@ class WashingtonPostIE(InfoExtractor):
def _real_extract(self, url):
    """Delegate extraction to the ArcPublishing extractor.

    The UUID matched from the URL is wrapped in an
    ``arcpublishing:wapo:<id>`` URL; formats and metadata are resolved by
    the extractor that handles that URL.
    """
    video_id = self._match_id(url)
    return self.url_result(
        'arcpublishing:wapo:' + video_id, 'ArcPublishing', video_id)
class WashingtonPostArticleIE(InfoExtractor):
@ -121,9 +57,8 @@ class WashingtonPostArticleIE(InfoExtractor):
'title': 'Breaking Points: The Paper Mine',
'duration': 1290,
'description': 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.',
'uploader': 'The Washington Post',
'timestamp': 1395527908,
'upload_date': '20140322',
'timestamp': 1395440416,
'upload_date': '20140321',
},
}, {
'md5': '1fff6a689d8770966df78c8cb6c8c17c',
@ -133,9 +68,8 @@ class WashingtonPostArticleIE(InfoExtractor):
'title': 'The town bureaucracy sustains',
'description': 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it\'s like to do paperwork 230 feet underground.',
'duration': 2220,
'timestamp': 1395528005,
'upload_date': '20140322',
'uploader': 'The Washington Post',
'timestamp': 1395441819,
'upload_date': '20140321',
},
}],
}, {
@ -151,8 +85,7 @@ class WashingtonPostArticleIE(InfoExtractor):
'ext': 'mp4',
'description': 'Washington Post transportation reporter Ashley Halsey III explains why a plane\'s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.',
'upload_date': '20141230',
'uploader': 'The Washington Post',
'timestamp': 1419974765,
'timestamp': 1419972442,
'title': 'Why black boxes dont transmit data in real time',
}
}]

View File

@ -66,7 +66,7 @@ class YandexVideoIE(InfoExtractor):
video_id = self._match_id(url)
player = try_get((self._download_json(
'https://frontend.vh.yandex.ru/graphql', video_id, data=b'''{
'https://frontend.vh.yandex.ru/graphql', video_id, data=('''{
player(content_id: "%s") {
computed_title
content_url
@ -86,7 +86,7 @@ class YandexVideoIE(InfoExtractor):
title
views_count
}
}''' % video_id.encode(), fatal=False)), lambda x: x['player']['content'])
}''' % video_id).encode(), fatal=False)), lambda x: x['player']['content'])
if not player or player.get('error'):
player = self._download_json(
'https://frontend.vh.yandex.ru/v23/player/%s.json' % video_id,