Update to ytdl-commit-dfbbe29

[redbulltv] fix embed data extraction
dfbbe2902f
This commit is contained in:
pukkandan 2021-05-20 21:08:49 +05:30
parent 5014558ab9
commit b73612a254
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698
10 changed files with 145 additions and 44 deletions

View File

@ -6,7 +6,7 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlencode
from ..utils import (
ExtractorError,
unescapeHTML
merge_dicts,
)
@ -24,7 +24,8 @@ class EroProfileIE(InfoExtractor):
'title': 'sexy babe softcore',
'thumbnail': r're:https?://.*\.jpg',
'age_limit': 18,
}
},
'skip': 'Video not found',
}, {
'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
'md5': '1baa9602ede46ce904c431f5418d8916',
@ -77,19 +78,15 @@ class EroProfileIE(InfoExtractor):
[r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
webpage, 'video id', default=None)
video_url = unescapeHTML(self._search_regex(
r'<source src="([^"]+)', webpage, 'video url'))
title = self._html_search_regex(
r'Title:</th><td>([^<]+)</td>', webpage, 'title')
thumbnail = self._search_regex(
r'onclick="showVideoPlayer\(\)"><img src="([^"]+)',
webpage, 'thumbnail', fatal=False)
(r'Title:</th><td>([^<]+)</td>', r'<h1[^>]*>(.+?)</h1>'),
webpage, 'title')
return {
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
return merge_dicts(info, {
'id': video_id,
'display_id': display_id,
'url': video_url,
'title': title,
'thumbnail': thumbnail,
'age_limit': 18,
}
})

View File

@ -985,6 +985,7 @@ from .platzi import (
from .playfm import PlayFMIE
from .playplustv import PlayPlusTVIE
from .plays import PlaysTVIE
from .playstuff import PlayStuffIE
from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
from .playwire import PlaywireIE

View File

@ -126,6 +126,7 @@ from .viqeo import ViqeoIE
from .expressen import ExpressenIE
from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE
from .vk import VKIE
from .kinja import KinjaEmbedIE
from .gedidigital import GediDigitalIE
from .rcs import RCSEmbedsIE
@ -2252,6 +2253,10 @@ class GenericIE(InfoExtractor):
'playlist_mincount': 52,
},
{
# Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
'only_matching': True,
}, {
# WimTv embed player
'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/',
'info_dict': {
@ -2803,6 +2808,11 @@ class GenericIE(InfoExtractor):
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
# Look for sibnet embedded player
sibnet_urls = VKIE._extract_sibnet_urls(webpage)
if sibnet_urls:
return self.playlist_from_matches(sibnet_urls, video_id, video_title)
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
if mobj is not None:
@ -3454,6 +3464,9 @@ class GenericIE(InfoExtractor):
'url': src,
'ext': (mimetype2ext(src_type)
or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
'http_headers': {
'Referer': full_response.geturl(),
},
})
if formats:
self._sort_formats(formats)
@ -3522,7 +3535,7 @@ class GenericIE(InfoExtractor):
m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
# We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
if m_video_type is not None:
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage))
if not found:
REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
found = re.search(

View File

@ -182,7 +182,7 @@ class ORFRadioIE(InfoExtractor):
duration = end - start if end and start else None
entries.append({
'id': loop_stream_id.replace('.mp3', ''),
'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id),
'title': title,
'description': clean_html(data.get('subtitle')),
'duration': duration,

View File

@ -9,8 +9,9 @@ from ..compat import compat_str
from ..utils import (
int_or_none,
merge_dicts,
try_get,
unified_timestamp,
xpath_text,
urljoin,
)
@ -27,10 +28,11 @@ class PhoenixIE(ZDFBaseIE):
'title': 'Wohin führt der Protest in der Pandemie?',
'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
'duration': 1691,
'timestamp': 1613906100,
'timestamp': 1613902500,
'upload_date': '20210221',
'uploader': 'Phoenix',
'channel': 'corona nachgehakt',
'series': 'corona nachgehakt',
'episode': 'Wohin führt der Protest in der Pandemie?',
},
}, {
# Youtube embed
@ -79,36 +81,39 @@ class PhoenixIE(ZDFBaseIE):
video_id = compat_str(video.get('basename') or video.get('content'))
details = self._download_xml(
details = self._download_json(
'https://www.phoenix.de/php/mediaplayer/data/beitrags_details.php',
video_id, 'Downloading details XML', query={
video_id, 'Downloading details JSON', query={
'ak': 'web',
'ptmd': 'true',
'id': video_id,
'profile': 'player2',
})
title = title or xpath_text(
details, './/information/title', 'title', fatal=True)
content_id = xpath_text(
details, './/video/details/basename', 'content id', fatal=True)
title = title or details['title']
content_id = details['tracking']['nielsen']['content']['assetid']
info = self._extract_ptmd(
'https://tmd.phoenix.de/tmd/2/ngplayer_2_3/vod/ptmd/phoenix/%s' % content_id,
content_id, None, url)
timestamp = unified_timestamp(xpath_text(details, './/details/airtime'))
duration = int_or_none(try_get(
details, lambda x: x['tracking']['nielsen']['content']['length']))
timestamp = unified_timestamp(details.get('editorialDate'))
series = try_get(
details, lambda x: x['tracking']['nielsen']['content']['program'],
compat_str)
episode = title if details.get('contentType') == 'episode' else None
thumbnails = []
for node in details.findall('.//teaserimages/teaserimage'):
thumbnail_url = node.text
teaser_images = try_get(details, lambda x: x['teaserImageRef']['layouts'], dict) or {}
for thumbnail_key, thumbnail_url in teaser_images.items():
thumbnail_url = urljoin(url, thumbnail_url)
if not thumbnail_url:
continue
thumbnail = {
'url': thumbnail_url,
}
thumbnail_key = node.get('key')
if thumbnail_key:
m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key)
if m:
thumbnail['width'] = int(m.group(1))
@ -118,11 +123,11 @@ class PhoenixIE(ZDFBaseIE):
return merge_dicts(info, {
'id': content_id,
'title': title,
'description': xpath_text(details, './/information/detail'),
'duration': int_or_none(xpath_text(details, './/details/lengthSec')),
'description': details.get('leadParagraph'),
'duration': duration,
'thumbnails': thumbnails,
'timestamp': timestamp,
'uploader': xpath_text(details, './/details/channel'),
'uploader_id': xpath_text(details, './/details/originChannelId'),
'channel': xpath_text(details, './/details/originChannelTitle'),
'uploader': details.get('tvService'),
'series': series,
'episode': episode,
})

View File

@ -0,0 +1,65 @@
from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
smuggle_url,
try_get,
)
class PlayStuffIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?play\.stuff\.co\.nz/details/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://play.stuff.co.nz/details/608778ac1de1c4001a3fa09a',
'md5': 'c82d3669e5247c64bc382577843e5bd0',
'info_dict': {
'id': '6250584958001',
'ext': 'mp4',
'title': 'Episode 1: Rotorua/Mt Maunganui/Tauranga',
'description': 'md5:c154bafb9f0dd02d01fd4100fb1c1913',
'uploader_id': '6005208634001',
'timestamp': 1619491027,
'upload_date': '20210427',
},
'add_ie': ['BrightcoveNew'],
}, {
# geo restricted, bypassable
'url': 'https://play.stuff.co.nz/details/_6155660351001',
'only_matching': True,
}]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
state = self._parse_json(
self._search_regex(
r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'state'),
video_id)
account_id = try_get(
state, lambda x: x['configurations']['accountId'],
compat_str) or '6005208634001'
player_id = try_get(
state, lambda x: x['configurations']['playerId'],
compat_str) or 'default'
entries = []
for item_id, video in state['items'].items():
if not isinstance(video, dict):
continue
asset_id = try_get(
video, lambda x: x['content']['attributes']['assetId'],
compat_str)
if not asset_id:
continue
entries.append(self.url_result(
smuggle_url(
self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, asset_id),
{'geo_countries': ['NZ']}),
'BrightcoveNew', video_id))
return self.playlist_result(entries, video_id)

View File

@ -133,8 +133,10 @@ class RedBullEmbedIE(RedBullTVIE):
rrn_id = self._match_id(url)
asset_id = self._download_json(
'https://edge-graphql.crepo-production.redbullaws.com/v1/graphql',
rrn_id, headers={'API-KEY': 'e90a1ff11335423998b100c929ecc866'},
query={
rrn_id, headers={
'Accept': 'application/json',
'API-KEY': 'e90a1ff11335423998b100c929ecc866',
}, query={
'query': '''{
resource(id: "%s", enforceGeoBlocking: false) {
%s

View File

@ -21,6 +21,7 @@ from ..utils import (
class ShahidBaseIE(AWSIE):
_AWS_PROXY_HOST = 'api2.shahid.net'
_AWS_API_KEY = '2RRtuMHx95aNI1Kvtn2rChEuwsCogUd4samGPjLh'
_VALID_URL_BASE = r'https?://shahid\.mbc\.net/[a-z]{2}/'
def _handle_error(self, e):
fail_data = self._parse_json(
@ -49,7 +50,7 @@ class ShahidBaseIE(AWSIE):
class ShahidIE(ShahidBaseIE):
_NETRC_MACHINE = 'shahid'
_VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
_VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:serie|show|movie)s/[^/]+/(?P<type>episode|clip|movie)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://shahid.mbc.net/ar/shows/%D9%85%D8%AA%D8%AD%D9%81-%D8%A7%D9%84%D8%AF%D8%AD%D9%8A%D8%AD-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D9%83%D9%84%D9%8A%D8%A8-1/clip-816924',
'info_dict': {
@ -73,6 +74,9 @@ class ShahidIE(ShahidBaseIE):
# shahid plus subscriber only
'url': 'https://shahid.mbc.net/ar/series/%D9%85%D8%B1%D8%A7%D9%8A%D8%A7-2011-%D8%A7%D9%84%D9%85%D9%88%D8%B3%D9%85-1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/episode-90511',
'only_matching': True
}, {
'url': 'https://shahid.mbc.net/en/shows/Ramez-Fi-Al-Shallal-season-1-episode-1/episode-359319',
'only_matching': True
}]
def _real_initialize(self):
@ -168,7 +172,7 @@ class ShahidIE(ShahidBaseIE):
class ShahidShowIE(ShahidBaseIE):
_VALID_URL = r'https?://shahid\.mbc\.net/ar/(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
_VALID_URL = ShahidBaseIE._VALID_URL_BASE + r'(?:show|serie)s/[^/]+/(?:show|series)-(?P<id>\d+)'
_TESTS = [{
'url': 'https://shahid.mbc.net/ar/shows/%D8%B1%D8%A7%D9%85%D8%B2-%D9%82%D8%B1%D8%B4-%D8%A7%D9%84%D8%A8%D8%AD%D8%B1/show-79187',
'info_dict': {

View File

@ -86,10 +86,10 @@ class SharedIE(SharedBaseIE):
class VivoIE(SharedBaseIE):
IE_DESC = 'vivo.sx'
_VALID_URL = r'https?://vivo\.sx/(?P<id>[\da-z]{10})'
_VALID_URL = r'https?://vivo\.s[xt]/(?P<id>[\da-z]{10})'
_FILE_NOT_FOUND = '>The file you have requested does not exists or has been removed'
_TEST = {
_TESTS = [{
'url': 'http://vivo.sx/d7ddda0e78',
'md5': '15b3af41be0b4fe01f4df075c2678b2c',
'info_dict': {
@ -98,7 +98,10 @@ class VivoIE(SharedBaseIE):
'title': 'Chicken',
'filesize': 515659,
},
}
}, {
'url': 'http://vivo.st/d7ddda0e78',
'only_matching': True,
}]
def _extract_title(self, webpage):
title = self._html_search_regex(

View File

@ -300,6 +300,13 @@ class VKIE(VKBaseIE):
'only_matching': True,
}]
@staticmethod
def _extract_sibnet_urls(webpage):
# https://help.sibnet.ru/?sibnet_video_embed
return [unescapeHTML(mobj.group('url')) for mobj in re.finditer(
r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1',
webpage)]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
@ -408,6 +415,10 @@ class VKIE(VKBaseIE):
if odnoklassniki_url:
return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
sibnet_urls = self._extract_sibnet_urls(info_page)
if sibnet_urls:
return self.url_result(sibnet_urls[0])
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))