[vice] Fix extraction and rework extractors (closes #11101, closes #13019, closes #13622, closes #13778)

This commit is contained in:
Sergey M․ 2018-03-03 23:07:29 +07:00
parent c01db237b5
commit 86c8cfc555
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
4 changed files with 182 additions and 149 deletions

View File

@ -1210,7 +1210,6 @@ from .vice import (
ViceArticleIE, ViceArticleIE,
ViceShowIE, ViceShowIE,
) )
from .viceland import VicelandIE
from .vidbit import VidbitIE from .vidbit import VidbitIE
from .viddler import ViddlerIE from .viddler import ViddlerIE
from .videa import VideaIE from .videa import VideaIE

View File

@ -103,6 +103,7 @@ from .vshare import VShareIE
from .mediasite import MediasiteIE from .mediasite import MediasiteIE
from .springboardplatform import SpringboardPlatformIE from .springboardplatform import SpringboardPlatformIE
from .yapfiles import YapFilesIE from .yapfiles import YapFilesIE
from .vice import ViceIE
class GenericIE(InfoExtractor): class GenericIE(InfoExtractor):
@ -2965,6 +2966,11 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches( return self.playlist_from_matches(
yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key())
vice_urls = ViceIE._extract_urls(webpage)
if vice_urls:
return self.playlist_from_matches(
vice_urls, video_id, video_title, ie=ViceIE.ie_key())
def merge_dicts(dict1, dict2): def merge_dicts(dict1, dict2):
merged = {} merged = {}
for k, v in dict1.items(): for k, v in dict1.items():

View File

@ -5,113 +5,52 @@ import re
import time import time
import hashlib import hashlib
import json import json
import random
from .adobepass import AdobePassIE from .adobepass import AdobePassIE
from .youtube import YoutubeIE from .youtube import YoutubeIE
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_HTTPError from ..compat import (
compat_HTTPError,
compat_str,
)
from ..utils import ( from ..utils import (
ExtractorError,
int_or_none, int_or_none,
parse_age_limit, parse_age_limit,
str_or_none, str_or_none,
parse_duration, try_get,
ExtractorError,
extract_attributes,
) )
class ViceBaseIE(AdobePassIE): class ViceIE(AdobePassIE):
def _extract_preplay_video(self, url, locale, webpage):
watch_hub_data = extract_attributes(self._search_regex(
r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
video_id = watch_hub_data['vms-id']
title = watch_hub_data['video-title']
query = {}
is_locked = watch_hub_data.get('video-locked') == '1'
if is_locked:
resource = self._get_mvpd_resource(
'VICELAND', title, video_id,
watch_hub_data.get('video-rating'))
query['tvetoken'] = self._extract_mvpd_auth(
url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in
# https://www.viceland.com/assets/common/js/web.vendor.bundle.js
exp = int(time.time()) + 14400
query.update({
'exp': exp,
'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
})
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
preplay = self._download_json(
'https://%s.com/%s/preplay/%s' % (host, locale, video_id),
video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
error = json.loads(e.cause.read().decode())
raise ExtractorError('%s said: %s' % (
self.IE_NAME, error['details']), expected=True)
raise
video_data = preplay['video']
base = video_data['base']
uplynk_preplay_url = preplay['preplayURL']
episode = video_data.get('episode', {})
channel = video_data.get('channel', {})
subtitles = {}
cc_url = preplay.get('ccURL')
if cc_url:
subtitles['en'] = [{
'url': cc_url,
}]
return {
'_type': 'url_transparent',
'url': uplynk_preplay_url,
'id': video_id,
'title': title,
'description': base.get('body') or base.get('display_body'),
'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'),
'duration': int_or_none(video_data.get('video_duration')) or parse_duration(watch_hub_data.get('video-duration')),
'timestamp': int_or_none(video_data.get('created_at'), 1000),
'age_limit': parse_age_limit(video_data.get('video_rating')),
'series': video_data.get('show_title') or watch_hub_data.get('show-title'),
'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')),
'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
'season_number': int_or_none(watch_hub_data.get('season')),
'season_id': str_or_none(episode.get('season_id')),
'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'),
'uploader_id': str_or_none(channel.get('id')),
'subtitles': subtitles,
'ie_key': 'UplynkPreplay',
}
class ViceIE(ViceBaseIE):
IE_NAME = 'vice' IE_NAME = 'vice'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)' _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)'
_TESTS = [{ _TESTS = [{
'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7',
'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
'info_dict': { 'info_dict': {
'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', 'id': '5e647f0125e145c9aef2069412c0cbde',
'ext': 'flv', 'ext': 'mp4',
'title': 'Monkey Labs of Holland', 'title': '10 Questions You Always Wanted To Ask: Pet Cremator',
'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1489664942,
'upload_date': '20170316',
'age_limit': 14,
}, },
'add_ie': ['Ooyala'], 'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
}, { }, {
# geo restricted to US
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
'info_dict': { 'info_dict': {
'id': '5816510690b70e6c5fd39a56', 'id': '930c0ad1f47141cc955087eecaddb0e2',
'ext': 'mp4', 'ext': 'mp4',
'uploader': 'Waypoint', 'uploader': 'waypoint',
'title': 'The Signal From Tölva', 'title': 'The Signal From Tölva',
'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9', 'uploader_id': '57f7d621e05ca860fa9ccaf9',
@ -139,27 +78,131 @@ class ViceIE(ViceBaseIE):
'params': { 'params': {
# AES-encrypted m3u8 # AES-encrypted m3u8
'skip_download': True, 'skip_download': True,
'proxy': '127.0.0.1:8118',
}, },
'add_ie': ['UplynkPreplay'], 'add_ie': ['UplynkPreplay'],
}, { }, {
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060',
'only_matching': True,
}, {
'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7',
'only_matching': True,
}, {
'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1',
'only_matching': True,
}] }]
_PREPLAY_HOST = 'video.vice' _PREPLAY_HOST = 'vms.vice'
@staticmethod
def _extract_urls(webpage):
return re.findall(
r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)',
webpage)
@staticmethod
def _extract_url(webpage):
urls = ViceIE._extract_urls(webpage)
return urls[0] if urls else None
def _real_extract(self, url): def _real_extract(self, url):
locale, video_id = re.match(self._VALID_URL, url).groups() locale, video_id = re.match(self._VALID_URL, url).groups()
webpage, urlh = self._download_webpage_handle(url, video_id)
embed_code = self._search_regex( webpage = self._download_webpage(
r'embedCode=([^&\'"]+)', webpage, 'https://video.vice.com/%s/embed/%s' % (locale, video_id),
'ooyala embed code', default=None) video_id)
if embed_code:
return self.url_result('ooyala:%s' % embed_code, 'Ooyala') video = self._parse_json(
youtube_id = self._search_regex( self._search_regex(
r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage,
if youtube_id: 'app state'), video_id)['video']
return self.url_result(youtube_id, 'Youtube') video_id = video.get('vms_id') or video.get('id') or video_id
return self._extract_preplay_video(urlh.geturl(), locale, webpage) title = video['title']
is_locked = video.get('locked')
rating = video.get('rating')
thumbnail = video.get('thumbnail_url')
duration = int_or_none(video.get('duration'))
series = try_get(
video, lambda x: x['episode']['season']['show']['title'],
compat_str)
episode_number = try_get(
video, lambda x: x['episode']['episode_number'])
season_number = try_get(
video, lambda x: x['episode']['season']['season_number'])
uploader = None
query = {}
if is_locked:
resource = self._get_mvpd_resource(
'VICELAND', title, video_id, rating)
query['tvetoken'] = self._extract_mvpd_auth(
url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in
# https://www.viceland.com/assets/common/js/web.vendor.bundle.js
# new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js
exp = int(time.time()) + 1440
query.update({
'exp': exp,
'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(),
'_ad_blocked': None,
'_ad_unit': '',
'_debug': '',
'platform': 'desktop',
'rn': random.randint(10000, 100000),
'fbprebidtoken': '',
})
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
preplay = self._download_json(
'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id),
video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401):
error = json.loads(e.cause.read().decode())
error_message = error.get('error_description') or error['details']
raise ExtractorError('%s said: %s' % (
self.IE_NAME, error_message), expected=True)
raise
video_data = preplay['video']
base = video_data['base']
uplynk_preplay_url = preplay['preplayURL']
episode = video_data.get('episode', {})
channel = video_data.get('channel', {})
subtitles = {}
cc_url = preplay.get('ccURL')
if cc_url:
subtitles['en'] = [{
'url': cc_url,
}]
return {
'_type': 'url_transparent',
'url': uplynk_preplay_url,
'id': video_id,
'title': title,
'description': base.get('body') or base.get('display_body'),
'thumbnail': thumbnail,
'duration': int_or_none(video_data.get('video_duration')) or duration,
'timestamp': int_or_none(video_data.get('created_at'), 1000),
'age_limit': parse_age_limit(video_data.get('video_rating')),
'series': video_data.get('show_title') or series,
'episode_number': int_or_none(episode.get('episode_number') or episode_number),
'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')),
'season_number': int_or_none(season_number),
'season_id': str_or_none(episode.get('season_id')),
'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader,
'uploader_id': str_or_none(channel.get('id')),
'subtitles': subtitles,
'ie_key': 'UplynkPreplay',
}
class ViceShowIE(InfoExtractor): class ViceShowIE(InfoExtractor):
@ -203,14 +246,15 @@ class ViceArticleIE(InfoExtractor):
_TESTS = [{ _TESTS = [{
'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
'info_dict': { 'info_dict': {
'id': '58dc0a3dee202d2a0ccfcbd8', 'id': '41eae2a47b174a1398357cec55f1f6fc',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Mormon War on Porn ', 'title': 'Mormon War on Porn ',
'description': 'md5:ad396a2481e7f8afb5ed486878421090', 'description': 'md5:6394a8398506581d0346b9ab89093fef',
'uploader': 'VICE', 'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c693', 'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1489160690, 'timestamp': 1491883129,
'upload_date': '20170310', 'upload_date': '20170411',
'age_limit': 17,
}, },
'params': { 'params': {
# AES-encrypted m3u8 # AES-encrypted m3u8
@ -219,17 +263,35 @@ class ViceArticleIE(InfoExtractor):
'add_ie': ['UplynkPreplay'], 'add_ie': ['UplynkPreplay'],
}, { }, {
'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', 'md5': '7fe8ebc4fa3323efafc127b82bd821d9',
'info_dict': { 'info_dict': {
'id': '3jstaBeXgAs', 'id': '3jstaBeXgAs',
'ext': 'mp4', 'ext': 'mp4',
'title': 'How to Hack a Car: Phreaked Out (Episode 2)', 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
'uploader_id': 'MotherboardTV',
'uploader': 'Motherboard', 'uploader': 'Motherboard',
'uploader_id': 'MotherboardTV',
'upload_date': '20140529', 'upload_date': '20140529',
}, },
'add_ie': ['Youtube'], 'add_ie': ['Youtube'],
}, {
'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded',
'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
'info_dict': {
'id': 'e2ed435eb67e43efb66e6ef9a6930a88',
'ext': 'mp4',
'title': "Making The World's First Male Sex Doll",
'description': 'md5:916078ef0e032d76343116208b6cc2c4',
'uploader': 'vice',
'uploader_id': '57a204088cb727dec794c67b',
'timestamp': 1476919911,
'upload_date': '20161019',
'age_limit': 17,
},
'params': {
'skip_download': True,
},
'add_ie': [ViceIE.ie_key()],
}, { }, {
'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
'only_matching': True, 'only_matching': True,
@ -244,8 +306,8 @@ class ViceArticleIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
prefetch_data = self._parse_json(self._search_regex( prefetch_data = self._parse_json(self._search_regex(
r'window\.__PREFETCH_DATA\s*=\s*({.*});', r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n',
webpage, 'prefetch data'), display_id) webpage, 'app state'), display_id)['pageData']
body = prefetch_data['body'] body = prefetch_data['body']
def _url_res(video_url, ie_key): def _url_res(video_url, ie_key):
@ -256,6 +318,10 @@ class ViceArticleIE(InfoExtractor):
'ie_key': ie_key, 'ie_key': ie_key,
} }
vice_url = ViceIE._extract_url(webpage)
if vice_url:
return _url_res(vice_url, ViceIE.ie_key())
embed_code = self._search_regex( embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', body, r'embedCode=([^&\'"]+)', body,
'ooyala embed code', default=None) 'ooyala embed code', default=None)

View File

@ -1,38 +0,0 @@
# coding: utf-8
from __future__ import unicode_literals
import re
from .vice import ViceBaseIE
class VicelandIE(ViceBaseIE):
_VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
_TEST = {
'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
'info_dict': {
'id': '588a70d0dba8a16007de7316',
'ext': 'mp4',
'title': 'TRAPPED (Series Trailer)',
'description': 'md5:7a8e95c2b6cd86461502a2845e581ccf',
'age_limit': 14,
'timestamp': 1485474122,
'upload_date': '20170126',
'uploader_id': '57a204098cb727dec794c6a3',
'uploader': 'Viceland',
},
'params': {
# m3u8 download
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
'skip': '404',
}
_PREPLAY_HOST = 'www.viceland'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
locale = mobj.group('locale')
webpage = self._download_webpage(url, video_id)
return self._extract_preplay_video(url, locale, webpage)