[ted] Fix extraction (closes #13535)

This commit is contained in:
Sergey M․ 2017-07-01 18:39:01 +07:00
parent 54faac2235
commit 4917478803
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed file with 34 additions and 15 deletions

View File

@ -6,7 +6,10 @@ import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import int_or_none
from ..utils import (
int_or_none,
try_get,
)
class TEDIE(InfoExtractor):
@ -113,8 +116,9 @@ class TEDIE(InfoExtractor):
}
def _extract_info(self, webpage):
info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
webpage, 'info json')
info_json = self._search_regex(
r'(?s)q\(\s*"\w+.init"\s*,\s*({.+})\)\s*</script>',
webpage, 'info json')
return json.loads(info_json)
def _real_extract(self, url):
@ -136,11 +140,16 @@ class TEDIE(InfoExtractor):
webpage = self._download_webpage(url, name,
'Downloading playlist webpage')
info = self._extract_info(webpage)
playlist_info = info['playlist']
playlist_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['playlist'],
dict) or info['playlist']
playlist_entries = [
self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
for talk in info['talks']
for talk in try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'],
dict) or info['talks']
]
return self.playlist_result(
playlist_entries,
@ -149,9 +158,14 @@ class TEDIE(InfoExtractor):
def _talk_info(self, url, video_name):
webpage = self._download_webpage(url, video_name)
self.report_extraction(video_name)
talk_info = self._extract_info(webpage)['talks'][0]
info = self._extract_info(webpage)
talk_info = try_get(
info, lambda x: x['__INITIAL_DATA__']['talks'][0],
dict) or info['talks'][0]
title = talk_info['title'].strip()
external = talk_info.get('external')
if external:
@ -165,19 +179,27 @@ class TEDIE(InfoExtractor):
'url': ext_url or external['uri'],
}
native_downloads = try_get(
talk_info, lambda x: x['downloads']['nativeDownloads'],
dict) or talk_info['nativeDownloads']
formats = [{
'url': format_url,
'format_id': format_id,
'format': format_id,
} for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
} for (format_id, format_url) in native_downloads.items() if format_url is not None]
if formats:
for f in formats:
finfo = self._NATIVE_FORMATS.get(f['format_id'])
if finfo:
f.update(finfo)
player_talk = talk_info['player_talks'][0]
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None
for format_id, resources in talk_info['resources'].items():
for format_id, resources in resources_.items():
if format_id == 'h264':
for resource in resources:
h264_url = resource.get('file')
@ -237,14 +259,11 @@ class TEDIE(InfoExtractor):
video_id = compat_str(talk_info['id'])
thumbnail = talk_info['thumb']
if not thumbnail.startswith('http'):
thumbnail = 'http://' + thumbnail
return {
'id': video_id,
'title': talk_info['title'].strip(),
'uploader': talk_info['speaker'],
'thumbnail': thumbnail,
'title': title,
'uploader': player_talk.get('speaker') or talk_info.get('speaker'),
'thumbnail': player_talk.get('thumb') or talk_info.get('thumb'),
'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,