You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

324 lines
13 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import uuid
  4. import xml.etree.ElementTree as etree
  5. import json
  6. import re
  7. from .common import InfoExtractor
  8. from .brightcove import BrightcoveNewIE
  9. from ..compat import (
  10. compat_str,
  11. compat_etree_register_namespace,
  12. )
  13. from ..utils import (
  14. determine_ext,
  15. ExtractorError,
  16. extract_attributes,
  17. int_or_none,
  18. merge_dicts,
  19. parse_duration,
  20. smuggle_url,
  21. try_get,
  22. url_or_none,
  23. xpath_with_ns,
  24. xpath_element,
  25. xpath_text,
  26. )
  27. class ITVIE(InfoExtractor):
  28. _VALID_URL = r'https?://(?:www\.)?itv\.com/hub/[^/]+/(?P<id>[0-9a-zA-Z]+)'
  29. _GEO_COUNTRIES = ['GB']
  30. _TESTS = [{
  31. 'url': 'http://www.itv.com/hub/mr-bean-animated-series/2a2936a0053',
  32. 'info_dict': {
  33. 'id': '2a2936a0053',
  34. 'ext': 'flv',
  35. 'title': 'Home Movie',
  36. },
  37. 'params': {
  38. # rtmp download
  39. 'skip_download': True,
  40. },
  41. }, {
  42. # unavailable via data-playlist-url
  43. 'url': 'https://www.itv.com/hub/through-the-keyhole/2a2271a0033',
  44. 'only_matching': True,
  45. }, {
  46. # InvalidVodcrid
  47. 'url': 'https://www.itv.com/hub/james-martins-saturday-morning/2a5159a0034',
  48. 'only_matching': True,
  49. }, {
  50. # ContentUnavailable
  51. 'url': 'https://www.itv.com/hub/whos-doing-the-dishes/2a2898a0024',
  52. 'only_matching': True,
  53. }]
  54. def _real_extract(self, url):
  55. video_id = self._match_id(url)
  56. webpage = self._download_webpage(url, video_id)
  57. params = extract_attributes(self._search_regex(
  58. r'(?s)(<[^>]+id="video"[^>]*>)', webpage, 'params'))
  59. ns_map = {
  60. 'soapenv': 'http://schemas.xmlsoap.org/soap/envelope/',
  61. 'tem': 'http://tempuri.org/',
  62. 'itv': 'http://schemas.datacontract.org/2004/07/Itv.BB.Mercury.Common.Types',
  63. 'com': 'http://schemas.itv.com/2009/05/Common',
  64. }
  65. for ns, full_ns in ns_map.items():
  66. compat_etree_register_namespace(ns, full_ns)
  67. def _add_ns(name):
  68. return xpath_with_ns(name, ns_map)
  69. def _add_sub_element(element, name):
  70. return etree.SubElement(element, _add_ns(name))
  71. production_id = (
  72. params.get('data-video-autoplay-id')
  73. or '%s#001' % (
  74. params.get('data-video-episode-id')
  75. or video_id.replace('a', '/')))
  76. req_env = etree.Element(_add_ns('soapenv:Envelope'))
  77. _add_sub_element(req_env, 'soapenv:Header')
  78. body = _add_sub_element(req_env, 'soapenv:Body')
  79. get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
  80. request = _add_sub_element(get_playlist, 'tem:request')
  81. _add_sub_element(request, 'itv:ProductionId').text = production_id
  82. _add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
  83. vodcrid = _add_sub_element(request, 'itv:Vodcrid')
  84. _add_sub_element(vodcrid, 'com:Id')
  85. _add_sub_element(request, 'itv:Partition')
  86. user_info = _add_sub_element(get_playlist, 'tem:userInfo')
  87. _add_sub_element(user_info, 'itv:Broadcaster').text = 'Itv'
  88. _add_sub_element(user_info, 'itv:DM')
  89. _add_sub_element(user_info, 'itv:RevenueScienceValue')
  90. _add_sub_element(user_info, 'itv:SessionId')
  91. _add_sub_element(user_info, 'itv:SsoToken')
  92. _add_sub_element(user_info, 'itv:UserToken')
  93. site_info = _add_sub_element(get_playlist, 'tem:siteInfo')
  94. _add_sub_element(site_info, 'itv:AdvertisingRestriction').text = 'None'
  95. _add_sub_element(site_info, 'itv:AdvertisingSite').text = 'ITV'
  96. _add_sub_element(site_info, 'itv:AdvertisingType').text = 'Any'
  97. _add_sub_element(site_info, 'itv:Area').text = 'ITVPLAYER.VIDEO'
  98. _add_sub_element(site_info, 'itv:Category')
  99. _add_sub_element(site_info, 'itv:Platform').text = 'DotCom'
  100. _add_sub_element(site_info, 'itv:Site').text = 'ItvCom'
  101. device_info = _add_sub_element(get_playlist, 'tem:deviceInfo')
  102. _add_sub_element(device_info, 'itv:ScreenSize').text = 'Big'
  103. player_info = _add_sub_element(get_playlist, 'tem:playerInfo')
  104. _add_sub_element(player_info, 'itv:Version').text = '2'
  105. headers = self.geo_verification_headers()
  106. headers.update({
  107. 'Content-Type': 'text/xml; charset=utf-8',
  108. 'SOAPAction': 'http://tempuri.org/PlaylistService/GetPlaylist',
  109. })
  110. info = self._search_json_ld(webpage, video_id, default={})
  111. formats = []
  112. subtitles = {}
  113. def extract_subtitle(sub_url):
  114. ext = determine_ext(sub_url, 'ttml')
  115. subtitles.setdefault('en', []).append({
  116. 'url': sub_url,
  117. 'ext': 'ttml' if ext == 'xml' else ext,
  118. })
  119. resp_env = self._download_xml(
  120. params['data-playlist-url'], video_id,
  121. headers=headers, data=etree.tostring(req_env), fatal=False)
  122. if resp_env:
  123. playlist = xpath_element(resp_env, './/Playlist')
  124. if playlist is None:
  125. fault_code = xpath_text(resp_env, './/faultcode')
  126. fault_string = xpath_text(resp_env, './/faultstring')
  127. if fault_code == 'InvalidGeoRegion':
  128. self.raise_geo_restricted(
  129. msg=fault_string, countries=self._GEO_COUNTRIES)
  130. elif fault_code not in (
  131. 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
  132. raise ExtractorError(
  133. '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
  134. info.update({
  135. 'title': self._og_search_title(webpage),
  136. 'episode_title': params.get('data-video-episode'),
  137. 'series': params.get('data-video-title'),
  138. })
  139. else:
  140. title = xpath_text(playlist, 'EpisodeTitle', default=None)
  141. info.update({
  142. 'title': title,
  143. 'episode_title': title,
  144. 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
  145. 'series': xpath_text(playlist, 'ProgrammeTitle'),
  146. 'duration': parse_duration(xpath_text(playlist, 'Duration')),
  147. })
  148. video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
  149. media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
  150. rtmp_url = media_files.attrib['base']
  151. for media_file in media_files.findall('MediaFile'):
  152. play_path = xpath_text(media_file, 'URL')
  153. if not play_path:
  154. continue
  155. tbr = int_or_none(media_file.get('bitrate'), 1000)
  156. f = {
  157. 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
  158. 'play_path': play_path,
  159. # Providing this swfVfy allows to avoid truncated downloads
  160. 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
  161. 'page_url': url,
  162. 'tbr': tbr,
  163. 'ext': 'flv',
  164. }
  165. app = self._search_regex(
  166. 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
  167. if app:
  168. f.update({
  169. 'url': rtmp_url.split('?', 1)[0],
  170. 'app': app,
  171. })
  172. else:
  173. f['url'] = rtmp_url
  174. formats.append(f)
  175. for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
  176. if caption_url.text:
  177. extract_subtitle(caption_url.text)
  178. ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
  179. hmac = params.get('data-video-hmac')
  180. if ios_playlist_url and hmac and re.match(r'https?://', ios_playlist_url):
  181. headers = self.geo_verification_headers()
  182. headers.update({
  183. 'Accept': 'application/vnd.itv.vod.playlist.v2+json',
  184. 'Content-Type': 'application/json',
  185. 'hmac': hmac.upper(),
  186. })
  187. ios_playlist = self._download_json(
  188. ios_playlist_url, video_id, data=json.dumps({
  189. 'user': {
  190. 'itvUserId': '',
  191. 'entitlements': [],
  192. 'token': ''
  193. },
  194. 'device': {
  195. 'manufacturer': 'Safari',
  196. 'model': '5',
  197. 'os': {
  198. 'name': 'Windows NT',
  199. 'version': '6.1',
  200. 'type': 'desktop'
  201. }
  202. },
  203. 'client': {
  204. 'version': '4.1',
  205. 'id': 'browser'
  206. },
  207. 'variantAvailability': {
  208. 'featureset': {
  209. 'min': ['hls', 'aes', 'outband-webvtt'],
  210. 'max': ['hls', 'aes', 'outband-webvtt']
  211. },
  212. 'platformTag': 'dotcom'
  213. }
  214. }).encode(), headers=headers, fatal=False)
  215. if ios_playlist:
  216. video_data = ios_playlist.get('Playlist', {}).get('Video', {})
  217. ios_base_url = video_data.get('Base')
  218. for media_file in video_data.get('MediaFiles', []):
  219. href = media_file.get('Href')
  220. if not href:
  221. continue
  222. if ios_base_url:
  223. href = ios_base_url + href
  224. ext = determine_ext(href)
  225. if ext == 'm3u8':
  226. formats.extend(self._extract_m3u8_formats(
  227. href, video_id, 'mp4', entry_protocol='m3u8_native',
  228. m3u8_id='hls', fatal=False))
  229. else:
  230. formats.append({
  231. 'url': href,
  232. })
  233. subs = video_data.get('Subtitles')
  234. if isinstance(subs, list):
  235. for sub in subs:
  236. if not isinstance(sub, dict):
  237. continue
  238. href = url_or_none(sub.get('Href'))
  239. if href:
  240. extract_subtitle(href)
  241. if not info.get('duration'):
  242. info['duration'] = parse_duration(video_data.get('Duration'))
  243. self._sort_formats(formats)
  244. info.update({
  245. 'id': video_id,
  246. 'formats': formats,
  247. 'subtitles': subtitles,
  248. })
  249. webpage_info = self._search_json_ld(webpage, video_id, default={})
  250. if not webpage_info.get('title'):
  251. webpage_info['title'] = self._html_search_regex(
  252. r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
  253. webpage, 'title', default=None) or self._og_search_title(
  254. webpage, default=None) or self._html_search_meta(
  255. 'twitter:title', webpage, 'title',
  256. default=None) or webpage_info['episode']
  257. return merge_dicts(info, webpage_info)
  258. class ITVBTCCIE(InfoExtractor):
  259. _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  260. _TEST = {
  261. 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
  262. 'info_dict': {
  263. 'id': 'btcc-2019-brands-hatch-gp-race-action',
  264. 'title': 'BTCC 2019: Brands Hatch GP race action',
  265. },
  266. 'playlist_count': 12,
  267. }
  268. BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
  269. def _real_extract(self, url):
  270. playlist_id = self._match_id(url)
  271. webpage = self._download_webpage(url, playlist_id)
  272. json_map = try_get(self._parse_json(self._html_search_regex(
  273. '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
  274. lambda x: x['props']['pageProps']['article']['body']['content']) or []
  275. # Discard empty objects
  276. video_ids = []
  277. for video in json_map:
  278. if video['data'].get('id'):
  279. video_ids.append(video['data']['id'])
  280. entries = [
  281. self.url_result(
  282. smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
  283. # ITV does not like some GB IP ranges, so here are some
  284. # IP blocks it accepts
  285. 'geo_ip_blocks': [
  286. '193.113.0.0/16', '54.36.162.0/23', '159.65.16.0/21'
  287. ],
  288. 'referrer': url,
  289. }),
  290. ie=BrightcoveNewIE.ie_key(), video_id=video_id)
  291. for video_id in video_ids]
  292. title = self._og_search_title(webpage, fatal=False)
  293. return self.playlist_result(entries, playlist_id, title)