Merge 'ytdl-org/youtube-dl/master' release 2020.11.19

Old Extractors left behind:
	VLivePlaylistIE
	YoutubeSearchURLIE
	YoutubeShowIE
	YoutubeFavouritesIE

If removing old extractors, make corresponding changes in
	docs/supportedsites.md
	youtube_dlc/extractor/extractors.py

Not merged:
	.github/ISSUE_TEMPLATE/1_broken_site.md
	.github/ISSUE_TEMPLATE/2_site_support_request.md
	.github/ISSUE_TEMPLATE/3_site_feature_request.md
	.github/ISSUE_TEMPLATE/4_bug_report.md
	.github/ISSUE_TEMPLATE/5_feature_request.md
	test/test_all_urls.py
	youtube_dlc/version.py
	Changelog
commit 8bdd16b499 by pukkandan (pull/245/head)
34 changed files with 1828 additions and 1695 deletions
  1. +1 -1 devscripts/make_lazy_extractors.py
  2. +6 -8 docs/supportedsites.md
  3. +12 -14 test/test_all_urls.py
  4. +28 -0 test/test_utils.py
  5. +1 -1 youtube_dlc/extractor/afreecatv.py
  6. +110 -57 youtube_dlc/extractor/arte.py
  7. +111 -157 youtube_dlc/extractor/bandcamp.py
  8. +12 -7 youtube_dlc/extractor/cnbc.py
  9. +3 -2 youtube_dlc/extractor/common.py
  10. +23 -4 youtube_dlc/extractor/condenast.py
  11. +5 -6 youtube_dlc/extractor/extractors.py
  12. +36 -11 youtube_dlc/extractor/francetv.py
  13. +4 -5 youtube_dlc/extractor/generic.py
  14. +1 -1 youtube_dlc/extractor/iqiyi.py
  15. +88 -0 youtube_dlc/extractor/lbry.py
  16. +36 -55 youtube_dlc/extractor/lrt.py
  17. +46 -14 youtube_dlc/extractor/malltv.py
  18. +7 -3 youtube_dlc/extractor/mgtv.py
  19. +12 -0 youtube_dlc/extractor/mtv.py
  20. +2 -3 youtube_dlc/extractor/nbc.py
  21. +38 -0 youtube_dlc/extractor/ndr.py
  22. +95 -62 youtube_dlc/extractor/rai.py
  23. +95 -16 youtube_dlc/extractor/servus.py
  24. +24 -129 youtube_dlc/extractor/spiegel.py
  25. +7 -4 youtube_dlc/extractor/twentythreevideo.py
  26. +51 -26 youtube_dlc/extractor/urplay.py
  27. +16 -66 youtube_dlc/extractor/usanetwork.py
  28. +5 -2 youtube_dlc/extractor/ustream.py
  29. +7 -4 youtube_dlc/extractor/vimeo.py
  30. +111 -174 youtube_dlc/extractor/vlive.py
  31. +11 -20 youtube_dlc/extractor/xtube.py
  32. +3 -4 youtube_dlc/extractor/youporn.py
  33. +812 -831 youtube_dlc/extractor/youtube.py
  34. +9 -8 youtube_dlc/utils.py

+1 -1 devscripts/make_lazy_extractors.py

@@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s


# find the correct sorting and add the required base classes so that sublcasses
# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []


+6 -8 docs/supportedsites.md

@@ -59,9 +59,9 @@
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- **arte.tv:+7**
- **arte.tv:embed**
- **arte.tv:playlist**
- **ArteTV**
- **ArteTVEmbed**
- **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
@@ -424,6 +424,7 @@
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
- **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@@ -835,8 +836,6 @@
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- **Spiegel:Article**: Articles on spiegel.de
- **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
@@ -1147,19 +1146,18 @@
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- **youtube:channel**: YouTube.com channels
- **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
- **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- **youtube:search**: YouTube.com searches
- **youtube:search:date**: YouTube.com searches, newest videos first
- **youtube:search_url**: YouTube.com search URLs
- **youtube:show**: YouTube.com (multi-season) shows
- **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **YoutubeYtUser**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **Zapiks**
- **Zaq1**


+12 -14 test/test_all_urls.py

@@ -31,15 +31,17 @@ class TestAllURLsMatching(unittest.TestCase):

def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('PL63F0C78739B09958')
assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')

def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
@@ -50,26 +52,22 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])

def test_youtube_channel_matching(self):
assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')

def test_youtube_user_matching(self):
self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
# def test_youtube_user_matching(self):
# self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])

def test_youtube_feeds(self):
self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])

def test_youtube_show_matching(self):
self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])

def test_youtube_search_matching(self):
self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
# def test_youtube_search_matching(self):
# self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
# self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])

def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)


+28 -0 test/test_utils.py

@@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')

# Just drop ! prefix for now though this results in a wrong value
on = js_to_json('''{
a: !0,
b: !1,
c: !!0,
d: !!42.42,
e: !!![],
f: !"abc",
g: !"",
!42: 42
}''')
self.assertEqual(json.loads(on), {
'a': 0,
'b': 1,
'c': 0,
'd': 42.42,
'e': [],
'f': "abc",
'g': "",
'42': 42
})

on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])

@@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})

on = js_to_json('{ "0x40": "0x40" }')
self.assertEqual(json.loads(on), {'0x40': '0x40'})

on = js_to_json('{ "040": "040" }')
self.assertEqual(json.loads(on), {'040': '040'})

def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')


+1 -1 youtube_dlc/extractor/afreecatv.py

@@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
'Video %s video does not exist' % video_id, expected=True)
'Video %s does not exist' % video_id, expected=True)

video_url = video_element.text.strip()



+110 -57 youtube_dlc/extractor/arte.py

@@ -4,23 +4,57 @@ from __future__ import unicode_literals
import re

from .common import InfoExtractor
from ..compat import compat_str
from ..compat import (
compat_str,
compat_urlparse,
)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
url_or_none,
)

# There are different sources of video in arte.tv, the extraction process
# is different for each one. The videos usually expire in 7 days, so we can't
# add tests.


class ArteTVBaseIE(InfoExtractor):
def _extract_from_json_url(self, json_url, video_id, lang, title=None):
info = self._download_json(json_url, video_id)
_ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
_API_BASE = 'https://api.arte.tv/api/player/v1'


class ArteTVIE(ArteTVBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:
(?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
)
/(?P<id>\d{6}-\d{3}-[AF])
''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'info_dict': {
'id': '088501-000-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
},
}, {
'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
'only_matching': True,
}, {
'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
lang = mobj.group('lang') or mobj.group('lang_2')

info = self._download_json(
'%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']

vsr = try_get(player_info, lambda x: x['VSR'], dict)
@@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]

title = (player_info.get('VTI') or title or player_info['VID']).strip()
title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle

info_dict = {
'id': player_info['VID'],
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
}
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])

LANGS = {
@@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
format_url = url_or_none(f.get('url'))
streamer = f.get('streamer')
if not format_url and not streamer:
continue
versionCode = f.get('versionCode')
l = re.escape(langcode)

@@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):
else:
lang_pref = -1

media_type = f.get('mediaType')
if media_type == 'hls':
m3u8_formats = self._extract_m3u8_formats(
format_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=format_id, fatal=False)
for m3u8_format in m3u8_formats:
m3u8_format['language_preference'] = lang_pref
formats.extend(m3u8_formats)
continue

format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):
'quality': qfunc(f.get('quality')),
}

if f.get('mediaType') == 'rtmp':
if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
@@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):

formats.append(format)

self._check_formats(formats, video_id)
self._sort_formats(formats)

info_dict['formats'] = formats
return info_dict

return {
'id': player_info.get('VID') or video_id,
'title': title,
'description': player_info.get('VDE'),
'upload_date': unified_strdate(upload_date_str),
'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
'formats': formats,
}

class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'

class ArteTVEmbedIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
'id': '088501-000-A',
'id': '100605-013-A',
'ext': 'mp4',
'title': 'Mexico: Stealing Petrol to Survive',
'upload_date': '20190628',
'title': 'United we Stream November Lockdown Edition #13',
'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
'upload_date': '20201116',
},
}, {
'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
'only_matching': True,
}]

def _real_extract(self, url):
lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(
'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
video_id, lang)


class ArteTVEmbedIE(ArteTVPlus7IE):
IE_NAME = 'arte.tv:embed'
_VALID_URL = r'''(?x)
https://www\.arte\.tv
/player/v3/index\.php\?json_url=
(?P<json_url>
https?://api\.arte\.tv/api/player/v1/config/
(?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
)
'''

_TESTS = []
@staticmethod
def _extract_urls(webpage):
return [url for _, url in re.findall(
r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
webpage)]

def _real_extract(self, url):
json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
return self._extract_from_json_url(json_url, video_id, lang)
qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
json_url = qs['json_url'][0]
video_id = ArteTVIE._match_id(json_url)
return self.url_result(
json_url, ie=ArteTVIE.ie_key(), video_id=video_id)


class ArteTVPlaylistIE(ArteTVBaseIE):
IE_NAME = 'arte.tv:playlist'
_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'

_VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
@@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
}, {
'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
'only_matching': True,
}]

def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
% (lang, playlist_id), playlist_id)
'%s/collectionData/%s/%s?source=videos'
% (self._API_BASE, lang, playlist_id), playlist_id)
entries = []
for video in collection['videos']:
if not isinstance(video, dict):
continue
video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
if not video_url:
continue
video_id = video.get('programId')
entries.append({
'_type': 'url_transparent',
'url': video_url,
'id': video_id,
'title': video.get('title'),
'alt_title': video.get('subtitle'),
'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
'duration': int_or_none(video.get('durationSeconds')),
'view_count': int_or_none(video.get('views')),
'ie_key': ArteTVIE.ie_key(),
})
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
entries = [
self._extract_from_json_url(
video['jsonUrl'], video.get('programId') or playlist_id, lang)
for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)

+111 -157 youtube_dlc/extractor/bandcamp.py

@@ -1,3 +1,4 @@
# coding: utf-8
from __future__ import unicode_literals

import random
@@ -5,10 +6,7 @@ import re
import time

from .common import InfoExtractor
from ..compat import (
compat_str,
compat_urlparse,
)
from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@@ -17,71 +15,32 @@ from ..utils import (
parse_filesize,
str_or_none,
try_get,
unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
urljoin,
)


class BandcampBaseIE(InfoExtractor):
"""Provide base functions for Bandcamp extractors"""

def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id):
json_string = self._html_search_regex(
r' data-%s="([^"]*)' % suffix,
webpage, '%s json' % suffix, default='{}')

return self._parse_json(json_string, video_id)

def _parse_json_track(self, json):
formats = []
file_ = json.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})

return {
'duration': float_or_none(json.get('duration')),
'id': str_or_none(json.get('track_id') or json.get('id')),
'title': json.get('title'),
'title_link': json.get('title_link'),
'number': int_or_none(json.get('track_num')),
'formats': formats
}


class BandcampIE(BandcampBaseIE):
IE_NAME = "Bandcamp:track"
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
class BandcampIE(InfoExtractor):
_VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
'timestamp': 1354224127,
'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):
},
}]

def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
return self._parse_json(self._html_search_regex(
r'data-%s=(["\'])({.+?})\1' % attr, webpage,
attr + ' data', group=2), video_id, fatal=fatal)

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
url_track_title = title
title = self._match_id(url)
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)

json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title)

json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract track')

track = self._parse_json_track(json_tracks[0])
artist = json_tralbum.get('artist')
album_title = json_embed.get('album_title')

json_album = json_tralbum.get('packages')
if json_album:
json_album = json_album[0]
album_publish_date = json_album.get('album_publish_date')
album_release_date = json_album.get('album_release_date')
else:
album_publish_date = None
album_release_date = json_tralbum.get('album_release_date')

timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date)
release_date = unified_strdate(album_release_date)

download_link = self._search_regex(
r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
'download link', default=None, group='url')
tralbum = self._extract_data_attr(webpage, title)
thumbnail = self._og_search_thumbnail(webpage)

track_id = None
track = None
track_number = None
duration = None

formats = []
track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
for format_id, format_url in file_.items():
if not url_or_none(format_url):
continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'url': self._proto_relative_url(format_url, 'http:'),
'ext': ext,
'vcodec': 'none',
'acodec': ext,
'abr': int_or_none(abr_str),
})
track = track_info.get('title')
track_id = str_or_none(
track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))

embed = self._extract_data_attr(webpage, title, 'embed', False)
current = tralbum.get('current') or {}
artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
current.get('publish_date') or tralbum.get('album_publish_date'))

download_link = tralbum.get('freeDownloadPage')
if download_link:
track_id = self._search_regex(
r'\?id=(?P<id>\d+)&',
download_link, 'track id')
track_id = compat_str(tralbum['id'])

download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')

blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
'blob', group='blob'),
track_id, transform_source=unescapeHTML)
blob = self._extract_data_attr(download_webpage, track_id, 'blob')

info = try_get(
blob, (lambda x: x['digital_items'][0],
@@ -173,6 +139,8 @@ class BandcampIE(BandcampBaseIE):
if info:
downloads = info.get('downloads')
if isinstance(downloads, dict):
if not track:
track = info.get('title')
if not artist:
artist = info.get('artist')
if not thumbnail:
@@ -206,7 +174,7 @@ class BandcampIE(BandcampBaseIE):
retry_url = url_or_none(stat.get('retry_url'))
if not retry_url:
continue
track['formats'].append({
formats.append({
'url': self._proto_relative_url(retry_url, 'http:'),
'ext': download_formats.get(format_id),
'format_id': format_id,
@@ -215,30 +183,34 @@ class BandcampIE(BandcampBaseIE):
'vcodec': 'none',
})

self._sort_formats(track['formats'])
self._sort_formats(formats)

title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title')
title = '%s - %s' % (artist, track) if artist else track

if not duration:
duration = float_or_none(self._html_search_meta(
'duration', webpage, default=None))

return {
'album': album_title,
'artist': artist,
'duration': track['duration'],
'formats': track['formats'],
'id': track['id'],
'release_date': release_date,
'id': track_id,
'title': title,
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
'title': title,
'track': track['title'],
'track_id': track['id'],
'track_number': track['number'],
'uploader': artist
'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
'album': embed.get('album_title'),
'formats': formats,
}


class BandcampAlbumIE(BandcampBaseIE):
class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
_VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'

_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
'title': 'Intro',
'title': 'Blazo - Intro',
'timestamp': 1311756226,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
{
@@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
'title': 'Kero One - Keep It Alive (Blazo remix)',
'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
'timestamp': 1311757238,
'upload_date': '20110727',
'uploader': 'Blazo',
}
},
],
@@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@@ -312,41 +292,34 @@ class BandcampAlbumIE(BandcampBaseIE):
else super(BandcampAlbumIE, cls).suitable(url))

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader_id = mobj.group('subdomain')
album_id = mobj.group('album_id')
uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)

json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id)
json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id)

json_tracks = json_tralbum.get('trackinfo')
if not json_tracks:
raise ExtractorError('Could not extract album tracks')

album_title = json_embed.get('album_title')

tralbum = self._extract_data_attr(webpage, playlist_id)
track_info = tralbum.get('trackinfo')
if not track_info:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
tracks = [self._parse_json_track(track) for track in json_tracks]
entries = [
self.url_result(
compat_urlparse.urljoin(url, track['title_link']),
ie=BandcampIE.ie_key(), video_id=track['id'],
video_title=track['title'])
for track in tracks
if track.get('duration')]
urljoin(url, t['title_link']), BandcampIE.ie_key(),
str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
for t in track_info
if t.get('duration')]

current = tralbum.get('current') or {}

return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
'title': album_title,
'entries': entries
'title': current.get('title'),
'description': current.get('about'),
'entries': entries,
}


class BandcampWeeklyIE(InfoExtractor):
class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
'episode_number': 208,
'episode_id': '224',
}
},
'params': {
'format': 'opus-lo',
},
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]

def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)

blob = self._parse_json(
self._search_regex(
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
'blob', group='blob'),
video_id, transform_source=unescapeHTML)
show_id = self._match_id(url)
webpage = self._download_webpage(url, show_id)

show = blob['bcw_show']
blob = self._extract_data_attr(webpage, show_id, 'blob')

# This is desired because any invalid show id redirects to `bandcamp.com`
# which happens to expose the latest Bandcamp Weekly episode.
show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
show = blob['bcw_data'][show_id]

formats = []
for format_id, format_url in show['audio_stream'].items():
@@ -408,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):
if subtitle:
title += ' - %s' % subtitle

episode_number = None
seq = blob.get('bcw_seq')

if seq and isinstance(seq, list):
try:
episode_number = next(
int_or_none(e.get('episode_number'))
for e in seq
if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
except StopIteration:
pass

return {
'id': video_id,
'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
@@ -429,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
'episode_number': episode_number,
'episode_id': compat_str(video_id),
'episode_id': show_id,
'formats': formats
}

+12 -7 youtube_dlc/extractor/cnbc.py

@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import smuggle_url
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):


class CNBCVideoIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
_VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
}

def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
'video id')
path, display_id = re.match(self._VALID_URL, url).groups()
video_id = self._download_json(
'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
'query': '''{
page(path: "%s") {
vcpsId
}
}''' % path,
})['data']['page']['vcpsId']
return self.url_result(
'http://video.cnbc.com/gallery/?video=%s' % video_id,
'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
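
The rewritten CNBCVideoIE resolves the numeric video id with a single GraphQL lookup instead of scraping content_id out of the page HTML. A minimal stdlib-only sketch of that lookup, assuming only the endpoint and query shape shown in the diff (the helper name is made up):

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    def cnbc_vcps_id(path):
        # GET /graphql with the query passed as a URL parameter, as the extractor does
        query = '{ page(path: "%s") { vcpsId } }' % path
        url = 'https://webql-redesign.cnbcfm.com/graphql?' + urlencode({'query': query})
        with urlopen(url) as resp:
            return json.load(resp)['data']['page']['vcpsId']

    # e.g. cnbc_vcps_id('/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html')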

+3 -2 youtube_dlc/extractor/common.py

@@ -1456,9 +1456,10 @@ class InfoExtractor(object):
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
except ExtractorError:
except ExtractorError as e:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, item))
'%s: %s URL is invalid, skipping: %s'
% (video_id, item, error_to_compat_str(e.cause)))
return False

def http_scheme(self):


+23 -4 youtube_dlc/extractor/condenast.py

@@ -16,6 +16,8 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
strip_or_none,
try_get,
)


@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
'timestamp': 1442434955,
'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)

subtitles = {}
for t, caption in video_info.get('captions', {}).items():
caption_url = caption.get('src')
if not (t in ('vtt', 'srt', 'tml') and caption_url):
continue
subtitles.setdefault('en', []).append({'url': caption_url})

return {
'id': video_id,
'formats': formats,
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
'subtitles': subtitles,
}

def _real_extract(self, url):
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
video = try_get(self._parse_json(self._search_regex(
r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
'preload state', '{}'), display_id),
lambda x: x['transformed']['video'])
if video:
params = {'videoId': video['id']}
info = {'description': strip_or_none(video.get('description'))}
else:
params = self._extract_video_params(webpage, display_id)
info = self._search_json_ld(
webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info

+5 -6 youtube_dlc/extractor/extractors.py

@@ -62,7 +62,7 @@ from .ard import (
ARDMediathekIE,
)
from .arte import (
ArteTVPlus7IE,
ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
@@ -542,6 +542,7 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
@@ -1079,8 +1080,7 @@ from .spankbang import (
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE, SpiegelArticleIE
from .spiegeltv import SpiegeltvIE
from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
@@ -1505,12 +1505,11 @@ from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
YoutubeChannelIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
YoutubeLiveIE,
YoutubeTabIE,
YoutubePlaylistIE,
YoutubePlaylistsIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
@@ -1519,7 +1518,7 @@ from .youtube import (
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
YoutubeUserIE,
YoutubeYtUserIE,
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE


+36 -11 youtube_dlc/extractor/francetv.py

@@ -17,6 +17,7 @@ from ..utils import (
parse_duration,
try_get,
url_or_none,
urljoin,
)
from .dailymotion import DailymotionIE

@@ -128,18 +129,38 @@ class FranceTVIE(InfoExtractor):

is_live = None

formats = []
for video in info['videos']:
if video['statut'] != 'ONLINE':
videos = []

for video in (info.get('videos') or []):
if video.get('statut') != 'ONLINE':
continue
video_url = video['url']
if not video.get('url'):
continue
videos.append(video)

if not videos:
for device_type in ['desktop', 'mobile']:
fallback_info = self._download_json(
'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
video_id, 'Downloading fallback %s video JSON' % device_type, query={
'device_type': device_type,
'browser': 'chrome',
}, fatal=False)

if fallback_info and fallback_info.get('video'):
videos.append(fallback_info['video'])

formats = []
for video in videos:
video_url = video.get('url')
if not video_url:
continue
if is_live is None:
is_live = (try_get(
video, lambda x: x['plages_ouverture'][0]['direct'],
bool) is True) or '/live.francetv.fr/' in video_url
format_id = video['format']
video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
or video.get('is_live') is True
or '/live.francetv.fr/' in video_url)
format_id = video.get('format')
ext = determine_ext(video_url)
if ext == 'f4m':
if georestricted:
@@ -154,6 +175,9 @@ class FranceTVIE(InfoExtractor):
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -166,6 +190,7 @@ class FranceTVIE(InfoExtractor):
'url': video_url,
'format_id': format_id,
})

self._sort_formats(formats)

title = info['titre']
@@ -185,10 +210,10 @@ class FranceTVIE(InfoExtractor):
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(info['synopsis']),
'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
'timestamp': int_or_none(info['diffusion']['timestamp']),
'description': clean_html(info.get('synopsis')),
'thumbnail': urljoin('http://pluzz.francetv.fr', info.get('image')),
'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,


+4 -5 youtube_dlc/extractor/generic.py

@@ -91,6 +91,7 @@ from .piksel import PikselIE
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
@@ -2760,11 +2761,9 @@ class GenericIE(InfoExtractor):
return self.url_result(ustream_url, UstreamIE.ie_key())

# Look for embedded arte.tv player
mobj = re.search(
r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'ArteTVEmbed')
arte_urls = ArteTVEmbedIE._extract_urls(webpage)
if arte_urls:
return self.playlist_from_matches(arte_urls, video_id, video_title)

# Look for embedded francetv player
mobj = re.search(


+1 -1 youtube_dlc/extractor/iqiyi.py

@@ -150,7 +150,7 @@ class IqiyiSDKInterpreter(object):
elif function in other_functions:
other_functions[function]()
else:
raise ExtractorError('Unknown funcion %s' % function)
raise ExtractorError('Unknown function %s' % function)

return sdk.target



+88 -0 youtube_dlc/extractor/lbry.py

@@ -0,0 +1,88 @@
# coding: utf-8
from __future__ import unicode_literals

import json

from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
mimetype2ext,
try_get,
)


class LBRYIE(InfoExtractor):
IE_NAME = 'lbry.tv'
_VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[0-9a-zA-Z-]+:[0-9a-z]+/[0-9a-zA-Z().-]+:[0-9a-z])'
_TESTS = [{
# Video
'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
'info_dict': {
'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
'ext': 'mp4',
'title': 'First day in LBRY? Start HERE!',
'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
'timestamp': 1595694354,
'upload_date': '20200725',
}
}, {
# Audio
'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
'info_dict': {
'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
'ext': 'mp3',
'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
'description': 'md5:661ac4f1db09f31728931d7b88807a61',
'timestamp': 1591312601,
'upload_date': '20200604',
}
}, {
'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
'only_matching': True,
}]

def _call_api_proxy(self, method, display_id, params):
return self._download_json(
'https://api.lbry.tv/api/v1/proxy', display_id,
headers={'Content-Type': 'application/json-rpc'},
data=json.dumps({
'method': method,
'params': params,
}).encode())['result']

def _real_extract(self, url):
display_id = self._match_id(url).replace(':', '#')
uri = 'lbry://' + display_id
result = self._call_api_proxy(
'resolve', display_id, {'urls': [uri]})[uri]
result_value = result['value']
if result_value.get('stream_type') not in ('video', 'audio'):
raise ExtractorError('Unsupported URL', expected=True)
streaming_url = self._call_api_proxy(
'get', display_id, {'uri': uri})['streaming_url']
source = result_value.get('source') or {}
media = result_value.get('video') or result_value.get('audio') or {}
signing_channel = result_value.get('signing_channel') or {}

return {
'id': result['claim_id'],
'title': result_value['title'],
'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
'description': result_value.get('description'),
'license': result_value.get('license'),
'timestamp': int_or_none(result.get('timestamp')),
'tags': result_value.get('tags'),
'width': int_or_none(media.get('width')),
'height': int_or_none(media.get('height')),
'duration': int_or_none(media.get('duration')),
'channel': signing_channel.get('name'),
'channel_id': signing_channel.get('claim_id'),
'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
'filesize': int_or_none(source.get('size')),
'url': streaming_url,
}
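
The new extractor drives everything through LBRY's JSON-RPC proxy: one 'resolve' call maps the lbry:// URI to claim metadata, then one 'get' call returns the streaming URL. A standalone sketch of those two calls, assuming stdlib only (method names, params and result keys mirror the code above):

    import json
    from urllib.request import Request, urlopen

    def lbry_proxy(method, params):
        req = Request(
            'https://api.lbry.tv/api/v1/proxy',
            data=json.dumps({'method': method, 'params': params}).encode(),
            headers={'Content-Type': 'application/json-rpc'})
        with urlopen(req) as resp:
            return json.load(resp)['result']

    uri = 'lbry://@Mantega#1/First-day-LBRY#1'  # ':' in the page URL becomes '#'
    claim = lbry_proxy('resolve', {'urls': [uri]})[uri]            # title, tags, media info
    stream_url = lbry_proxy('get', {'uri': uri})['streaming_url']  # direct media URL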

+36 -55 youtube_dlc/extractor/lrt.py

@@ -5,28 +5,26 @@ import re

from .common import InfoExtractor
from ..utils import (
determine_ext,
int_or_none,
parse_duration,
remove_end,
clean_html,
merge_dicts,
)


class LRTIE(InfoExtractor):
IE_NAME = 'lrt.lt'
_VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
_VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
_TESTS = [{
# m3u8 download
'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
'md5': '85cb2bb530f31d91a9c65b479516ade4',
'info_dict': {
'id': '54391',
'id': '2000127261',
'ext': 'mp4',
'title': 'Septynios Kauno dienos',
'description': 'md5:24d84534c7dc76581e59f5689462411a',
'duration': 1783,
'view_count': int,
'like_count': int,
'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
'duration': 3035,
'timestamp': 1604079000,
'upload_date': '20201030',
},
}, {
# direct mp3 download
@@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):
},
}]

def _extract_js_var(self, webpage, var_name, default):
return self._search_regex(
r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name,
webpage, var_name.replace('_', ' '), default, group=2)

def _real_extract(self, url):
video_id = self._match_id(url)
path, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)

title = remove_end(self._og_search_title(webpage), ' - LRT')

formats = []
for _, file_url in re.findall(
r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
ext = determine_ext(file_url)
if ext not in ('m3u8', 'mp3'):
continue
# mp3 served as m3u8 produces stuttered media file
if ext == 'm3u8' and '.mp3' in file_url:
continue
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
file_url, video_id, 'mp4', entry_protocol='m3u8_native',
fatal=False))
elif ext == 'mp3':
formats.append({
'url': file_url,
'vcodec': 'none',
})
self._sort_formats(formats)
media_url = self._extract_js_var(webpage, 'main_url', path)
media = self._download_json(self._extract_js_var(
webpage, 'media_info_url',
'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
video_id, query={'url': media_url})
jw_data = self._parse_jwplayer_data(
media['playlist_item'], video_id, base_url=url)

thumbnail = self._og_search_thumbnail(webpage)
description = self._og_search_description(webpage)
duration = parse_duration(self._search_regex(
r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
webpage, 'duration', default=None, group='duration'))
json_ld_data = self._search_json_ld(webpage, video_id)

view_count = int_or_none(self._html_search_regex(
r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
webpage, 'view count', fatal=False, group='count'))
like_count = int_or_none(self._search_regex(
r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
webpage, 'like count', fatal=False, group='count'))
tags = []
for tag in (media.get('tags') or []):
tag_name = tag.get('name')
if not tag_name:
continue
tags.append(tag_name)

return {
'id': video_id,
'title': title,
'formats': formats,
'thumbnail': thumbnail,
'description': description,
'duration': duration,
'view_count': view_count,
'like_count': like_count,
clean_info = {
'description': clean_html(media.get('content')),
'tags': tags,
}

return merge_dicts(clean_info, jw_data, json_ld_data)
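
Instead of scraping file URLs out of the page, the rewritten extractor now asks LRT's own media_info service for a JWPlayer playlist item. A sketch of that lookup, assuming the default service URL and query parameter shown in the diff (the JS-variable fallbacks are elided):

    import json
    from urllib.parse import urlencode
    from urllib.request import urlopen

    def lrt_media_info(media_url):
        base = 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'
        with urlopen(base + '?' + urlencode({'url': media_url})) as resp:
            return json.load(resp)  # carries 'playlist_item' (JWPlayer data) and 'tags'

    # e.g. lrt_media_info('/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene')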

+46 -14 youtube_dlc/extractor/malltv.py

@@ -1,10 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import merge_dicts
from ..utils import (
clean_html,
dict_get,
float_or_none,
int_or_none,
merge_dicts,
parse_duration,
try_get,
)


class MallTVIE(InfoExtractor):
@@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
@@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor):
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())

SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
video = self._parse_json(self._search_regex(
r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
webpage, 'video object'), display_id)
video_source = video['VideoSource']
video_id = self._search_regex(
SOURCE_RE, webpage, 'video id', group='id')
r'/([\da-z]+)/index\b', video_source, 'video id')

formats = self._extract_m3u8_formats(
video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)

subtitles = {}
for s in (video.get('Subtitles') or {}):
s_url = s.get('Url')
if not s_url:
continue
subtitles.setdefault(s.get('Language') or 'cz', []).append({
'url': s_url,
})

entity_counts = video.get('EntityCounts') or {}

media = self._parse_html5_media_entries(
url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
def get_count(k):
v = entity_counts.get(k + 's') or {}
return int_or_none(dict_get(v, ('Count', 'StrCount')))

info = self._search_json_ld(webpage, video_id, default={})

return merge_dicts(media, info, {
return merge_dicts({
'id': video_id,
'display_id': display_id,
'title': self._og_search_title(webpage, default=None) or display_id,
'description': self._og_search_description(webpage, default=None),
'thumbnail': self._og_search_thumbnail(webpage, default=None),
})
'title': video.get('Title'),
'description': clean_html(video.get('Description')),
'thumbnail': video.get('ThumbnailUrl'),
'formats': formats,
'subtitles': subtitles,
'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
'view_count': get_count('View'),
'like_count': get_count('Like'),
'dislike_count': get_count('Dislike'),
'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
'comment_count': get_count('Comment'),
}, info)

+7 -3 youtube_dlc/extractor/mgtv.py

@@ -17,9 +17,8 @@ from ..utils import (


class MGTVIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
_VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
IE_DESC = '芒果TV'
_GEO_COUNTRIES = ['CN']

_TESTS = [{
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
@@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor):
}, {
'url': 'http://www.mgtv.com/b/301817/3826653.html',
'only_matching': True,
}, {
'url': 'https://w.mgtv.com/b/301817/3826653.html',
'only_matching': True,
}]

def _real_extract(self, url):
video_id = self._match_id(url)
tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
try:
api_data = self._download_json(
'https://pcweb.api.mgtv.com/player/video', video_id, query={
'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
@@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor):
stream_data = self._download_json(
'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
'pm2': api_data['atc']['pm2'],
'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
stream_domain = stream_data['stream_domain'][0]
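
The fix hoists the tk2 token into a variable so the same value is sent to both the player/video and player/getSource calls. The token itself is just a reversed urlsafe-base64 blob over a random device id and the current time; rebuilt standalone as a sketch (the expression mirrors the diff, the function name is made up):

    import base64
    import time
    import uuid

    def make_tk2():
        raw = b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (
            str(uuid.uuid4()).encode(), time.time())
        return base64.urlsafe_b64encode(raw)[::-1]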


+12 -0 youtube_dlc/extractor/mtv.py

@@ -403,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor):
'only_matching': True,
}]

@staticmethod
def extract_child_with_type(parent, t):
children = parent['children']
return next(c for c in children if c.get('type') == t)

def _extract_mgid(self, webpage):
data = self._parse_json(self._search_regex(
r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
main_container = self.extract_child_with_type(data, 'MainContainer')
video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
return video_player['props']['media']['video']['config']['uri']


class MTVJapanIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvjapan'


+2 -3 youtube_dlc/extractor/nbc.py

@@ -10,7 +10,6 @@ from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
js_to_json,
parse_duration,
smuggle_url,
try_get,
@@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):
webpage = self._download_webpage(url, video_id)

data = self._parse_json(self._search_regex(
r'window\.__data\s*=\s*({.+});', webpage,
'bootstrap json'), video_id, js_to_json)
r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
webpage, 'bootstrap json'), video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']


+38 -0 youtube_dlc/extractor/ndr.py

@@ -82,6 +82,29 @@ class NDRIE(NDRBaseIE):
'params': {
'skip_download': True,
},
}, {
# with subtitles
'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
'info_dict': {
'id': 'extra18674',
'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
'ext': 'mp4',
'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
'uploader': 'ndrtv',
'upload_date': '20201113',
'duration': 1749,
'subtitles': {
'de': [{
'ext': 'ttml',
'url': r're:^https://www\.ndr\.de.+',
}],
},
},
'params': {
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
'only_matching': True,
@@ -242,6 +265,20 @@ class NDREmbedBaseIE(InfoExtractor):
'preference': quality_key(thumbnail.get('quality')),
})

subtitles = {}
tracks = config.get('tracks')
if tracks and isinstance(tracks, list):
for track in tracks:
if not isinstance(track, dict):
continue
track_url = urljoin(url, track.get('src'))
if not track_url:
continue
subtitles.setdefault(track.get('srclang') or 'de', []).append({
'url': track_url,
'ext': 'ttml',
})

return {
'id': video_id,
'title': title,
@@ -251,6 +288,7 @@ class NDREmbedBaseIE(InfoExtractor):
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
'subtitles': subtitles,
}




+95 -62 youtube_dlc/extractor/rai.py

@@ -17,7 +17,7 @@ from ..utils import (
int_or_none,
parse_duration,
strip_or_none,
try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
update_url_query,
@@ -30,7 +30,6 @@ class RaiBaseIE(InfoExtractor):
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
_BASE_URL = 'https://www.raiplay.it'

def _extract_relinker_info(self, relinker_url, video_id):
if not re.match(r'https?://', relinker_url):
@@ -123,19 +122,40 @@ class RaiBaseIE(InfoExtractor):


class RaiPlayIE(RaiBaseIE):
_VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE
_VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.(?:html|json))' % RaiBaseIE._UUID_RE
_TESTS = [{
'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
'md5': '340aa3b7afb54bfd14a8c11786450d76',
'info_dict': {
'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
'ext': 'mp4',
'title': 'La Casa Bianca',
'alt_title': 'S2016 - Puntata del 23/10/2016',
'description': 'md5:a09d45890850458077d1f68bb036e0a5',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai 3',
'creator': 'Rai 3',
'duration': 3278,
'timestamp': 1477764300,
'upload_date': '20161029',
'series': 'La Casa Bianca',
'season': '2016',
},
'skip': 'This content is not available',
}, {
'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
'info_dict': {
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ',
'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai Gulp',
'duration': 6160,
'series': 'Report',
'season': '2013/14',
},
'params': {
'skip_download': True,
@@ -146,11 +166,10 @@ class RaiPlayIE(RaiBaseIE):
}]

def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext')
url, video_id = re.match(self._VALID_URL, url).groups()

media = self._download_json(
'%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')
url.replace('.html', '.json'), video_id, 'Downloading video JSON')

title = media['name']
video = media['video']
@@ -159,34 +178,38 @@ class RaiPlayIE(RaiBaseIE):
self._sort_formats(relinker_info['formats'])

thumbnails = []
if 'images' in media:
for _, value in media.get('images').items():
if value:
thumbnails.append({
'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400'))
})
for _, value in media.get('images', {}).items():
if value:
thumbnails.append({
'url': urljoin(url, value),
})

timestamp = unified_timestamp(try_get(
media, lambda x: x['availabilities'][0]['start'], compat_str))
date_published = media.get('date_published')
time_published = media.get('time_published')
if date_published and time_published:
date_published += ' ' + time_published

subtitles = self._extract_subtitles(url, video.get('subtitles'))

program_info = media.get('program_info') or {}
season = media.get('season')

info = {
'id': video_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
'alt_title': media.get('subtitle'),
'alt_title': strip_or_none(media.get('subtitle')),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
'creator': strip_or_none(media.get('editor')),
'creator': strip_or_none(media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
'timestamp': timestamp,
'timestamp': unified_timestamp(date_published),
'thumbnails': thumbnails,
'series': try_get(
media, lambda x: x['isPartOf']['name'], compat_str),
'season_number': int_or_none(try_get(
media, lambda x: x['isPartOf']['numeroStagioni'])),
'season': media.get('stagione') or None,
'series': program_info.get('name'),
'season_number': int_or_none(season),
'season': season if (season and not season.isdigit()) else None,
'episode': media.get('episode_title'),
'episode_number': int_or_none(media.get('episode')),
'subtitles': subtitles,
}

@@ -203,7 +226,7 @@ class RaiPlayLiveIE(RaiBaseIE):
'display_id': 'rainews24',
'ext': 'mp4',
'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497',
'description': 'md5:6eca31500550f9376819f174e5644754',
'uploader': 'Rai News 24',
'creator': 'Rai News 24',
'is_live': True,
@@ -216,32 +239,20 @@ class RaiPlayLiveIE(RaiBaseIE):
def _real_extract(self, url):
display_id = self._match_id(url)

media = self._download_json(
'%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id),
display_id, 'Downloading channel JSON')

title = media['name']
video = media['video']
video_id = media['id'].replace('ContentItem-', '')
webpage = self._download_webpage(url, display_id)

relinker_info = self._extract_relinker_info(video['content_url'], video_id)
self._sort_formats(relinker_info['formats'])
video_id = self._search_regex(
r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE,
webpage, 'content id')

info = {
return {
'_type': 'url_transparent',
'ie_key': RaiPlayIE.ie_key(),
'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id,
'id': video_id,
'display_id': display_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
'alt_title': media.get('subtitle'),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
'creator': strip_or_none(media.get('editor')),
'duration': parse_duration(video.get('duration')),
}

info.update(relinker_info)
return info


class RaiPlayPlaylistIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)'
@@ -250,7 +261,7 @@ class RaiPlayPlaylistIE(InfoExtractor):
'info_dict': {
'id': 'nondirloalmiocapo',
'title': 'Non dirlo al mio capo',
'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b',
'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
},
'playlist_mincount': 12,
}]
@@ -258,25 +269,21 @@ class RaiPlayPlaylistIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)

media = self._download_json(
'%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id),
playlist_id, 'Downloading program JSON')

title = media['name']
description = media['program_info']['description']
webpage = self._download_webpage(url, playlist_id)

content_sets = [s['id'] for b in media['blocks'] for s in b['sets']]
title = self._html_search_meta(
('programma', 'nomeProgramma'), webpage, 'title')
description = unescapeHTML(self._html_search_meta(
('description', 'og:description'), webpage, 'description'))

entries = []
for cs in content_sets:
medias = self._download_json(
'%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs),
cs, 'Downloading content set JSON')
for m in medias['items']:
video_url = urljoin(url, m['path_id'])
entries.append(self.url_result(
video_url, ie=RaiPlayIE.ie_key(),
video_id=RaiPlayIE._match_id(video_url)))
for mobj in re.finditer(
r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1',
webpage):
video_url = urljoin(url, mobj.group('path'))
entries.append(self.url_result(
video_url, ie=RaiPlayIE.ie_key(),
video_id=RaiPlayIE._match_id(video_url)))

return self.playlist_result(entries, playlist_id, title, description)

@@ -294,7 +301,8 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1758,
'upload_date': '20140612',
}
},
'skip': 'This content is available only in Italy',
}, {
# with ContentItem in many metas
'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -320,6 +328,19 @@ class RaiIE(RaiBaseIE):
'duration': 2214,
'upload_date': '20161103',
}
}, {
# drawMediaRaiTV(...)
'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
'md5': '2dd727e61114e1ee9c47f0da6914e178',
'info_dict': {
'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
'ext': 'mp4',
'title': 'Il pacco',
'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20141221',
},
'skip': 'This content is not available',
}, {
# initEdizione('ContentItem-...'
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
@@ -331,6 +352,18 @@ class RaiIE(RaiBaseIE):
'upload_date': '20170401',
},
'skip': 'Changes daily',
}, {
# HDS live stream with only relinker URL
'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
'info_dict': {
'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
'ext': 'flv',
'title': 'EuroNews',
},
'params': {
'skip_download': True,
},
'skip': 'This content is available only in Italy',
}, {
# HLS live stream with ContentItem in og:url
'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',


+95 -16 youtube_dlc/extractor/servus.py

@@ -1,9 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
unified_timestamp,
urlencode_postdata,
url_or_none,
)


class ServusIE(InfoExtractor):
@@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
servustv\.com/videos
(?:servustv|pm-wissen)\.com/videos
)
/(?P<id>[aA]{2}-\w+|\d+-\d+)
'''
_TESTS = [{
# new URL schema
'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
'md5': '60474d4c21f3eb148838f215c37f02b9',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Sicht des Volkes',
'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 62.442,
'timestamp': 1605193976,
'upload_date': '20201112',
'series': 'Talk im Hangar-7',
'season': 'Season 9',