[franceculture] Fix extraction (Closes #10324)

This commit is contained in:
Sergey M․ 2016-08-13 21:00:34 +07:00
parent 647a7bf5e8
commit 82997dad57
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
2 changed files with 26 additions and 77 deletions

View File

@@ -272,10 +272,7 @@ from .fox import FOXIE
from .foxgay import FoxgayIE from .foxgay import FoxgayIE
from .foxnews import FoxNewsIE from .foxnews import FoxNewsIE
from .foxsports import FoxSportsIE from .foxsports import FoxSportsIE
from .franceculture import ( from .franceculture import FranceCultureIE
FranceCultureIE,
FranceCultureEmissionIE,
)
from .franceinter import FranceInterIE from .franceinter import FranceInterIE
from .francetv import ( from .francetv import (
PluzzIE, PluzzIE,

View File

@@ -2,104 +2,56 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import (
compat_urlparse,
)
from ..utils import ( from ..utils import (
determine_ext, determine_ext,
int_or_none, unified_strdate,
ExtractorError,
) )
class FranceCultureIE(InfoExtractor): class FranceCultureIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P<id>[0-9]+)' _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TEST = { _TEST = {
'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks',
'info_dict': { 'info_dict': {
'id': '4795174', 'id': 'rendez-vous-au-pays-des-geeks',
'display_id': 'rendez-vous-au-pays-des-geeks',
'ext': 'mp3', 'ext': 'mp3',
'title': 'Rendez-vous au pays des geeks', 'title': 'Rendez-vous au pays des geeks',
'alt_title': 'Carnet nomade | 13-14', 'thumbnail': 're:^https?://.*\\.jpg$',
'vcodec': 'none',
'upload_date': '20140301', 'upload_date': '20140301',
'thumbnail': r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', 'vcodec': 'none',
'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche',
'timestamp': 1393700400,
} }
} }
def _extract_from_player(self, url, video_id): def _real_extract(self, url):
webpage = self._download_webpage(url, video_id) display_id = self._match_id(url)
video_path = self._search_regex( webpage = self._download_webpage(url, display_id)
r'<a id="player".*?href="([^"]+)"', webpage, 'video path')
video_url = compat_urlparse.urljoin(url, video_path) video_url = self._search_regex(
timestamp = int_or_none(self._search_regex( r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<a[^>]+href="([^"]+)"',
r'<a id="player".*?data-date="([0-9]+)"', webpage, 'video path')
title = self._og_search_title(webpage)
upload_date = unified_strdate(self._search_regex(
'(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<',
webpage, 'upload date', fatal=False)) webpage, 'upload date', fatal=False))
thumbnail = self._search_regex( thumbnail = self._search_regex(
r'<a id="player".*?>\s+<img src="([^"]+)"', r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-pagespeed-(?:lazy|high-res)-src="([^"]+)"',
webpage, 'thumbnail', fatal=False) webpage, 'thumbnail', fatal=False)
display_id = self._search_regex(
r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id')
title = self._html_search_regex(
r'<span class="title-diffusion">(.*?)</span>', webpage, 'title')
alt_title = self._html_search_regex(
r'<span class="title">(.*?)</span>',
webpage, 'alt_title', fatal=False)
description = self._html_search_regex(
r'<span class="description">(.*?)</span>',
webpage, 'description', fatal=False)
uploader = self._html_search_regex( uploader = self._html_search_regex(
r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
webpage, 'uploader', default=None) webpage, 'uploader', default=None)
vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None
return { return {
'id': video_id, 'id': display_id,
'display_id': display_id,
'url': video_url, 'url': video_url,
'title': title,
'thumbnail': thumbnail,
'vcodec': vcodec, 'vcodec': vcodec,
'uploader': uploader, 'uploader': uploader,
'timestamp': timestamp, 'upload_date': upload_date,
'title': title,
'alt_title': alt_title,
'thumbnail': thumbnail,
'description': description,
'display_id': display_id,
} }
def _real_extract(self, url):
video_id = self._match_id(url)
return self._extract_from_player(url, video_id)
class FranceCultureEmissionIE(FranceCultureIE):
_VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)'
_TEST = {
'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
'info_dict': {
'title': 'Jean-Gabriel Périot, cinéaste',
'alt_title': 'Les Carnets de la création',
'id': '5093239',
'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13',
'ext': 'mp3',
'timestamp': 1444762500,
'upload_date': '20151013',
'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste',
},
}
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
video_path = self._html_search_regex(
r'<a class="rf-player-open".*?href="([^"]+)"', webpage, 'video path', 'no_path_player')
if video_path == 'no_path_player':
raise ExtractorError('no player : no sound in this page.', expected=True)
new_id = self._search_regex('play=(?P<id>[0-9]+)', video_path, 'new_id', group='id')
video_url = compat_urlparse.urljoin(url, video_path)
return self._extract_from_player(video_url, new_id)