diff --git a/youtube_dl/extractor/alura.py b/youtube_dl/extractor/alura.py index 5f75bdaff..7baf7af81 100644 --- a/youtube_dl/extractor/alura.py +++ b/youtube_dl/extractor/alura.py @@ -6,21 +6,15 @@ import re from .common import InfoExtractor from ..compat import ( - compat_str, compat_urlparse, ) from ..utils import ( - dict_get, - ExtractorError, - float_or_none, - int_or_none, - parse_duration, - qualities, - srt_subtitles_timecode, - try_get, - update_url_query, urlencode_postdata, + urljoin, + int_or_none, + clean_html, + ExtractorError ) @@ -28,17 +22,26 @@ class AluraIE(InfoExtractor): _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P[^/]+)/task/(?P\d+)' _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video' - _TEST = { - 'url': 'https://cursos.alura.com.br/course/design-patterns-python/task/9651', - 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', + _NETRC_MACHINE = 'alura' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095', 'info_dict': { - 'id': '9651', + 'id': '60095', 'ext': 'mp4', - 'title': 'Video title goes here', - 'thumbnail': r're:^https?://.*\.jpg$', - # TODO more properties, either as: + 'title': 'Referências, ref-set e alter' + }, + 'skip': 'Requires alura account credentials', + }, + { + # URL without video + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098', + 'only_matching': True, + }, + { + 'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219', + 'only_matching': True, } - } + ] def _real_extract(self, url): @@ -50,9 +53,9 @@ class AluraIE(InfoExtractor): if video_dict: webpage = self._download_webpage(url, video_id) - video_title = self._search_regex( + video_title = clean_html(self._search_regex( r']+class=(["\'])task-body-header-title-text\1[^>]*>(?P[^<]+)', - webpage, 'title', 
group='title') + webpage, 'title', group='title')) formats = [] for video_obj in video_dict: @@ -60,9 +63,15 @@ class AluraIE(InfoExtractor): video_format = self._extract_m3u8_formats( video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - + for f in video_format: + m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url']) + if m: + if not f.get('height'): + f['height'] = int('720' if m.group('res') == 'hd' else '480') formats.extend(video_format) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) + return { 'id': video_id, 'title': video_title, @@ -111,9 +120,55 @@ class AluraIE(InfoExtractor): if not post_url.startswith('http'): post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - self._download_webpage( + response = self._download_webpage( post_url, None, 'Logging in', data=urlencode_postdata(login_form), headers={'Content-Type': 'application/x-www-form-urlencoded'}) + if not is_logged(response): + error = self._html_search_regex( + r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + +class AluraCourseIE(AluraIE): + + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)' + _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _NETRC_MACHINE = 'aluracourse' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs', + 'only_matching': True, + }] + + def _real_extract(self, url): + + course_path = self._match_id(url) + webpage = self._download_webpage(url, course_path) + + course_title = self._search_regex( + r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage, + 'course title', default=course_path, group='course_title') + + entries = [] + if webpage: + for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" 
])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage): + page_url = urljoin(url, path) + section_path = self._download_webpage(page_url, course_path) + for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path): + chapter = clean_html(self._search_regex(r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)',section_path, 'chapter', group='chapter')) + chapter_number = int_or_none(self._search_regex(r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>',section_path, 'chapter number', group='chapter_number')) + video_url = urljoin(url, path_video) + entry = { + '_type': 'url_transparent', + 'id': self._match_id(video_url), + 'url': video_url, + 'id_key': self.ie_key(), + 'chapter': chapter, + 'chapter_number': chapter_number + } + entries.append(entry) + return self.playlist_result(entries, course_path, course_title) \ No newline at end of file diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a1b3746b..0d34237af 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -36,7 +36,10 @@ from .afreecatv import AfreecaTVIE from .airmozilla import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE -from .alura import AluraIE +from .alura import ( + AluraIE, + AluraCourseIE +) from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE