[vimeo] fix album extraction

closes #1933
closes #15704
closes #15855
closes #18967
closes #21986
This commit is contained in:
Remita Amine 2019-08-03 10:29:20 +01:00
parent 5efbc1366f
commit eb9c9c74a6
1 changed files with 42 additions and 16 deletions

View File

@ -2,12 +2,14 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import base64 import base64
import functools
import json import json
import re import re
import itertools import itertools
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import ( from ..compat import (
compat_kwargs,
compat_HTTPError, compat_HTTPError,
compat_str, compat_str,
compat_urlparse, compat_urlparse,
@ -19,6 +21,7 @@ from ..utils import (
int_or_none, int_or_none,
merge_dicts, merge_dicts,
NO_DEFAULT, NO_DEFAULT,
OnDemandPagedList,
parse_filesize, parse_filesize,
qualities, qualities,
RegexNotFoundError, RegexNotFoundError,
@ -98,6 +101,13 @@ class VimeoBaseInfoExtractor(InfoExtractor):
webpage, 'vuid', group='vuid') webpage, 'vuid', group='vuid')
return xsrft, vuid return xsrft, vuid
def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs):
vimeo_config = self._search_regex(
r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));',
webpage, 'vimeo config', *args, **compat_kwargs(kwargs))
if vimeo_config:
return self._parse_json(vimeo_config, video_id)
def _set_vimeo_cookie(self, name, value): def _set_vimeo_cookie(self, name, value):
self._set_cookie('vimeo.com', name, value) self._set_cookie('vimeo.com', name, value)
@ -253,7 +263,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
\. \.
)? )?
vimeo(?P<pro>pro)?\.com/ vimeo(?P<pro>pro)?\.com/
(?!(?:channels|album)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)? (?:.*?/)?
(?: (?:
(?: (?:
@ -580,11 +590,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
# and latter we extract those that are Vimeo specific. # and latter we extract those that are Vimeo specific.
self.report_extraction(video_id) self.report_extraction(video_id)
vimeo_config = self._search_regex( vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', webpage,
'vimeo config', default=None)
if vimeo_config: if vimeo_config:
seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {}) seed_status = vimeo_config.get('seed_status', {})
if seed_status.get('state') == 'failed': if seed_status.get('state') == 'failed':
raise ExtractorError( raise ExtractorError(
'%s said: %s' % (self.IE_NAME, seed_status['title']), '%s said: %s' % (self.IE_NAME, seed_status['title']),
@ -905,7 +913,7 @@ class VimeoUserIE(VimeoChannelIE):
class VimeoAlbumIE(VimeoChannelIE): class VimeoAlbumIE(VimeoChannelIE):
IE_NAME = 'vimeo:album' IE_NAME = 'vimeo:album'
_VALID_URL = r'https://vimeo\.com/album/(?P<id>\d+)(?:$|[?#]|/(?!video))' _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P<id>\d+)(?:$|[?#]|/(?!video))'
_TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>' _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
_TESTS = [{ _TESTS = [{
'url': 'https://vimeo.com/album/2632481', 'url': 'https://vimeo.com/album/2632481',
@ -925,21 +933,39 @@ class VimeoAlbumIE(VimeoChannelIE):
'params': { 'params': {
'videopassword': 'youtube-dl', 'videopassword': 'youtube-dl',
} }
}, {
'url': 'https://vimeo.com/album/2632481/sort:plays/format:thumbnail',
'only_matching': True,
}, {
# TODO: respect page number
'url': 'https://vimeo.com/album/2632481/page:2/sort:plays/format:thumbnail',
'only_matching': True,
}] }]
_PAGE_SIZE = 100
def _page_url(self, base_url, pagenum): def _fetch_page(self, album_id, authorizaion, hashed_pass, page):
return '%s/page:%d/' % (base_url, pagenum) api_page = page + 1
query = {
'fields': 'link',
'page': api_page,
'per_page': self._PAGE_SIZE,
}
if hashed_pass:
query['_hashed_pass'] = hashed_pass
videos = self._download_json(
'https://api.vimeo.com/albums/%s/videos' % album_id,
album_id, 'Downloading page %d' % api_page, query=query, headers={
'Authorization': 'jwt ' + authorizaion,
})['data']
for video in videos:
link = video.get('link')
if not link:
continue
yield self.url_result(link, VimeoIE.ie_key(), VimeoIE._match_id(link))
def _real_extract(self, url): def _real_extract(self, url):
album_id = self._match_id(url) album_id = self._match_id(url)
return self._extract_videos(album_id, 'https://vimeo.com/album/%s' % album_id) webpage = self._download_webpage(url, album_id)
webpage = self._login_list_password(url, album_id, webpage)
api_config = self._extract_vimeo_config(webpage, album_id)['api']
entries = OnDemandPagedList(functools.partial(
self._fetch_page, album_id, api_config['jwt'],
api_config.get('hashed_pass')), self._PAGE_SIZE)
return self.playlist_result(entries, album_id, self._html_search_regex(
r'<title>\s*(.+?)(?:\s+on Vimeo)?</title>', webpage, 'title', fatal=False))
class VimeoGroupsIE(VimeoAlbumIE): class VimeoGroupsIE(VimeoAlbumIE):