[vk] improve extraction (fixes #7976)

This commit is contained in:
remitamine 2016-05-06 15:02:40 +01:00
parent 6f59aa934b
commit 04e88ca2ca
3 changed files with 64 additions and 22 deletions

View File

@ -0,0 +1,39 @@
# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
class BIQLEIE(InfoExtractor):
    """Extractor for biqle.com / biqle.org / biqle.ru watch pages.

    The watch page itself carries no media; it embeds a daxab.com player in
    an <iframe>. This extractor only locates that iframe and returns its URL
    as a ``url_transparent`` result.
    """

    # The captured id has the form "<owner>_<video>", where the owner part
    # may be negative (see the second test URL).
    _VALID_URL = r'https?://(?:www\.)?biqle\.(?:com|org|ru)/watch/(?P<id>-?\d+_\d+)'
    _TESTS = [{
        'url': 'http://www.biqle.ru/watch/847655_160197695',
        'md5': 'ad5f746a874ccded7b8f211aeea96637',
        'info_dict': {
            'id': '160197695',
            'ext': 'mp4',
            'title': 'Foo Fighters - The Pretender (Live at Wembley Stadium)',
            'uploader': 'Andrey Rogozin',
            'upload_date': '20110605',
        }
    }, {
        'url': 'https://biqle.org/watch/-44781847_168547604',
        'md5': '7f24e72af1db0edf7c1aaba513174f97',
        'info_dict': {
            'id': '168547604',
            'ext': 'mp4',
            'title': 'Ребенок в шоке от автоматической мойки',
            'uploader': 'Dmitry Kotov',
        }
    }]

    def _real_extract(self, url):
        """Return a ``url_transparent`` result pointing at the daxab.com embed.

        :param url: a watch-page URL matching ``_VALID_URL``.
        :returns: dict with ``_type`` set to ``'url_transparent'`` and ``url``
            set to the absolute embed URL found in the page's iframe.

        NOTE(review): ``_search_regex`` is called without a default, so a page
        without a matching iframe presumably raises -- confirm against
        ``InfoExtractor._search_regex``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # Accept http:, https: and protocol-relative iframe sources. The
        # previous pattern allowed only "http:" or "//", so an https embed
        # was never found.
        embed_src = self._search_regex(
            r'<iframe.+?src="((?:https?:)?//daxab\.com/[^"]+)".*?></iframe>',
            webpage, 'embed url')
        # Turn a protocol-relative "//daxab.com/..." source into a full URL.
        embed_url = self._proto_relative_url(embed_src)

        return {
            '_type': 'url_transparent',
            'url': embed_url,
        }

View File

@ -75,6 +75,7 @@ from .bigflix import BigflixIE
from .bild import BildIE from .bild import BildIE
from .bilibili import BiliBiliIE from .bilibili import BiliBiliIE
from .biobiochiletv import BioBioChileTVIE from .biobiochiletv import BioBioChileTVIE
from .biqle import BIQLEIE
from .bleacherreport import ( from .bleacherreport import (
BleacherReportIE, BleacherReportIE,
BleacherReportCMSIE, BleacherReportCMSIE,

View File

@ -26,12 +26,16 @@ class VKIE(InfoExtractor):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?:m\.)?vk\.com/video_ext\.php\?.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+)| (?:
(?:m\.)?vk\.com/video_|
(?:www\.)?daxab.com/
)
ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)|
(?: (?:
(?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video|
(?:www\.)?biqle\.ru/watch/ (?:www\.)?daxab.com/embed/
) )
(?P<videoid>[^s].*?)(?:\?(?:.*\blist=(?P<list_id>[\da-f]+))?|%2F|$) (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))?
) )
''' '''
_NETRC_MACHINE = 'vk' _NETRC_MACHINE = 'vk'
@ -75,7 +79,8 @@ class VKIE(InfoExtractor):
'duration': 101, 'duration': 101,
'upload_date': '20120730', 'upload_date': '20120730',
'view_count': int, 'view_count': int,
} },
'skip': 'This video has been removed from public access.',
}, },
{ {
# VIDEO NOW REMOVED # VIDEO NOW REMOVED
@ -142,7 +147,7 @@ class VKIE(InfoExtractor):
'id': 'V3K4mi0SYkc', 'id': 'V3K4mi0SYkc',
'ext': 'webm', 'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate", 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', 'description': 'md5:d9903938abdc74c738af77f527ca0596',
'duration': 178, 'duration': 178,
'upload_date': '20130116', 'upload_date': '20130116',
'uploader': "Children's Joy Foundation", 'uploader': "Children's Joy Foundation",
@ -173,11 +178,6 @@ class VKIE(InfoExtractor):
'url': 'https://vk.com/video205387401_164765225', 'url': 'https://vk.com/video205387401_164765225',
'only_matching': True, 'only_matching': True,
}, },
{
# vk wrapper
'url': 'http://www.biqle.ru/watch/847655_160197695',
'only_matching': True,
},
{ {
# pladform embed # pladform embed
'url': 'https://vk.com/video-76116461_171554880', 'url': 'https://vk.com/video-76116461_171554880',
@ -217,20 +217,22 @@ class VKIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid') video_id = mobj.group('videoid')
if not video_id: info_url = url
if video_id:
info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
# Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id')
if list_id:
info_url += '&list=%s' % list_id
else:
info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
# Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id')
if list_id:
info_url += '&list=%s' % list_id
info_page = self._download_webpage(info_url, video_id) info_page = self._download_webpage(info_url, video_id)
error_message = self._html_search_regex( error_message = self._html_search_regex(
r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>', [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
info_page, 'error message', default=None) info_page, 'error message', default=None)
if error_message: if error_message:
raise ExtractorError(error_message, expected=True) raise ExtractorError(error_message, expected=True)
@ -305,17 +307,17 @@ class VKIE(InfoExtractor):
view_count = None view_count = None
views = self._html_search_regex( views = self._html_search_regex(
r'"mv_views_count_number"[^>]*>(.+?\bviews?)<', r'"mv_views_count_number"[^>]*>(.+?\bviews?)<',
info_page, 'view count', fatal=False) info_page, 'view count', default=None)
if views: if views:
view_count = str_to_int(self._search_regex( view_count = str_to_int(self._search_regex(
r'([\d,.]+)', views, 'view count', fatal=False)) r'([\d,.]+)', views, 'view count', fatal=False))
formats = [] formats = []
for k, v in data.items(): for k, v in data.items():
if not k.startswith('url') and k != 'extra_data' or not v: if not k.startswith('url') and not k.startswith('cache') and k != 'extra_data' or not v:
continue continue
height = int_or_none(self._search_regex( height = int_or_none(self._search_regex(
r'^url(\d+)', k, 'height', default=None)) r'^(?:url|cache)(\d+)', k, 'height', default=None))
formats.append({ formats.append({
'format_id': k, 'format_id': k,
'url': v, 'url': v,