[youporn] Improve formats extraction

This commit is contained in:
Sergey M․ 2017-06-22 00:40:15 +07:00
parent 97b6e30113
commit d4893e764b
No known key found for this signature in database
GPG Key ID: 2C393E0F18A9236D
1 changed file with 24 additions and 8 deletions

View File

@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_str
from ..utils import ( from ..utils import (
int_or_none, int_or_none,
sanitized_Request, sanitized_Request,
@ -68,7 +69,7 @@ class YouPornIE(InfoExtractor):
webpage = self._download_webpage(request, display_id) webpage = self._download_webpage(request, display_id)
title = self._search_regex( title = self._search_regex(
[r'(?:video_titles|videoTitle|title)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1', [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'], r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
webpage, 'title', group='title', webpage, 'title', group='title',
default=None) or self._og_search_title( default=None) or self._og_search_title(
@ -77,22 +78,37 @@ class YouPornIE(InfoExtractor):
links = [] links = []
# Main source
definitions = self._parse_json(
self._search_regex(
r'mediaDefinition\s*=\s*(\[.+?\]);', webpage,
'media definitions', default='[]'),
video_id, fatal=False)
if definitions:
for definition in definitions:
if not isinstance(definition, dict):
continue
video_url = definition.get('videoUrl')
if isinstance(video_url, compat_str) and video_url:
links.append(video_url)
# Fallback #1, this also contains extra low quality 180p format
for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
links.append(link)
# Fallback #2 (unavailable as at 22.06.2017)
sources = self._search_regex( sources = self._search_regex(
r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None)
if sources: if sources:
for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources):
links.append(link) links.append(link)
# Fallback #1 # Fallback #3 (unavailable as at 22.06.2017)
for _, link in re.findall( for _, link in re.findall(
r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage):
links.append(link) links.append(link)
# Fallback #2, this also contains extra low quality 180p format # Fallback #4, encrypted links (unavailable as at 22.06.2017)
for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage):
links.append(link)
# Fallback #3, encrypted links
for _, encrypted_link in re.findall( for _, encrypted_link in re.findall(
r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage):
links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8'))