[9gag] Fix and improve extraction

This commit is contained in:
Sergey M․ 2014-04-15 19:49:38 +07:00
parent 2d4c98dbd1
commit d7666dff82
1 changed file with 11 additions and 18 deletions

View File

@@ -1,8 +1,10 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import re import re
import json
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import str_to_int
class NineGagIE(InfoExtractor): class NineGagIE(InfoExtractor):
@@ -44,23 +46,14 @@ class NineGagIE(InfoExtractor):
webpage = self._download_webpage(url, display_id) webpage = self._download_webpage(url, display_id)
youtube_id = self._html_search_regex( post_view = json.loads(self._html_search_regex(
r'(?s)id="jsid-video-post-container".*?data-external-id="([^"]+)"', r'var postView = new app\.PostView\({ post: ({.+?}),', webpage, 'post view'))
webpage, 'video ID')
title = self._html_search_regex( youtube_id = post_view['videoExternalId']
r'(?s)id="jsid-video-post-container".*?data-title="([^"]+)"', title = post_view['title']
webpage, 'title', default=None) description = post_view['description']
if not title: view_count = str_to_int(post_view['externalView'])
title = self._og_search_title(webpage) thumbnail = post_view.get('thumbnail_700w') or post_view.get('ogImageUrl') or post_view.get('thumbnail_300w')
description = self._html_search_regex(
r'(?s)<div class="video-caption">.*?<p>(.*?)</p>', webpage,
'description', fatal=False)
view_count_str = self._html_search_regex(
r'<p><b>([0-9][0-9,]*)</b> views</p>', webpage, 'view count',
fatal=False)
view_count = (
None if view_count_str is None
else int(view_count_str.replace(',', '')))
return { return {
'_type': 'url_transparent', '_type': 'url_transparent',
@@ -71,5 +64,5 @@ class NineGagIE(InfoExtractor):
'title': title, 'title': title,
'description': description, 'description': description,
'view_count': view_count, 'view_count': view_count,
'thumbnail': self._og_search_thumbnail(webpage), 'thumbnail': thumbnail,
} }