[wsj] Add new extractor (Fixes #4854)

2024-12-22 07:43:08 +00:00 · 2015-02-03 10:58:28 +01:00 · 2015-02-03 10:58:28 +01:00 · 9bb8e0a3f9
commit 9bb8e0a3f9
parent 1a6373ef39
5 changed files with 95 additions and 1 deletions
--- a/test/test_utils.py
+++ b/test/test_utils.py
@ -156,6 +156,9 @@ class TestUtil(unittest.TestCase):
        self.assertEqual(
            unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False),
            '20141126')
        self.assertEqual(
            unified_strdate('2/2/2015 6:47:40 PM', day_first=False),
            '20150202')
    def test_find_xpath_attr(self):
        testxml = '''<root>
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -554,6 +554,7 @@ from .wimp import WimpIE
 from .wistia import WistiaIE
 from .worldstarhiphop import WorldStarHipHopIE
 from .wrzuta import WrzutaIE
 from .wsj import WSJIE
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
 from .xhamster import XHamsterIE
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -145,6 +145,7 @@ class InfoExtractor(object):
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    creator:        The main artist who created the video.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@ -0,0 +1,89 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 from .common import InfoExtractor
 from ..utils import (
    int_or_none,
    unified_strdate,
 )
 class WSJIE(InfoExtractor):
    """Extractor for the video-api.wsj.com iframe player (Wall Street Journal)."""

    _VALID_URL = r'https?://video-api\.wsj\.com/api-video/player/iframe\.html\?guid=(?P<id>[a-zA-Z0-9-]+)'
    IE_DESC = 'Wall Street Journal'
    _TEST = {
        'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
        'md5': '9747d7a6ebc2f4df64b981e1dde9efa9',
        'info_dict': {
            'id': '1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
            'ext': 'mp4',
            'upload_date': '20150202',
            'uploader_id': 'bbright',
            'creator': 'bbright',
            'categories': list,  # a long list
            'duration': 90,
            'title': 'Bills Coach Rex Ryan Updates His Old Jets Tattoo',
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the video whose guid appears in ``url``.

        Queries ``find_all_videos.asp`` for the guid, then collects formats
        from the ``videoURL`` (f4m), ``hls`` and per-bitrate
        ``video<N>kMP4Url`` fields of the first returned item.
        """
        video_id = self._match_id(url)

        # Bitrates (in kbit/s) for which the API exposes a dedicated
        # ``video<N>kMP4Url`` field; each one must be requested explicitly.
        bitrates = [128, 174, 264, 320, 464, 664, 1264]
        api_url = (
            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
            'type=guid&count=1&query=%s&'
            'fields=hls,adZone,thumbnailList,guid,state,secondsUntilStartTime,'
            'author,description,name,linkURL,videoStillURL,duration,videoURL,'
            'adCategory,catastrophic,linkShortURL,doctypeID,youtubeID,'
            'titletag,rssURL,wsj-section,wsj-subsection,allthingsd-section,'
            'allthingsd-subsection,sm-section,sm-subsection,provider,'
            'formattedCreationDate,keywords,keywordsOmniture,column,editor,'
            'emailURL,emailPartnerID,showName,omnitureProgramName,'
            'omnitureVideoFormat,linkRelativeURL,touchCastID,'
            'omniturePublishDate,%s') % (
                video_id, ','.join('video%dkMP4Url' % br for br in bitrates))
        info = self._download_json(api_url, video_id)['items'][0]

        # Thumbnails are conveniently in the correct format already
        thumbnails = info.get('thumbnailList')
        creator = info.get('author')
        uploader_id = info.get('editor')
        categories = info.get('keywords')
        duration = int_or_none(info.get('duration'))
        upload_date = unified_strdate(
            info.get('formattedCreationDate'), day_first=False)
        # Fall back to the title tag when 'name' is missing *or* empty/null;
        # dict.get's default only covers the "missing" case.
        title = info.get('name') or info.get('titletag')

        formats = []
        # The f4m manifest is optional like every other field: skip it when
        # absent instead of failing with a KeyError.
        f4m_url = info.get('videoURL')
        if f4m_url:
            formats.append({
                'format_id': 'f4m',
                'format_note': 'f4m (meta URL)',
                'url': f4m_url,
            })
        if info.get('hls'):
            formats.extend(self._extract_m3u8_formats(
                info['hls'], video_id, ext='mp4',
                preference=0, entry_protocol='m3u8_native'))
        for br in bitrates:
            field = 'video%dkMP4Url' % br
            if info.get(field):
                formats.append({
                    'format_id': 'mp4-%d' % br,
                    'container': 'mp4',
                    'tbr': br,
                    'url': info[field],
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
            'thumbnails': thumbnails,
            'creator': creator,
            'uploader_id': uploader_id,
            'duration': duration,
            'upload_date': upload_date,
            'categories': categories,
        }
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@ -701,7 +701,7 @@ def unified_strdate(date_str, day_first=True):
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
-    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    format_expressions = [
        '%d %B %Y',