# Recovered from git patch 5fe3a3c3fbb815fcf3f417a003744b0ab7b9049c
#   From:    Philipp Hagemeister
#   Date:    Mon, 8 Jul 2013 02:04:11 +0200
#   Subject: [PATCH] [archive.org] Add extractor (Fixes #1003)
#
# The patch creates youtube_dl/extractor/archiveorg.py (this module) and
# registers it in youtube_dl/extractor/__init__.py by adding the following
# line in front of ``from .ard import ARDIE``:
#
#     from .archiveorg import ArchiveOrgIE

import json
import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    unified_strdate,
)


class ArchiveOrgIE(InfoExtractor):
    """Extractor for videos hosted on archive.org ``/details/<id>`` pages.

    The item metadata is read from the JSON representation that is requested
    by adding ``output=json`` to the details URL.
    """

    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    # The ``id`` group is read back in _real_extract via mobj.group('id').
    _VALID_URL = (
        r'(?:https?://)?(?:www\.)?archive\.org/details/'
        r'(?P<id>[^?/]+)(?:[?].*)?$'
    )
    # NOTE(review): the expected description below was recovered from a
    # whitespace-mangled copy of the patch; markup that the original literal
    # may have contained could be missing -- verify against live metadata.
    _TEST = {
        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
        u'info_dict': {
            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
            u"description": (
                u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug "
                u"Engelbart's presentation at the Fall Joint Computer Conference "
                u"in San Francisco, December 9, 1968 titled \"A Research Center "
                u"for Augmenting Human Intellect.\" For this presentation, Doug "
                u"and his team astonished the audience by not only relating "
                u"their research, but demonstrating it live. This was the debut "
                u"of the mouse, interactive computing, hypermedia, computer "
                u"supported software engineering, video teleconferencing, etc. "
                u"See also Doug's 1968 Demo page for more background, "
                u"highlights, links, and the detailed paper published in this "
                u"conference proceedings. Filmed on 3 reels: Reel 1 | Reel 2 | "
                u"Reel 3"
            ),
            u"upload_date": u"19681210",
            u"uploader": u"SRI International",
        },
    }

    def _real_extract(self, url):
        """Fetch the JSON metadata for *url* and build the video info dict.

        The files whose ``format`` contains ``'Video'`` become the list of
        formats, sorted by ascending file size; the largest one is also
        exposed as the top-level ``url``/``ext``.

        Raises:
            KeyError: if the JSON lacks one of the fields read below
                (``metadata`` title/description/creator/date, ``server``,
                ``dir``, ``files``).
            IndexError: if no file qualifies as a video, or a metadata list
                is empty.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Start a query string with '?' unless the URL already carries one,
        # in which case the parameter is appended with '&'.
        separator = u'&' if u'?' in url else u'?'
        json_url = url + separator + u'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        # Every metadata field is a list; only its first entry is used.
        metadata = data['metadata']
        title = metadata['title'][0]
        description = metadata['description'][0]
        uploader = metadata['creator'][0]
        upload_date = unified_strdate(metadata['date'][0])

        formats = [
            {
                'format': fdata['format'],
                'url': 'http://' + data['server'] + data['dir'] + fn,
                'file_size': int(fdata['size']),
            }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata['format']
        ]
        # Smallest first, so formats[-1] is the largest file.
        formats.sort(key=lambda fmt: fmt['file_size'])

        info = {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
        }
        thumbnail = data.get('misc', {}).get('image')
        if thumbnail:
            info['thumbnail'] = thumbnail

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return self.video_result(info)