[extractor/common] add helper method to extract html5 media entries

2024-12-22 15:57:23 +00:00 · 2016-03-16 18:50:45 +01:00 · 2016-03-16 18:50:45 +01:00 · 59bbe4911a
commit 59bbe4911a
parent 4f3c5e0627
1 changed files with 58 additions and 0 deletions
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -54,6 +54,8 @@ from ..utils import (
    update_Request,
    update_url_query,
    parse_m3u8_attributes,
    extract_attributes,
    parse_codecs,
 )
@ -1610,6 +1612,62 @@ class InfoExtractor(object):
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats
    def _parse_html5_media_entries(self, base_url, webpage):
        """Collect media info from the <video>/<audio> elements of a page.

        base_url is the URL every relative URL found in the markup is
        resolved against; webpage is the HTML source to scan.

        Returns a list with one dict per media element that yielded at least
        one format. Each dict has the keys:
            formats:   list of format dicts (always with 'url' and 'vcodec';
                       formats coming from <source type=...> may carry
                       additional fields derived from the content type)
            subtitles: dict mapping a language to a list of {'url': ...}
            thumbnail: absolute URL of the element's poster, or None
        Media elements without any usable source are left out.
        """
        def absolute_url(video_url):
            # Resolve a possibly relative URL against the page URL.
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            # Turn the value of a "type" attribute, e.g.
            #   video/mp4; codecs="avc1.42E01E, mp4a.40.2"
            # into a partial format dict: whatever parse_codecs returns for
            # the codecs parameter plus 'ext' derived from the MIME type.
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}

        entries = []
        # Each match is (opening tag, tag name, inner HTML). Only elements
        # with an explicit closing tag are matched by this pattern.
        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            # A src attribute directly on the media element is a format
            # of its own.
            src = media_attributes.get('src')
            if src:
                media_info['formats'].append({
                    'url': absolute_url(src),
                    'vcodec': 'none' if media_type == 'audio' else None,
                })
            # The poster may be a relative URL just like any src, so resolve
            # it the same way. The key is always present (None when the
            # element has no poster).
            poster = media_attributes.get('poster')
            media_info['thumbnail'] = absolute_url(poster) if poster else None
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    f = parse_content_type(source_attributes.get('type'))
                    # NOTE(review): for <video> this resets 'vcodec' to None
                    # even if parse_codecs set one from the type attribute -
                    # confirm whether that is intended.
                    f.update({
                        'url': absolute_url(src),
                        'vcodec': 'none' if media_type == 'audio' else None,
                    })
                    media_info['formats'].append(f)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # Tracks without a kind attribute are handled like
                    # kind="subtitles"; every other kind is ignored.
                    if not kind or kind == 'subtitles':
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        # First non-empty of srclang, lang, label is used as
                        # the subtitles key (None when all are missing).
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            # Skip media elements that did not provide any playable URL.
            if media_info['formats']:
                entries.append(media_info)
        return entries
    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()