diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 1524fcb15..b14cf0fc9 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -290,6 +290,7 @@ class InfoExtractor(object): categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] + cast: A list of the video cast is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. was_live: True, False, or None (=unknown). Whether this video was diff --git a/yt_dlp/extractor/pornhub.py b/yt_dlp/extractor/pornhub.py index 031454600..cf407a813 100644 --- a/yt_dlp/extractor/pornhub.py +++ b/yt_dlp/extractor/pornhub.py @@ -14,6 +14,7 @@ from ..compat import ( ) from .openload import PhantomJSwrapper from ..utils import ( + clean_html, determine_ext, ExtractorError, int_or_none, @@ -145,6 +146,7 @@ class PornHubIE(PornHubBaseIE): 'age_limit': 18, 'tags': list, 'categories': list, + 'cast': list, }, }, { # non-ASCII title @@ -464,7 +466,7 @@ class PornHubIE(PornHubBaseIE): r'(?s)]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)' % meta_key, webpage, meta_key, default=None) if div: - return re.findall(r']+\bhref=[^>]+>([^<]+)', div) + return [clean_html(x).strip() for x in re.findall(r'(?s)]+\bhref=[^>]+>.+?', div)] info = self._search_json_ld(webpage, video_id, default={}) # description provided in JSON-LD is irrelevant @@ -485,6 +487,7 @@ class PornHubIE(PornHubBaseIE): 'age_limit': 18, 'tags': extract_list('tags'), 'categories': extract_list('categories'), + 'cast': extract_list('pornstars'), 'subtitles': subtitles, }, info)