From b6e0c7d2e3bb17b36a3b6e16fa8fd67092658d6c Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 9 Oct 2020 07:06:49 +0200 Subject: [PATCH 1/2] [mtv] fix mtv.com and more(?) --- youtube_dlc/extractor/mtv.py | 41 ++++++++++++++++++++++++++++++++++-- youtube_dlc/utils.py | 7 ++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index fedd5f46b..88c5eda38 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import ( compat_str, compat_xpath, + compat_urlparse, ) from ..utils import ( ExtractorError, @@ -22,6 +23,7 @@ from ..utils import ( unescapeHTML, update_url_query, url_basename, + get_domain, xpath_text, ) @@ -253,7 +255,39 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) - def _extract_mgid(self, webpage): + def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=None): + # print(compat_urlparse.urlparse(url).netloc) + domain = get_domain(url) + if domain is None: + raise ExtractorError( + '[%s] could not get domain' % self.IE_NAME, + expected=True) + url = url.replace("https://", "http://") + enc_url = compat_urlparse.quote(url, safe='') + _TRIFORCE_V8_TEMPLATE = 'https://%s/feeds/triforce/manifest/v8?url=%s' + triforce_manifest_url = _TRIFORCE_V8_TEMPLATE % (domain, enc_url) + + manifest = self._download_json(triforce_manifest_url, video_id, fatal=False) + if manifest.get('manifest').get('type') == 'redirect': + self.to_screen('Found a redirect. 
Downloading manifest from new location') + new_loc = manifest.get('manifest').get('newLocation') + new_loc = new_loc.replace("https://", "http://") + enc_new_loc = compat_urlparse.quote(new_loc, safe='') + triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) + manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) + + item_id = try_get(manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str) + if not item_id: + self.to_screen('Found no id!') + return + + # 'episode' can be anything. 'content' is used often as well + _MGID_TEMPLATE = 'mgid:arc:episode:%s:%s' + mgid = _MGID_TEMPLATE % (domain, item_id) + + return mgid + + def _extract_mgid(self, webpage, url): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -275,6 +309,9 @@ class MTVServicesInfoExtractor(InfoExtractor): mgid = self._search_regex( r'embed/(mgid:.+?)["\'&?/]', sm4_embed, 'mgid', default=None) + if not mgid: + mgid = self._extract_new_triforce_mgid(webpage, url) + if not mgid: mgid = self._extract_triforce_mgid(webpage) @@ -283,7 +320,7 @@ class MTVServicesInfoExtractor(InfoExtractor): def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) videos_info = self._get_videos_info(mgid) return videos_info diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 32b179c6f..54a4ea2aa 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -1984,6 +1984,7 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True): class HTMLAttributeParser(compat_HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" + def __init__(self): self.attrs = {} compat_HTMLParser.__init__(self) @@ -2378,6 +2379,7 @@ class GeoRestrictedError(ExtractorError): This exception may be thrown when a video is not available from your 
geographic location due to geographic restrictions imposed by a website. """ + def __init__(self, msg, countries=None): super(GeoRestrictedError, self).__init__(msg, expected=True) self.msg = msg @@ -3558,6 +3560,11 @@ def remove_quotes(s): return s +def get_domain(url): + domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url) + return domain.group('domain') if domain else None + + def url_basename(url): path = compat_urlparse.urlparse(url).path return path.strip('/').split('/')[-1] From cf7cb9428745dc744129e0ba90c626919fb98f48 Mon Sep 17 00:00:00 2001 From: Unknown Date: Fri, 9 Oct 2020 07:50:22 +0200 Subject: [PATCH 2/2] [mtvn] update mtv network related extractors --- youtube_dlc/extractor/bet.py | 2 ++ youtube_dlc/extractor/cmt.py | 6 ++++-- youtube_dlc/extractor/comedycentral.py | 2 +- youtube_dlc/extractor/mtv.py | 23 +++++++++++++---------- youtube_dlc/extractor/nick.py | 2 +- youtube_dlc/extractor/spike.py | 16 +++++++++++++--- youtube_dlc/extractor/vh1.py | 2 ++ 7 files changed, 36 insertions(+), 17 deletions(-) diff --git a/youtube_dlc/extractor/bet.py b/youtube_dlc/extractor/bet.py index d7ceaa85e..2c7144235 100644 --- a/youtube_dlc/extractor/bet.py +++ b/youtube_dlc/extractor/bet.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate +# TODO Remove - Reason: Outdated Site + class BetIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?bet\.com/(?:[^/]+/)+(?P<id>.+?)\.html' diff --git a/youtube_dlc/extractor/cmt.py b/youtube_dlc/extractor/cmt.py index e701fbeab..a4ddb9160 100644 --- a/youtube_dlc/extractor/cmt.py +++ b/youtube_dlc/extractor/cmt.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from .mtv import MTVIE +# TODO Remove - Reason: Outdated Site + class CMTIE(MTVIE): IE_NAME = 'cmt.com' @@ -39,7 +41,7 @@ class CMTIE(MTVIE): 'only_matching': True, }] - def _extract_mgid(self, webpage): + def _extract_mgid(self, 
webpage, url): mgid = self._search_regex( r'MTVN\.VIDEO\.contentUri\s*=\s*([\'"])(?P<mgid>.+?)\1', webpage, 'mgid', group='mgid', default=None) @@ -50,5 +52,5 @@ class CMTIE(MTVIE): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git a/youtube_dlc/extractor/comedycentral.py b/youtube_dlc/extractor/comedycentral.py index d08b909a6..f54c4adeb 100644 --- a/youtube_dlc/extractor/comedycentral.py +++ b/youtube_dlc/extractor/comedycentral.py @@ -48,7 +48,7 @@ class ComedyCentralFullEpisodesIE(MTVServicesInfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - mgid = self._extract_triforce_mgid(webpage, data_zone='t2_lc_promo1') + mgid = self._extract_mgid(webpage, url, data_zone='t2_lc_promo1') videos_info = self._get_videos_info(mgid) return videos_info diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 88c5eda38..e545a9ef3 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -255,8 +255,10 @@ class MTVServicesInfoExtractor(InfoExtractor): return try_get(feed, lambda x: x['result']['data']['id'], compat_str) - def _extract_new_triforce_mgid(self, webpage, url='', data_zone=None, video_id=None): + def _extract_new_triforce_mgid(self, webpage, url='', video_id=None): # print(compat_urlparse.urlparse(url).netloc) + if url == '': + return domain = get_domain(url) if domain is None: raise ExtractorError( @@ -268,13 +270,14 @@ class MTVServicesInfoExtractor(InfoExtractor): triforce_manifest_url = _TRIFORCE_V8_TEMPLATE % (domain, enc_url) manifest = self._download_json(triforce_manifest_url, video_id, fatal=False) - if manifest.get('manifest').get('type') == 'redirect': - self.to_screen('Found a redirect. 
Downloading manifest from new location') - new_loc = manifest.get('manifest').get('newLocation') - new_loc = new_loc.replace("https://", "http://") - enc_new_loc = compat_urlparse.quote(new_loc, safe='') - triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) - manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) + if manifest: + if manifest.get('manifest').get('type') == 'redirect': + self.to_screen('Found a redirect. Downloading manifest from new location') + new_loc = manifest.get('manifest').get('newLocation') + new_loc = new_loc.replace("https://", "http://") + enc_new_loc = compat_urlparse.quote(new_loc, safe='') + triforce_manifest_new_loc = _TRIFORCE_V8_TEMPLATE % (domain, enc_new_loc) + manifest = self._download_json(triforce_manifest_new_loc, video_id, fatal=False) item_id = try_get(manifest, lambda x: x['manifest']['reporting']['itemId'], compat_str) if not item_id: @@ -287,7 +290,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return mgid - def _extract_mgid(self, webpage, url): + def _extract_mgid(self, webpage, url, data_zone=None): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -313,7 +316,7 @@ class MTVServicesInfoExtractor(InfoExtractor): mgid = self._extract_new_triforce_mgid(webpage, url) if not mgid: - mgid = self._extract_triforce_mgid(webpage) + mgid = self._extract_triforce_mgid(webpage, data_zone) return mgid diff --git a/youtube_dlc/extractor/nick.py b/youtube_dlc/extractor/nick.py index 2e8b302ac..04b98f7bd 100644 --- a/youtube_dlc/extractor/nick.py +++ b/youtube_dlc/extractor/nick.py @@ -245,5 +245,5 @@ class NickRuIE(MTVServicesInfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - mgid = self._extract_mgid(webpage) + mgid = self._extract_mgid(webpage, url) return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid) diff --git 
a/youtube_dlc/extractor/spike.py b/youtube_dlc/extractor/spike.py index aabff7a3c..3cee331f6 100644 --- a/youtube_dlc/extractor/spike.py +++ b/youtube_dlc/extractor/spike.py @@ -20,8 +20,18 @@ class BellatorIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.bellator.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): - return self._extract_triforce_mgid(webpage) + def _extract_mgid(self, webpage, url): + mgid = None + + if not mgid: + mgid = self._extract_triforce_mgid(webpage) + + if not mgid: + mgid = self._extract_new_triforce_mgid(webpage, url) + + return mgid + +# TODO Remove - Reason: Outdated Site class ParamountNetworkIE(MTVServicesInfoExtractor): @@ -43,7 +53,7 @@ class ParamountNetworkIE(MTVServicesInfoExtractor): _FEED_URL = 'http://www.paramountnetwork.com/feeds/mrss/' _GEO_COUNTRIES = ['US'] - def _extract_mgid(self, webpage): + def _extract_mgid(self, webpage, url): root_data = self._parse_json(self._search_regex( r'window\.__DATA__\s*=\s*({.+})', webpage, 'data'), None) diff --git a/youtube_dlc/extractor/vh1.py b/youtube_dlc/extractor/vh1.py index dff94a2b8..ea576dc6b 100644 --- a/youtube_dlc/extractor/vh1.py +++ b/youtube_dlc/extractor/vh1.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .mtv import MTVServicesInfoExtractor +# TODO Remove - Reason: Outdated Site + class VH1IE(MTVServicesInfoExtractor): IE_NAME = 'vh1.com'