[tiktok] fix extraction

This commit is contained in:
Aakash Gajjar 2019-10-25 13:35:54 +05:30
parent 162bcc68dc
commit 4b6d03ed87
2 changed files with 216 additions and 105 deletions

View File

@ -1151,10 +1151,7 @@ from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE from .threeqsdn import ThreeQSDNIE
from .tiktok import ( from .tiktok import TikTokIE
TikTokIE,
TikTokUserIE,
)
from .tinypic import TinyPicIE from .tinypic import TinyPicIE
from .tmz import ( from .tmz import (
TMZIE, TMZIE,

View File

@ -1,138 +1,252 @@
# coding: utf-8 # coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
from datetime import datetime
import re
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import ( from ..utils import (
compat_str,
ExtractorError, ExtractorError,
int_or_none, int_or_none,
str_or_none, str_or_none,
try_get, try_get
url_or_none,
) )
class TikTokBaseIE(InfoExtractor): class TikTokBaseIE(InfoExtractor):
def _extract_aweme(self, data): def _video_info(self, video_info):
video = data['video'] return {
description = str_or_none(try_get(data, lambda x: x['desc'])) 'id': str_or_none(video_info.get('id')),
width = int_or_none(try_get(data, lambda x: video['width'])) 'thumbnail': try_get(video_info, lambda x: x['covers'][0], str) or try_get(video_info, lambda x: x['video']['videoMeta']['cover'][0], str),
height = int_or_none(try_get(data, lambda x: video['height'])) 'video_url': try_get(video_info, lambda x: x['video']['urls'][0], str) or video_info.get('video', {}).get('urls', [None])[0],
'width': try_get(video_info, lambda x: x['video']['videoMeta']['width'], int) or try_get(video_info, lambda x: x['width'], int),
'height': try_get(video_info, lambda x: x['video']['videoMeta']['height'], int) or try_get(video_info, lambda x: x['height'], int),
'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int),
'description': str_or_none(video_info.get('text')),
'comment_count': int_or_none(video_info.get('commentCount')),
'like_count': int_or_none(video_info.get('diggCount')),
'repost_count': int_or_none(video_info.get('shareCount')),
'timestamp': str_or_none(video_info.get('createTime')),
'track_id': str_or_none(video_info.get('musicId'))
}
def _author_info(self, author_info):
    """Map the raw author dictionary onto uploader-related fields.

    :param author_info: dictionary read for the ``uniqueId``, ``nickName``
        and ``userId`` keys. ``None`` (or any other falsy value) is treated
        as an empty dictionary, so a missing author section produces a
        result built from absent keys instead of raising ``AttributeError``.
    :return: dictionary with the ``uploader``, ``creator``, ``uploader_id``
        and ``channel_id`` keys, each value passed through ``str_or_none``.
    """
    # The caller obtains this value through try_get(), which may hand back
    # None when the section is absent from the page data.
    author_info = author_info or {}
    # The same userId value backs both the uploader id and the channel id.
    user_id = str_or_none(author_info.get('userId'))
    return {
        'uploader': str_or_none(author_info.get('uniqueId')),
        'creator': str_or_none(author_info.get('nickName')),
        'uploader_id': user_id,
        'channel_id': user_id,
    }
def _track_info(self, track_info):
    """Map the raw music dictionary onto track-related fields.

    :param track_info: dictionary read for the ``musicName``, ``musicId``
        and ``authorName`` keys. ``None`` (or any other falsy value) is
        treated as an empty dictionary, so a missing music section produces
        a result built from absent keys instead of raising
        ``AttributeError``.
    :return: dictionary with the ``track``, ``track_id`` and ``artist``
        keys, each value passed through ``str_or_none``.
    """
    # The caller obtains this value through try_get(), which may hand back
    # None when the section is absent from the page data.
    track_info = track_info or {}
    return {
        'track': str_or_none(track_info.get('musicName')),
        'track_id': str_or_none(track_info.get('musicId')),
        'artist': str_or_none(track_info.get('authorName')),
    }
def _share_info(self, share_info):
    """Map the raw share-metadata dictionary onto title/description/image fields.

    :param share_info: dictionary read for the ``title``, ``desc`` and
        ``image`` keys (``image`` is expected to be a dictionary carrying
        ``width`` and ``height``). ``None`` (or any other falsy value) is
        treated as an empty dictionary, so missing share metadata no longer
        raises ``AttributeError`` on the ``.get`` lookups.
    :return: dictionary with the ``title``, ``description``, ``image``,
        ``width`` and ``height`` keys. ``title`` and ``description`` are
        passed through ``str_or_none``; the remaining values are looked up
        with ``try_get`` using ``dict`` / ``int`` as the expected type.
    """
    # The caller obtains this value through try_get(), which may hand back
    # None when the section is absent from the page data.
    share_info = share_info or {}
    return {
        'title': str_or_none(share_info.get('title')),
        'description': str_or_none(share_info.get('desc')),
        'image': try_get(share_info, lambda x: x['image'], dict),
        'width': try_get(share_info, lambda x: x['image']['width'], int),
        'height': try_get(share_info, lambda x: x['image']['height'], int),
    }
def _extract_aweme(self, video_data, webpage):
video_info_data = try_get(
video_data, lambda x: x['videoData']['itemInfos'], dict)
author_info_data = try_get(
video_data, lambda x: x['videoData']['authorInfos'], dict)
track_info_data = try_get(
video_data, lambda x: x['videoData']['musicInfos'], dict)
share_info_data = try_get(video_data, lambda x: x['shareMeta'], dict)
video_info = self._video_info(video_info_data)
author_info = self._author_info(author_info_data)
track_info = self._track_info(track_info_data)
share_info = self._share_info(share_info_data)
timestamp = int(video_info.get('timestamp')) or 0
date = str_or_none(datetime.fromtimestamp(
timestamp).strftime('%Y%m%d'))
thumbnails = []
thumbnails.append({
'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage),
'width': video_info.get('width'),
'height': video_info.get('height')
})
description = video_info.get(
'description') or share_info.get('description')
if description is None:
tags = []
else:
tags = re.findall(r"#(\w+)", description)
format_urls = set()
formats = [] formats = []
for format_id in ( formats.append({
'play_addr_lowbr', 'play_addr', 'play_addr_h264', 'url': video_info.get('video_url') or self._og_search_video_url(webpage),
'download_addr'): 'ext': 'mp4',
for format in try_get( 'height': video_info.get('height'),
video, lambda x: x[format_id]['url_list'], list) or []: 'width': video_info.get('width'),
format_url = url_or_none(format) })
if not format_url:
continue
if format_url in format_urls:
continue
format_urls.add(format_url)
formats.append({
'url': format_url,
'ext': 'mp4',
'height': height,
'width': width,
})
self._sort_formats(formats)
thumbnail = url_or_none(try_get(
video, lambda x: x['cover']['url_list'][0], compat_str))
uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
timestamp = int_or_none(data.get('create_time'))
comment_count = int_or_none(data.get('comment_count')) or int_or_none(
try_get(data, lambda x: x['statistics']['comment_count']))
repost_count = int_or_none(try_get(
data, lambda x: x['statistics']['share_count']))
aweme_id = data['aweme_id']
return { return {
'id': aweme_id, 'artist': track_info.get('artist'),
'title': uploader or aweme_id, 'channel_id': author_info.get('channel_id'),
'channel_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')),
'comment_count': video_info.get('comment_count'),
'creator': author_info.get('creator'),
'description': description, 'description': description,
'thumbnail': thumbnail, 'duration': video_info.get('duration'),
'uploader': uploader,
'timestamp': timestamp,
'comment_count': comment_count,
'repost_count': repost_count,
'formats': formats, 'formats': formats,
'height': video_info.get('height'),
'id': video_info.get('id'),
'like_count': video_info.get('like_count'),
'playlist_title': share_info.get('title'),
'playlist_uploader': author_info.get('uploader'),
'playlist_uploader_id': author_info.get('uploader_id'),
'repost_count': video_info.get('repost_count'),
'release_date': date,
'tags': tags,
'thumbnail': video_info.get('thumbnail'),
'thumbnails': thumbnails,
'timestamp': int(video_info.get('timestamp')),
'title': share_info.get('title') or self._og_search_title(webpage),
'track': track_info.get('track'),
'track_id': track_info.get('track_id'),
'upload_date': date,
'uploader': author_info.get('uploader'),
'uploader_id': author_info.get('uploader_id'),
'uploader_url': 'https://www.tiktok.com/@{}'.format(author_info.get('uploader')),
'webpage_url': self._og_search_url(webpage),
'width': video_info.get('width')
} }
class TikTokIE(TikTokBaseIE): class TikTokIE(TikTokBaseIE):
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
(?: (?:
(?:m\.)?tiktok\.com/v| (?:www|m)\.
(?:www\.)?tiktok\.com/share/video (?:tiktok.com)\/
) (@(?P<username>[\w\.]+))?
/(?P<id>\d+) (?:v|video|embed|trending)?(?:\/)?
''' (?:video)?(?:\/)?
(?:\?shareId=)?
)
(?P<id>[\d]{6,})
(?:\.html)?
(?:\?.*)?
$
'''
_TESTS = [{ _TESTS = [{
'url': 'https://m.tiktok.com/v/6606727368545406213.html', 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610',
'md5': 'd584b572e92fcd48888051f238022420', 'md5': '34a7543afd5a151b0840ba6736fb633b',
'info_dict': { 'info_dict': {
'id': '6606727368545406213', 'id': '6748451240264420610',
'ext': 'mp4', 'ext': 'mp4',
'title': 'Zureeal', 'title': 'facestoriesbyleenabh on TikTok',
'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', 'description': 'md5:a9f6c0c44a1ff2249cae610372d0ae95',
'thumbnail': r're:^https?://.*~noop.image', 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'uploader': 'Zureeal', 'uploader': 'leenabhushan',
'timestamp': 1538248586, 'timestamp': 1571246252,
'upload_date': '20180929', 'upload_date': '20191016',
'comment_count': int, 'comment_count': int,
'repost_count': int, 'repost_count': int,
'like_count': int,
'playlist_title': 'facestoriesbyleenabh on TikTok',
'playlist_uploader': 'leenabhushan',
'playlist_uploader_id': '6691488002098119685',
'artist': 'Jass Manak',
'channel_id': '6691488002098119685',
'channel_url': 'https://www.tiktok.com/@leenabhushan',
'creator': 'facestoriesbyleenabh',
'duration': 13,
'formats': list,
'height': 1280,
'release_date': '20191016',
'tags': list,
'thumbnails': list,
'track': 'Lehanga',
'track_id': '6716465478027447045',
'uploader_id': '6691488002098119685',
'uploader_url': r're:https://www.tiktok.com/@leenabhushan',
'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610',
'width': 720,
} }
}, { }, {
'url': 'https://www.tiktok.com/share/video/6606727368545406213', 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en',
'only_matching': True, 'md5': '06b9800d47d5fe51a19e322dd86e61c9',
'info_dict': {
'artist': 'Evan Todd, Jessica Keenan Wynn, Alice Lee, Barrett Wilbert Weed & Jon Eidson',
'channel_id': '18702747',
'channel_url': 'https://www.tiktok.com/@patroxofficial',
'comment_count': int,
'creator': 'patroX',
'description': 'md5:5e2a23877420bb85ce6521dbee39ba94',
'duration': 27,
'ext': 'mp4',
'formats': list,
'height': 960,
'id': '6742501081818877190',
'like_count': int,
'playlist_title': 'patroX on TikTok',
'playlist_uploader_id': '18702747',
'playlist_uploader': 'patroxofficial',
'release_date': '20190930',
'repost_count': int,
'tags': list,
'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?',
'thumbnails': list,
'timestamp': 1569860870,
'title': 'patroX on TikTok',
'track_id': '209649576000286720',
'track': 'Big Fun',
'upload_date': '20190930',
'uploader_id': '18702747',
'uploader_url': r're:https://www.tiktok.com/@patroxofficial',
'uploader': 'patroxofficial',
'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190',
'width': 540,
}
}, {
'url': 'https://m.tiktok.com/v/6749869095467945218.html',
'only_matching': True
}, {
'url': 'https://www.tiktok.com/@cchelseameow/video/6751181801206729990',
'only_matching': True
}, {
'url': 'https://www.tiktok.com/embed/6567659045795758085',
'only_matching': True
}, {
'url': 'https://www.tiktok.com/trending?shareId=6744531482393545985',
'only_matching': True
}, {
'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610?enter_from=h5_m',
'only_matching': True
}] }]
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(
'https://m.tiktok.com/v/%s.html' % video_id, video_id)
data = self._parse_json(self._search_regex(
r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)
return self._extract_aweme(data)
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0',
'Referer': url
}
webpage = self._download_webpage(url, video_id, headers=headers, note='Downloading video webpage')
json_string = self._search_regex(
r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\">\s*(?P<json_string>[^<]+)',
webpage, 'json_string', group='json_string')
json_data = self._parse_json(json_string, video_id)
video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict)
class TikTokUserIE(TikTokBaseIE): # Check statusCode for success
_VALID_URL = r'''(?x) if video_data.get('statusCode') == 0:
https?:// return self._extract_aweme(video_data, webpage)
(?:
(?:m\.)?tiktok\.com/h5/share/usr| raise ExtractorError("Video not available", video_id=video_id)
(?:www\.)?tiktok\.com/share/user
)
/(?P<id>\d+)
'''
_TESTS = [{
'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',
'info_dict': {
'id': '188294915489964032',
},
'playlist_mincount': 24,
}, {
'url': 'https://www.tiktok.com/share/user/188294915489964032',
'only_matching': True,
}]
def _real_extract(self, url):
    """Build a playlist from the video list of a user page.

    The id matched from *url* is used to request the user's share list as
    JSON. Every item under ``aweme_list`` is handed to ``_extract_aweme``;
    items for which that call raises ``ExtractorError`` are left out.

    :param url: user page URL matching this extractor.
    :return: the value of ``playlist_result`` for the collected entries,
        keyed by the user id.
    """
    user_id = self._match_id(url)
    listing = self._download_json(
        'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
        query={'_signature': '_'})

    def _entries():
        for item in listing['aweme_list']:
            try:
                info = self._extract_aweme(item)
            except ExtractorError:
                # Best effort: skip items that cannot be extracted.
                continue
            # Tag each entry with the single-video extractor's key.
            info['extractor_key'] = TikTokIE.ie_key()
            yield info

    return self.playlist_result(list(_entries()), user_id)