[youtube] Improve cache and add an option to print the extracted signatures

This commit is contained in:
Philipp Hagemeister 2013-09-22 10:30:02 +02:00
parent c4417ddb61
commit edf3e38ebd
3 changed files with 65 additions and 12 deletions

View File

@ -40,7 +40,7 @@ class FileDownloader(object):
min_filesize: Skip files smaller than this size min_filesize: Skip files smaller than this size
max_filesize: Skip files larger than this size max_filesize: Skip files larger than this size
cachedir: Location of the cache files in the filesystem. cachedir: Location of the cache files in the filesystem.
False to disable filesystem cache. "NONE" to disable filesystem cache.
""" """
params = None params = None

View File

@ -167,6 +167,7 @@ def parseOpts(overrideArguments=None):
help='Output descriptions of all supported extractors', default=False) help='Output descriptions of all supported extractors', default=False)
general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option('--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', help='Location in the filesystem where youtube-dl can store downloaded information permanently. NONE to disable filesystem caching, %default by default')
selection.add_option('--playlist-start', selection.add_option('--playlist-start',
@ -272,6 +273,10 @@ def parseOpts(overrideArguments=None):
verbosity.add_option('--dump-intermediate-pages', verbosity.add_option('--dump-intermediate-pages',
action='store_true', dest='dump_intermediate_pages', default=False, action='store_true', dest='dump_intermediate_pages', default=False,
help='print downloaded pages to debug problems(very verbose)') help='print downloaded pages to debug problems(very verbose)')
verbosity.add_option('--youtube-print-sig-code',
action='store_true', dest='youtube_print_sig_code', default=False,
help=optparse.SUPPRESS_HELP)
filesystem.add_option('-t', '--title', filesystem.add_option('-t', '--title',
action='store_true', dest='usetitle', help='use title in file name (default)', default=False) action='store_true', dest='usetitle', help='use title in file name (default)', default=False)
@ -613,6 +618,7 @@ def _real_main(argv=None):
'min_filesize': opts.min_filesize, 'min_filesize': opts.min_filesize,
'max_filesize': opts.max_filesize, 'max_filesize': opts.max_filesize,
'daterange': date, 'daterange': date,
'youtube_print_sig_code': opts.youtube_print_sig_code
}) })
if opts.verbose: if opts.verbose:

View File

@ -1,13 +1,13 @@
# coding: utf-8 # coding: utf-8
import collections import collections
import errno
import itertools import itertools
import io import io
import json import json
import operator import operator
import os.path import os.path
import re import re
import shutil
import socket import socket
import string import string
import struct import struct
@ -17,6 +17,7 @@ import zlib
from .common import InfoExtractor, SearchInfoExtractor from .common import InfoExtractor, SearchInfoExtractor
from .subtitles import SubtitlesInfoExtractor from .subtitles import SubtitlesInfoExtractor
from ..utils import ( from ..utils import (
compat_chr,
compat_http_client, compat_http_client,
compat_parse_qs, compat_parse_qs,
compat_urllib_error, compat_urllib_error,
@ -30,6 +31,7 @@ from ..utils import (
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
orderedSet, orderedSet,
write_json_file,
) )
class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeBaseInfoExtractor(InfoExtractor):
@ -433,18 +435,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
# Read from filesystem cache # Read from filesystem cache
func_id = '%s_%s_%d' % (player_type, player_id, slen) func_id = '%s_%s_%d' % (player_type, player_id, slen)
assert os.path.basename(func_id) == func_id assert os.path.basename(func_id) == func_id
cache_dir = self.downloader.params.get('cachedir', cache_dir = self._downloader.params.get('cachedir',
u'~/.youtube-dl/cache') u'~/.youtube-dl/cache')
if cache_dir is not False: if cache_dir != u'NONE':
cache_fn = os.path.join(os.path.expanduser(cache_dir), cache_fn = os.path.join(os.path.expanduser(cache_dir),
u'youtube-sigfuncs', u'youtube-sigfuncs',
func_id + '.json') func_id + '.json')
try: try:
with io.open(cache_fn, '', encoding='utf-8') as cachef: with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
cache_spec = json.load(cachef) cache_spec = json.load(cachef)
return lambda s: u''.join(s[i] for i in cache_spec) return lambda s: u''.join(s[i] for i in cache_spec)
except OSError: except IOError:
pass # No cache available pass # No cache available
if player_type == 'js': if player_type == 'js':
@ -464,13 +466,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
assert False, 'Invalid player type %r' % player_type assert False, 'Invalid player type %r' % player_type
if cache_dir is not False: if cache_dir is not False:
cache_res = res(map(compat_chr, range(slen))) try:
cache_spec = [ord(c) for c in cache_res] cache_res = res(map(compat_chr, range(slen)))
shutil.makedirs(os.path.dirname(cache_fn)) cache_spec = [ord(c) for c in cache_res]
write_json_file(cache_spec, cache_fn) try:
os.makedirs(os.path.dirname(cache_fn))
except OSError as ose:
if ose.errno != errno.EEXIST:
raise
write_json_file(cache_spec, cache_fn)
except Exception as e:
tb = traceback.format_exc()
self._downloader.report_warning(
u'Writing cache to %r failed: %s' % (cache_fn, tb))
return res return res
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to the signature function ``func``.

    ``func`` is probed with a string made of the code points
    0 .. slen-1; the ordinals of the characters it returns are the
    source indices it picks, in output order.  Those indices are
    rendered as a sum of index/slice expressions over ``s`` and shown
    via ``self.to_screen``.

    func -- callable taking the signature string and returning the
            rearranged string
    slen -- length of the signature the function was extracted for
    """
    def gen_sig_code(idxs):
        """Yield ``s[...]`` expressions that together cover ``idxs``."""
        def _genslice(start, end, step):
            # ``end`` is inclusive here, so the slice stop is one step
            # further.
            stop = end + step
            starts = u'' if start == 0 else u'%d' % start
            # A descending run that reaches index 0 would get stop -1,
            # which Python reads as "last element" and produces an
            # empty slice; leave the stop open instead.
            ends = (u':%d' % stop) if stop >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        if not idxs:
            return

        step = None   # +1 / -1 while inside a run, None otherwise
        start = None  # first index of the current run; set with step
        # Seed ``i`` so that a single-element list (where the loop body
        # never executes) is still emitted by the epilogue below.
        i = idxs[0]
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                if i - prev == step:
                    continue
                # Run ended at ``prev``; ``i`` is handled as ``prev``
                # on the next iteration (or by the epilogue).
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    # Pass a real string rather than the result of map(): on Python 3
    # map() is a lazy iterator that cannot be indexed, while the
    # signature functions are applied to strings.
    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    # Fall back to an empty-string literal when no index is selected so
    # the printed ``return`` statement still yields a string.
    expr_code = u' + '.join(gen_sig_code(cache_spec)) or u"u''"
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature:\n' + code)
def _parse_sig_js(self, jscode): def _parse_sig_js(self, jscode):
funcname = self._search_regex( funcname = self._search_regex(
r'signature=([a-zA-Z]+)', jscode, r'signature=([a-zA-Z]+)', jscode,
@ -1007,7 +1051,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
video_id, player_url, len(s) video_id, player_url, len(s)
) )
self._player_cache[player_url] = func self._player_cache[player_url] = func
return self._player_cache[player_url](s) func = self._player_cache[player_url]
if self._downloader.params.get('youtube_print_sig_code'):
self._print_sig_code(func, len(s))
return func(s)
except Exception as e: except Exception as e:
tb = traceback.format_exc() tb = traceback.format_exc()
self._downloader.report_warning( self._downloader.report_warning(