From edf3e38ebd6c5db21585dc7b6384e325e6cfb540 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 22 Sep 2013 10:30:02 +0200 Subject: [PATCH] [youtube] Improve cache and add an option to print the extracted signatures --- youtube_dl/FileDownloader.py | 2 +- youtube_dl/__init__.py | 6 +++ youtube_dl/extractor/youtube.py | 69 +++++++++++++++++++++++++++------ 3 files changed, 65 insertions(+), 12 deletions(-) diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 1eb71a80e..604714134 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -40,7 +40,7 @@ class FileDownloader(object): min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size cachedir: Location of the cache files in the filesystem. - False to disable filesystem cache. + "NONE" to disable filesystem cache. """ params = None diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 1ed30aae3..072f69f2e 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -167,6 +167,7 @@ def parseOpts(overrideArguments=None): help='Output descriptions of all supported extractors', default=False) general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') + general.add_option('--cache-dir', dest='cachedir', default=u'~/.youtube-dl/cache', help='Location in the filesystem where youtube-dl can store downloaded information permanently. NONE to disable filesystem caching, %default by default') selection.add_option('--playlist-start', @@ -272,6 +273,10 @@ def parseOpts(overrideArguments=None): verbosity.add_option('--dump-intermediate-pages', action='store_true', dest='dump_intermediate_pages', default=False, help='print downloaded pages to debug problems(very verbose)') + verbosity.add_option('--youtube-print-sig-code', + action='store_true', dest='youtube_print_sig_code', default=False, + help=optparse.SUPPRESS_HELP) + filesystem.add_option('-t', '--title', action='store_true', dest='usetitle', help='use title in file name (default)', default=False) @@ -613,6 +618,7 @@ def _real_main(argv=None): 'min_filesize': opts.min_filesize, 'max_filesize': opts.max_filesize, 'daterange': date, + 'youtube_print_sig_code': opts.youtube_print_sig_code }) if opts.verbose: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63f59ae8f..4200f987e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,13 +1,13 @@ # coding: utf-8 import collections +import errno import itertools import io import json import operator import os.path import re -import shutil import socket import string import struct @@ -17,6 +17,7 @@ import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_chr, compat_http_client, compat_parse_qs, compat_urllib_error, @@ -30,6 +31,7 @@ from ..utils import ( unescapeHTML, unified_strdate, orderedSet, + write_json_file, ) class YoutubeBaseInfoExtractor(InfoExtractor): @@ -433,18 +435,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Read from filesystem cache func_id = '%s_%s_%d' % (player_type, player_id, slen) assert os.path.basename(func_id) == func_id - cache_dir = self.downloader.params.get('cachedir', - u'~/.youtube-dl/cache') + cache_dir = self._downloader.params.get('cachedir', + u'~/.youtube-dl/cache') - if cache_dir is not False: + if cache_dir != u'NONE': cache_fn = os.path.join(os.path.expanduser(cache_dir), u'youtube-sigfuncs', func_id + '.json') try: - with io.open(cache_fn, '', encoding='utf-8') as cachef: + with io.open(cache_fn, 'r', encoding='utf-8') as cachef: cache_spec = json.load(cachef) return lambda s: u''.join(s[i] for i in cache_spec) - except OSError: + except IOError: pass # No cache available if player_type == 'js': @@ -464,13 +466,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): assert False, 'Invalid player type %r' % player_type if cache_dir is not False: - cache_res = res(map(compat_chr, range(slen))) - cache_spec = [ord(c) for c in cache_res] - shutil.makedirs(os.path.dirname(cache_fn)) - write_json_file(cache_spec, cache_fn) + try: + cache_res = res(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + try: + os.makedirs(os.path.dirname(cache_fn)) + except OSError as ose: + if ose.errno != errno.EEXIST: + raise + write_json_file(cache_spec, cache_fn) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning( + u'Writing cache to %r failed: %s' % (cache_fn, tb)) return res + def _print_sig_code(self, func, slen): + def gen_sig_code(idxs): + def _genslice(start, end, step): + starts = u'' if start == 0 else str(start) + ends = u':%d' % (end+step) + steps = u'' if step == 1 else (':%d' % step) + return u's[%s%s%s]' % (starts, ends, steps) + + step = None + for i, prev in zip(idxs[1:], idxs[:-1]): + if step is not None: + if i - prev == step: + continue + yield _genslice(start, prev, step) + step = None + continue + if i - prev in [-1, 1]: + step = i - prev + start = prev + continue + else: + yield u's[%d]' % prev + if step is None: + yield u's[%d]' % i + else: + yield _genslice(start, i, step) + + cache_res = func(map(compat_chr, range(slen))) + cache_spec = [ord(c) for c in cache_res] + expr_code = u' + '.join(gen_sig_code(cache_spec)) + code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code) + self.to_screen(u'Extracted signature:\n' + code) + def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, @@ -1007,7 +1051,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): video_id, player_url, len(s) ) self._player_cache[player_url] = func - return self._player_cache[player_url](s) + func = self._player_cache[player_url] + if self._downloader.params.get('youtube_print_sig_code'): + self._print_sig_code(func, len(s)) + return func(s) except Exception as e: tb = traceback.format_exc() self._downloader.report_warning(