mirror of https://github.com/morpheus65535/bazarr
WIP
This commit is contained in:
parent
4e7e3a39d2
commit
645952c61a
22
bazarr.py
22
bazarr.py
|
@ -1,5 +1,7 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import print_function
|
||||||
import subprocess as sp
|
import subprocess as sp
|
||||||
import time
|
import time
|
||||||
import os
|
import os
|
||||||
|
@ -12,14 +14,16 @@ from bazarr.get_args import args
|
||||||
def check_python_version():
|
def check_python_version():
|
||||||
python_version = platform.python_version_tuple()
|
python_version = platform.python_version_tuple()
|
||||||
minimum_python_version_tuple = (2, 7, 13)
|
minimum_python_version_tuple = (2, 7, 13)
|
||||||
|
minimum_python3_version_tuple = (3, 6, 0)
|
||||||
minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
|
minimum_python_version = ".".join(str(i) for i in minimum_python_version_tuple)
|
||||||
|
minimum_python3_version = ".".join(str(i) for i in minimum_python3_version_tuple)
|
||||||
|
|
||||||
if int(python_version[0]) > minimum_python_version_tuple[0]:
|
if int(python_version[0]) == minimum_python3_version_tuple[0] and int(python_version[1]) < minimum_python3_version_tuple[1]:
|
||||||
print "Python 3 isn't supported. Please use Python " + minimum_python_version + " or greater."
|
print("Python " + minimum_python3_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
|
||||||
os._exit(0)
|
os._exit(0)
|
||||||
|
|
||||||
elif int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]:
|
elif int(python_version[0]) == minimum_python_version_tuple[0] and (int(python_version[1]) < minimum_python_version_tuple[1] or int(python_version[2].rstrip('+')) < minimum_python_version_tuple[2]):
|
||||||
print "Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python."
|
print("Python " + minimum_python_version + " or greater required. Current version is " + platform.python_version() + ". Please upgrade Python.")
|
||||||
os._exit(0)
|
os._exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
@ -32,10 +36,10 @@ def start_bazarr():
|
||||||
script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
|
script = [sys.executable, "-u", os.path.normcase(os.path.join(dir_name, 'bazarr', 'main.py'))] + sys.argv[1:]
|
||||||
|
|
||||||
ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
|
ep = sp.Popen(script, stdout=sp.PIPE, stderr=sp.STDOUT, stdin=sp.PIPE)
|
||||||
print "Bazarr starting..."
|
print("Bazarr starting...")
|
||||||
try:
|
try:
|
||||||
for line in iter(ep.stdout.readline, ''):
|
for line in iter(ep.stdout.readline, ''):
|
||||||
sys.stdout.write(line)
|
sys.stdout.buffer.write(line)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -60,16 +64,16 @@ if __name__ == '__main__':
|
||||||
try:
|
try:
|
||||||
os.remove(stopfile)
|
os.remove(stopfile)
|
||||||
except:
|
except:
|
||||||
print 'Unable to delete stop file.'
|
print('Unable to delete stop file.')
|
||||||
else:
|
else:
|
||||||
print 'Bazarr exited.'
|
print('Bazarr exited.')
|
||||||
os._exit(0)
|
os._exit(0)
|
||||||
|
|
||||||
if os.path.exists(restartfile):
|
if os.path.exists(restartfile):
|
||||||
try:
|
try:
|
||||||
os.remove(restartfile)
|
os.remove(restartfile)
|
||||||
except:
|
except:
|
||||||
print 'Unable to delete restart file.'
|
print('Unable to delete restart file.')
|
||||||
else:
|
else:
|
||||||
start_bazarr()
|
start_bazarr()
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
import cPickle as pickle
|
from __future__ import absolute_import
|
||||||
|
import six.moves.cPickle as pickle
|
||||||
import base64
|
import base64
|
||||||
import random
|
import random
|
||||||
import platform
|
import platform
|
||||||
|
@ -30,7 +31,7 @@ def track_event(category=None, action=None, label=None):
|
||||||
visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
|
visitor = pickle.loads(base64.b64decode(settings.analytics.visitor))
|
||||||
except:
|
except:
|
||||||
visitor = Visitor()
|
visitor = Visitor()
|
||||||
unique_id = long(random.getrandbits(32))
|
unique_id = int(random.getrandbits(32))
|
||||||
visitor.unique_id = unique_id
|
visitor.unique_id = unique_id
|
||||||
|
|
||||||
session = Session()
|
session = Session()
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from simpleconfigparser import simpleconfigparser
|
from simpleconfigparser import simpleconfigparser
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import atexit
|
import atexit
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
import enzyme
|
import enzyme
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import argparse
|
import argparse
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import pycountry
|
import pycountry
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
|
@ -13,6 +14,7 @@ from list_subtitles import store_subtitles_movie, list_missing_subtitles_movies,
|
||||||
|
|
||||||
from get_subtitle import movies_download_subtitles
|
from get_subtitle import movies_download_subtitles
|
||||||
from database import TableMovies, wal_cleaning
|
from database import TableMovies, wal_cleaning
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
def update_all_movies():
|
def update_all_movies():
|
||||||
|
@ -82,7 +84,7 @@ def update_movies():
|
||||||
|
|
||||||
if movie["path"] != None and movie['movieFile']['relativePath'] != None:
|
if movie["path"] != None and movie['movieFile']['relativePath'] != None:
|
||||||
try:
|
try:
|
||||||
overview = unicode(movie['overview'])
|
overview = six.text_type(movie['overview'])
|
||||||
except:
|
except:
|
||||||
overview = ""
|
overview = ""
|
||||||
try:
|
try:
|
||||||
|
@ -136,27 +138,27 @@ def update_movies():
|
||||||
audioCodec = None
|
audioCodec = None
|
||||||
|
|
||||||
# Add movies in radarr to current movies list
|
# Add movies in radarr to current movies list
|
||||||
current_movies_radarr.append(unicode(movie['tmdbId']))
|
current_movies_radarr.append(six.text_type(movie['tmdbId']))
|
||||||
|
|
||||||
if unicode(movie['tmdbId']) in current_movies_db_list:
|
if six.text_type(movie['tmdbId']) in current_movies_db_list:
|
||||||
movies_to_update.append({'radarr_id': movie["id"],
|
movies_to_update.append({'radarr_id': movie["id"],
|
||||||
'title': unicode(movie["title"]),
|
'title': six.text_type(movie["title"]),
|
||||||
'path': unicode(movie["path"] + separator + movie['movieFile']['relativePath']),
|
'path': six.text_type(movie["path"] + separator + movie['movieFile']['relativePath']),
|
||||||
'tmdb_id': unicode(movie["tmdbId"]),
|
'tmdb_id': six.text_type(movie["tmdbId"]),
|
||||||
'poster': unicode(poster),
|
'poster': six.text_type(poster),
|
||||||
'fanart': unicode(fanart),
|
'fanart': six.text_type(fanart),
|
||||||
'audio_language': unicode(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
|
'audio_language': six.text_type(profile_id_to_language(movie['qualityProfileId'], audio_profiles)),
|
||||||
'scene_name': sceneName,
|
'scene_name': sceneName,
|
||||||
'monitored': unicode(bool(movie['monitored'])),
|
'monitored': six.text_type(bool(movie['monitored'])),
|
||||||
'year': unicode(movie['year']),
|
'year': six.text_type(movie['year']),
|
||||||
'sort_title': unicode(movie['sortTitle']),
|
'sort_title': six.text_type(movie['sortTitle']),
|
||||||
'alternative_titles': unicode(alternativeTitles),
|
'alternative_titles': six.text_type(alternativeTitles),
|
||||||
'format': unicode(format),
|
'format': six.text_type(format),
|
||||||
'resolution': unicode(resolution),
|
'resolution': six.text_type(resolution),
|
||||||
'video_codec': unicode(videoCodec),
|
'video_codec': six.text_type(videoCodec),
|
||||||
'audio_codec': unicode(audioCodec),
|
'audio_codec': six.text_type(audioCodec),
|
||||||
'overview': unicode(overview),
|
'overview': six.text_type(overview),
|
||||||
'imdb_id': unicode(imdbId)})
|
'imdb_id': six.text_type(imdbId)})
|
||||||
else:
|
else:
|
||||||
if movie_default_enabled is True:
|
if movie_default_enabled is True:
|
||||||
movies_to_add.append({'radarr_id': movie["id"],
|
movies_to_add.append({'radarr_id': movie["id"],
|
||||||
|
@ -171,7 +173,7 @@ def update_movies():
|
||||||
'fanart': fanart,
|
'fanart': fanart,
|
||||||
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
|
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
|
||||||
'scene_name': sceneName,
|
'scene_name': sceneName,
|
||||||
'monitored': unicode(bool(movie['monitored'])),
|
'monitored': six.text_type(bool(movie['monitored'])),
|
||||||
'sort_title': movie['sortTitle'],
|
'sort_title': movie['sortTitle'],
|
||||||
'year': movie['year'],
|
'year': movie['year'],
|
||||||
'alternative_titles': alternativeTitles,
|
'alternative_titles': alternativeTitles,
|
||||||
|
@ -191,7 +193,7 @@ def update_movies():
|
||||||
'fanart': fanart,
|
'fanart': fanart,
|
||||||
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
|
'audio_language': profile_id_to_language(movie['qualityProfileId'], audio_profiles),
|
||||||
'scene_name': sceneName,
|
'scene_name': sceneName,
|
||||||
'monitored': unicode(bool(movie['monitored'])),
|
'monitored': six.text_type(bool(movie['monitored'])),
|
||||||
'sort_title': movie['sortTitle'],
|
'sort_title': movie['sortTitle'],
|
||||||
'year': movie['year'],
|
'year': movie['year'],
|
||||||
'alternative_titles': alternativeTitles,
|
'alternative_titles': alternativeTitles,
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import datetime
|
import datetime
|
||||||
import logging
|
import logging
|
||||||
|
@ -159,8 +160,8 @@ def provider_throttle(name, exception):
|
||||||
|
|
||||||
def throttled_count(name):
|
def throttled_count(name):
|
||||||
global throttle_count
|
global throttle_count
|
||||||
if name in throttle_count.keys():
|
if name in list(throttle_count.keys()):
|
||||||
if 'count' in throttle_count[name].keys():
|
if 'count' in list(throttle_count[name].keys()):
|
||||||
for key, value in throttle_count[name].items():
|
for key, value in throttle_count[name].items():
|
||||||
if key == 'count':
|
if key == 'count':
|
||||||
value += 1
|
value += 1
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
|
from __future__ import print_function
|
||||||
import os
|
import os
|
||||||
import requests
|
import requests
|
||||||
import logging
|
import logging
|
||||||
|
@ -12,6 +14,7 @@ from config import settings, url_sonarr
|
||||||
from list_subtitles import list_missing_subtitles
|
from list_subtitles import list_missing_subtitles
|
||||||
from database import TableShows
|
from database import TableShows
|
||||||
from utils import get_sonarr_version
|
from utils import get_sonarr_version
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
def update_series():
|
def update_series():
|
||||||
|
@ -60,7 +63,7 @@ def update_series():
|
||||||
for i, show in enumerate(r.json(), 1):
|
for i, show in enumerate(r.json(), 1):
|
||||||
notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
|
notifications.write(msg="Getting series data from Sonarr...", queue='get_series', item=i, length=seriesListLength)
|
||||||
try:
|
try:
|
||||||
overview = unicode(show['overview'])
|
overview = six.text_type(show['overview'])
|
||||||
except:
|
except:
|
||||||
overview = ""
|
overview = ""
|
||||||
try:
|
try:
|
||||||
|
@ -82,17 +85,17 @@ def update_series():
|
||||||
current_shows_sonarr.append(show['tvdbId'])
|
current_shows_sonarr.append(show['tvdbId'])
|
||||||
|
|
||||||
if show['tvdbId'] in current_shows_db_list:
|
if show['tvdbId'] in current_shows_db_list:
|
||||||
series_to_update.append({'title': unicode(show["title"]),
|
series_to_update.append({'title': six.text_type(show["title"]),
|
||||||
'path': unicode(show["path"]),
|
'path': six.text_type(show["path"]),
|
||||||
'tvdb_id': int(show["tvdbId"]),
|
'tvdb_id': int(show["tvdbId"]),
|
||||||
'sonarr_series_id': int(show["id"]),
|
'sonarr_series_id': int(show["id"]),
|
||||||
'overview': unicode(overview),
|
'overview': six.text_type(overview),
|
||||||
'poster': unicode(poster),
|
'poster': six.text_type(poster),
|
||||||
'fanart': unicode(fanart),
|
'fanart': six.text_type(fanart),
|
||||||
'audio_language': unicode(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
|
'audio_language': six.text_type(profile_id_to_language((show['qualityProfileId'] if get_sonarr_version().startswith('2') else show['languageProfileId']), audio_profiles)),
|
||||||
'sort_title': unicode(show['sortTitle']),
|
'sort_title': six.text_type(show['sortTitle']),
|
||||||
'year': unicode(show['year']),
|
'year': six.text_type(show['year']),
|
||||||
'alternate_titles': unicode(alternateTitles)})
|
'alternate_titles': six.text_type(alternateTitles)})
|
||||||
else:
|
else:
|
||||||
if serie_default_enabled is True:
|
if serie_default_enabled is True:
|
||||||
series_to_add.append({'title': show["title"],
|
series_to_add.append({'title': show["title"],
|
||||||
|
@ -161,9 +164,9 @@ def update_series():
|
||||||
removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))
|
removed_series = list(set(current_shows_db_list) - set(current_shows_sonarr))
|
||||||
|
|
||||||
for series in removed_series:
|
for series in removed_series:
|
||||||
print TableShows.delete().where(
|
print(TableShows.delete().where(
|
||||||
TableShows.tvdb_id == series
|
TableShows.tvdb_id == series
|
||||||
).execute()
|
).execute())
|
||||||
|
|
||||||
logging.debug('BAZARR All series synced from Sonarr into database.')
|
logging.debug('BAZARR All series synced from Sonarr into database.')
|
||||||
|
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import ast
|
import ast
|
||||||
import logging
|
import logging
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
import time
|
||||||
import cPickle as pickle
|
import six.moves.cPickle as pickle
|
||||||
import codecs
|
import codecs
|
||||||
import types
|
import types
|
||||||
import re
|
import re
|
||||||
|
@ -37,6 +38,9 @@ from database import TableShows, TableEpisodes, TableMovies, TableHistory, Table
|
||||||
from peewee import fn, JOIN
|
from peewee import fn, JOIN
|
||||||
|
|
||||||
from analytics import track_event
|
from analytics import track_event
|
||||||
|
import six
|
||||||
|
from six.moves import range
|
||||||
|
from functools import reduce
|
||||||
|
|
||||||
|
|
||||||
def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
|
def get_video(path, title, sceneName, use_scenename, providers=None, media_type="movie"):
|
||||||
|
@ -91,11 +95,11 @@ def get_scores(video, media_type, min_score_movie_perc=60 * 100 / 120.0, min_sco
|
||||||
"""
|
"""
|
||||||
max_score = 120.0
|
max_score = 120.0
|
||||||
min_score = max_score * min_score_movie_perc / 100.0
|
min_score = max_score * min_score_movie_perc / 100.0
|
||||||
scores = subliminal_scores.movie_scores.keys()
|
scores = list(subliminal_scores.movie_scores.keys())
|
||||||
if media_type == "series":
|
if media_type == "series":
|
||||||
max_score = 360.0
|
max_score = 360.0
|
||||||
min_score = max_score * min_score_series_perc / 100.0
|
min_score = max_score * min_score_series_perc / 100.0
|
||||||
scores = subliminal_scores.episode_scores.keys()
|
scores = list(subliminal_scores.episode_scores.keys())
|
||||||
if video.is_special:
|
if video.is_special:
|
||||||
min_score = max_score * min_score_special_ep / 100.0
|
min_score = max_score * min_score_special_ep / 100.0
|
||||||
|
|
||||||
|
@ -119,7 +123,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
|
||||||
hi = "force non-HI"
|
hi = "force non-HI"
|
||||||
language_set = set()
|
language_set = set()
|
||||||
|
|
||||||
if not isinstance(language, types.ListType):
|
if not isinstance(language, list):
|
||||||
language = [language]
|
language = [language]
|
||||||
|
|
||||||
if forced == "True":
|
if forced == "True":
|
||||||
|
@ -185,7 +189,7 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
|
||||||
|
|
||||||
saved_any = False
|
saved_any = False
|
||||||
if downloaded_subtitles:
|
if downloaded_subtitles:
|
||||||
for video, subtitles in downloaded_subtitles.iteritems():
|
for video, subtitles in six.iteritems(downloaded_subtitles):
|
||||||
if not subtitles:
|
if not subtitles:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -221,10 +225,10 @@ def download_subtitle(path, language, hi, forced, providers, providers_auth, sce
|
||||||
else:
|
else:
|
||||||
action = "downloaded"
|
action = "downloaded"
|
||||||
if video.used_scene_name:
|
if video.used_scene_name:
|
||||||
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
|
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
|
||||||
round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
|
round(subtitle.score * 100 / max_score, 2)) + "% using this scene name: " + sceneName
|
||||||
else:
|
else:
|
||||||
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + unicode(
|
message = downloaded_language + is_forced_string + " subtitles " + action + " from " + downloaded_provider + " with a score of " + six.text_type(
|
||||||
round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
|
round(subtitle.score * 100 / max_score, 2)) + "% using filename guessing."
|
||||||
|
|
||||||
if use_postprocessing is True:
|
if use_postprocessing is True:
|
||||||
|
@ -444,7 +448,7 @@ def manual_download_subtitle(path, language, hi, forced, subtitle, provider, pro
|
||||||
downloaded_path = saved_subtitle.storage_path
|
downloaded_path = saved_subtitle.storage_path
|
||||||
logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
|
logging.debug('BAZARR Subtitles file saved to disk: ' + downloaded_path)
|
||||||
is_forced_string = " forced" if subtitle.language.forced else ""
|
is_forced_string = " forced" if subtitle.language.forced else ""
|
||||||
message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + unicode(
|
message = downloaded_language + is_forced_string + " subtitles downloaded from " + downloaded_provider + " with a score of " + six.text_type(
|
||||||
score) + "% using manual search."
|
score) + "% using manual search."
|
||||||
|
|
||||||
if use_postprocessing is True:
|
if use_postprocessing is True:
|
||||||
|
@ -749,7 +753,7 @@ def wanted_download_subtitles(path, l, count_episodes):
|
||||||
|
|
||||||
for episode in episodes_details:
|
for episode in episodes_details:
|
||||||
attempt = episode.failed_attempts
|
attempt = episode.failed_attempts
|
||||||
if type(attempt) == unicode:
|
if type(attempt) == six.text_type:
|
||||||
attempt = ast.literal_eval(attempt)
|
attempt = ast.literal_eval(attempt)
|
||||||
for language in ast.literal_eval(episode.missing_subtitles):
|
for language in ast.literal_eval(episode.missing_subtitles):
|
||||||
if attempt is None:
|
if attempt is None:
|
||||||
|
@ -762,7 +766,7 @@ def wanted_download_subtitles(path, l, count_episodes):
|
||||||
|
|
||||||
TableEpisodes.update(
|
TableEpisodes.update(
|
||||||
{
|
{
|
||||||
TableEpisodes.failed_attempts: unicode(attempt)
|
TableEpisodes.failed_attempts: six.text_type(attempt)
|
||||||
}
|
}
|
||||||
).where(
|
).where(
|
||||||
TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id
|
TableEpisodes.sonarr_episode_id == episode.sonarr_episode_id
|
||||||
|
@ -818,7 +822,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
|
||||||
|
|
||||||
for movie in movies_details:
|
for movie in movies_details:
|
||||||
attempt = movie.failed_attempts
|
attempt = movie.failed_attempts
|
||||||
if type(attempt) == unicode:
|
if type(attempt) == six.text_type:
|
||||||
attempt = ast.literal_eval(attempt)
|
attempt = ast.literal_eval(attempt)
|
||||||
for language in ast.literal_eval(movie.missing_subtitles):
|
for language in ast.literal_eval(movie.missing_subtitles):
|
||||||
if attempt is None:
|
if attempt is None:
|
||||||
|
@ -831,7 +835,7 @@ def wanted_download_subtitles_movie(path, l, count_movies):
|
||||||
|
|
||||||
TableMovies.update(
|
TableMovies.update(
|
||||||
{
|
{
|
||||||
TableMovies.failed_attempts: unicode(attempt)
|
TableMovies.failed_attempts: six.text_type(attempt)
|
||||||
}
|
}
|
||||||
).where(
|
).where(
|
||||||
TableMovies.radarr_id == movie.radarr_id
|
TableMovies.radarr_id == movie.radarr_id
|
||||||
|
@ -991,7 +995,7 @@ def refine_from_db(path, video):
|
||||||
TableMovies.audio_codec,
|
TableMovies.audio_codec,
|
||||||
TableMovies.imdb_id
|
TableMovies.imdb_id
|
||||||
).where(
|
).where(
|
||||||
TableMovies.path == unicode(path_replace_reverse_movie(path))
|
TableMovies.path == six.text_type(path_replace_reverse_movie(path))
|
||||||
).first()
|
).first()
|
||||||
|
|
||||||
if data:
|
if data:
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
from __future__ import absolute_import
|
||||||
import ast
|
import ast
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
@ -126,7 +127,7 @@ def force_unicode(s):
|
||||||
:param s: string
|
:param s: string
|
||||||
:return: unicode string
|
:return: unicode string
|
||||||
"""
|
"""
|
||||||
if not isinstance(s, types.UnicodeType):
|
if not isinstance(s, str):
|
||||||
try:
|
try:
|
||||||
s = s.decode("utf-8")
|
s = s.decode("utf-8")
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
import rarfile
|
import rarfile
|
||||||
|
|
||||||
from cork import Cork
|
from cork import Cork
|
||||||
from ConfigParser2 import ConfigParser
|
from backports import configparser2
|
||||||
from config import settings
|
from config import settings
|
||||||
from check_update import check_releases
|
from check_update import check_releases
|
||||||
from get_args import args
|
from get_args import args
|
||||||
|
@ -66,7 +67,7 @@ if not os.path.exists(os.path.join(args.config_dir, 'config', 'releases.txt')):
|
||||||
|
|
||||||
config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini'))
|
config_file = os.path.normpath(os.path.join(args.config_dir, 'config', 'config.ini'))
|
||||||
|
|
||||||
cfg = ConfigParser()
|
cfg = configparser2.ConfigParser()
|
||||||
|
|
||||||
|
|
||||||
def init_binaries():
|
def init_binaries():
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import gc
|
import gc
|
||||||
import os
|
import os
|
||||||
import babelfish
|
import babelfish
|
||||||
|
@ -24,6 +25,7 @@ from helper import path_replace, path_replace_movie, path_replace_reverse, \
|
||||||
|
|
||||||
from queueconfig import notifications
|
from queueconfig import notifications
|
||||||
from embedded_subs_reader import embedded_subs_reader
|
from embedded_subs_reader import embedded_subs_reader
|
||||||
|
import six
|
||||||
|
|
||||||
gc.enable()
|
gc.enable()
|
||||||
|
|
||||||
|
@ -63,7 +65,7 @@ def store_subtitles(file):
|
||||||
logging.exception("BAZARR unable to index external subtitles.")
|
logging.exception("BAZARR unable to index external subtitles.")
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
for subtitle, language in subtitles.iteritems():
|
for subtitle, language in six.iteritems(subtitles):
|
||||||
subtitle_path = get_external_subtitles_path(file, subtitle)
|
subtitle_path = get_external_subtitles_path(file, subtitle)
|
||||||
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
|
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)):
|
||||||
logging.debug("BAZARR external subtitles detected: " + "pb")
|
logging.debug("BAZARR external subtitles detected: " + "pb")
|
||||||
|
@ -155,7 +157,7 @@ def store_subtitles_movie(file):
|
||||||
logging.exception("BAZARR unable to index external subtitles.")
|
logging.exception("BAZARR unable to index external subtitles.")
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
for subtitle, language in subtitles.iteritems():
|
for subtitle, language in six.iteritems(subtitles):
|
||||||
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
|
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(brazilian_portuguese)) is True:
|
||||||
logging.debug("BAZARR external subtitles detected: " + "pb")
|
logging.debug("BAZARR external subtitles detected: " + "pb")
|
||||||
actual_subtitles.append(
|
actual_subtitles.append(
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
@ -9,6 +10,7 @@ import platform
|
||||||
from logging.handlers import TimedRotatingFileHandler
|
from logging.handlers import TimedRotatingFileHandler
|
||||||
from get_args import args
|
from get_args import args
|
||||||
from config import settings
|
from config import settings
|
||||||
|
import six
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
|
@ -107,10 +109,10 @@ class MyFilter(logging.Filter):
|
||||||
|
|
||||||
class ArgsFilteringFilter(logging.Filter):
|
class ArgsFilteringFilter(logging.Filter):
|
||||||
def filter_args(self, record, func):
|
def filter_args(self, record, func):
|
||||||
if isinstance(record.args, (types.ListType, types.TupleType)):
|
if isinstance(record.args, (list, tuple)):
|
||||||
final_args = []
|
final_args = []
|
||||||
for arg in record.args:
|
for arg in record.args:
|
||||||
if not isinstance(arg, basestring):
|
if not isinstance(arg, six.string_types):
|
||||||
final_args.append(arg)
|
final_args.append(arg)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@ -118,7 +120,7 @@ class ArgsFilteringFilter(logging.Filter):
|
||||||
record.args = type(record.args)(final_args)
|
record.args = type(record.args)(final_args)
|
||||||
elif isinstance(record.args, dict):
|
elif isinstance(record.args, dict):
|
||||||
for key, arg in record.args.items():
|
for key, arg in record.args.items():
|
||||||
if not isinstance(arg, basestring):
|
if not isinstance(arg, six.string_types):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
record.args[key] = func(arg)
|
record.args[key] = func(arg)
|
||||||
|
|
|
@ -1,5 +1,8 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
import six
|
||||||
|
from six.moves import zip
|
||||||
|
from functools import reduce
|
||||||
bazarr_version = '0.8.2'
|
bazarr_version = '0.8.2'
|
||||||
|
|
||||||
import gc
|
import gc
|
||||||
|
@ -12,7 +15,7 @@ import pretty
|
||||||
import math
|
import math
|
||||||
import ast
|
import ast
|
||||||
import hashlib
|
import hashlib
|
||||||
import urllib
|
import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
|
||||||
import warnings
|
import warnings
|
||||||
import queueconfig
|
import queueconfig
|
||||||
import platform
|
import platform
|
||||||
|
@ -1575,12 +1578,12 @@ def save_settings():
|
||||||
settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
|
settings_death_by_captcha_username = request.forms.get('settings_death_by_captcha_username')
|
||||||
settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
|
settings_death_by_captcha_password = request.forms.get('settings_death_by_captcha_password')
|
||||||
|
|
||||||
before = (unicode(settings.general.ip), int(settings.general.port), unicode(settings.general.base_url),
|
before = (six.text_type(settings.general.ip), int(settings.general.port), six.text_type(settings.general.base_url),
|
||||||
unicode(settings.general.path_mappings), unicode(settings.general.getboolean('use_sonarr')),
|
six.text_type(settings.general.path_mappings), six.text_type(settings.general.getboolean('use_sonarr')),
|
||||||
unicode(settings.general.getboolean('use_radarr')), unicode(settings.general.path_mappings_movie))
|
six.text_type(settings.general.getboolean('use_radarr')), six.text_type(settings.general.path_mappings_movie))
|
||||||
after = (unicode(settings_general_ip), int(settings_general_port), unicode(settings_general_baseurl),
|
after = (six.text_type(settings_general_ip), int(settings_general_port), six.text_type(settings_general_baseurl),
|
||||||
unicode(settings_general_pathmapping), unicode(settings_general_use_sonarr),
|
six.text_type(settings_general_pathmapping), six.text_type(settings_general_use_sonarr),
|
||||||
unicode(settings_general_use_radarr), unicode(settings_general_pathmapping_movie))
|
six.text_type(settings_general_use_radarr), six.text_type(settings_general_pathmapping_movie))
|
||||||
|
|
||||||
settings.general.ip = text_type(settings_general_ip)
|
settings.general.ip = text_type(settings_general_ip)
|
||||||
settings.general.port = text_type(settings_general_port)
|
settings.general.port = text_type(settings_general_port)
|
||||||
|
@ -1645,7 +1648,7 @@ def save_settings():
|
||||||
settings_proxy_password = request.forms.get('settings_proxy_password')
|
settings_proxy_password = request.forms.get('settings_proxy_password')
|
||||||
settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
|
settings_proxy_exclude = request.forms.get('settings_proxy_exclude')
|
||||||
|
|
||||||
before_proxy_password = (unicode(settings.proxy.type), unicode(settings.proxy.exclude))
|
before_proxy_password = (six.text_type(settings.proxy.type), six.text_type(settings.proxy.exclude))
|
||||||
if before_proxy_password[0] != settings_proxy_type:
|
if before_proxy_password[0] != settings_proxy_type:
|
||||||
configured()
|
configured()
|
||||||
if before_proxy_password[1] == settings_proxy_password:
|
if before_proxy_password[1] == settings_proxy_password:
|
||||||
|
@ -2029,7 +2032,7 @@ def remove_subtitles():
|
||||||
history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
|
history_log(0, sonarrSeriesId, sonarrEpisodeId, result)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
|
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
|
||||||
store_subtitles(unicode(episodePath))
|
store_subtitles(six.text_type(episodePath))
|
||||||
list_missing_subtitles(sonarrSeriesId)
|
list_missing_subtitles(sonarrSeriesId)
|
||||||
|
|
||||||
|
|
||||||
|
@ -2048,7 +2051,7 @@ def remove_subtitles_movie():
|
||||||
history_log_movie(0, radarrId, result)
|
history_log_movie(0, radarrId, result)
|
||||||
except OSError as e:
|
except OSError as e:
|
||||||
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
|
logging.exception('BAZARR cannot delete subtitles file: ' + subtitlesPath)
|
||||||
store_subtitles_movie(unicode(moviePath))
|
store_subtitles_movie(six.text_type(moviePath))
|
||||||
list_missing_subtitles_movies(radarrId)
|
list_missing_subtitles_movies(radarrId)
|
||||||
|
|
||||||
|
|
||||||
|
@ -2082,7 +2085,7 @@ def get_subtitle():
|
||||||
score = result[4]
|
score = result[4]
|
||||||
history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
history_log(1, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
||||||
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
||||||
store_subtitles(unicode(episodePath))
|
store_subtitles(six.text_type(episodePath))
|
||||||
list_missing_subtitles(sonarrSeriesId)
|
list_missing_subtitles(sonarrSeriesId)
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
except OSError:
|
except OSError:
|
||||||
|
@ -2140,7 +2143,7 @@ def manual_get_subtitle():
|
||||||
score = result[4]
|
score = result[4]
|
||||||
history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
history_log(2, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
||||||
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
||||||
store_subtitles(unicode(episodePath))
|
store_subtitles(six.text_type(episodePath))
|
||||||
list_missing_subtitles(sonarrSeriesId)
|
list_missing_subtitles(sonarrSeriesId)
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
except OSError:
|
except OSError:
|
||||||
|
@ -2184,7 +2187,7 @@ def perform_manual_upload_subtitle():
|
||||||
score = 360
|
score = 360
|
||||||
history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
history_log(4, sonarrSeriesId, sonarrEpisodeId, message, path, language_code, provider, score)
|
||||||
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
send_notifications(sonarrSeriesId, sonarrEpisodeId, message)
|
||||||
store_subtitles(unicode(episodePath))
|
store_subtitles(six.text_type(episodePath))
|
||||||
list_missing_subtitles(sonarrSeriesId)
|
list_missing_subtitles(sonarrSeriesId)
|
||||||
|
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
|
@ -2221,7 +2224,7 @@ def get_subtitle_movie():
|
||||||
score = result[4]
|
score = result[4]
|
||||||
history_log_movie(1, radarrId, message, path, language_code, provider, score)
|
history_log_movie(1, radarrId, message, path, language_code, provider, score)
|
||||||
send_notifications_movie(radarrId, message)
|
send_notifications_movie(radarrId, message)
|
||||||
store_subtitles_movie(unicode(moviePath))
|
store_subtitles_movie(six.text_type(moviePath))
|
||||||
list_missing_subtitles_movies(radarrId)
|
list_missing_subtitles_movies(radarrId)
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
except OSError:
|
except OSError:
|
||||||
|
@ -2277,7 +2280,7 @@ def manual_get_subtitle_movie():
|
||||||
score = result[4]
|
score = result[4]
|
||||||
history_log_movie(2, radarrId, message, path, language_code, provider, score)
|
history_log_movie(2, radarrId, message, path, language_code, provider, score)
|
||||||
send_notifications_movie(radarrId, message)
|
send_notifications_movie(radarrId, message)
|
||||||
store_subtitles_movie(unicode(moviePath))
|
store_subtitles_movie(six.text_type(moviePath))
|
||||||
list_missing_subtitles_movies(radarrId)
|
list_missing_subtitles_movies(radarrId)
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
except OSError:
|
except OSError:
|
||||||
|
@ -2320,7 +2323,7 @@ def perform_manual_upload_subtitle_movie():
|
||||||
score = 120
|
score = 120
|
||||||
history_log_movie(4, radarrId, message, path, language_code, provider, score)
|
history_log_movie(4, radarrId, message, path, language_code, provider, score)
|
||||||
send_notifications_movie(radarrId, message)
|
send_notifications_movie(radarrId, message)
|
||||||
store_subtitles_movie(unicode(moviePath))
|
store_subtitles_movie(six.text_type(moviePath))
|
||||||
list_missing_subtitles_movies(radarrId)
|
list_missing_subtitles_movies(radarrId)
|
||||||
|
|
||||||
redirect(ref)
|
redirect(ref)
|
||||||
|
@ -2421,7 +2424,7 @@ def api_history():
|
||||||
@route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
|
@route(base_url + 'test_url/<protocol>/<url:path>', method='GET')
|
||||||
@custom_auth_basic(check_credentials)
|
@custom_auth_basic(check_credentials)
|
||||||
def test_url(protocol, url):
|
def test_url(protocol, url):
|
||||||
url = urllib.unquote(url)
|
url = six.moves.urllib.parse.unquote(url)
|
||||||
try:
|
try:
|
||||||
result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
|
result = requests.get(protocol + "://" + url, allow_redirects=False, verify=False).json()['version']
|
||||||
except:
|
except:
|
||||||
|
@ -2433,7 +2436,7 @@ def test_url(protocol, url):
|
||||||
@route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
|
@route(base_url + 'test_notification/<protocol>/<provider:path>', method='GET')
|
||||||
@custom_auth_basic(check_credentials)
|
@custom_auth_basic(check_credentials)
|
||||||
def test_notification(protocol, provider):
|
def test_notification(protocol, provider):
|
||||||
provider = urllib.unquote(provider)
|
provider = six.moves.urllib.parse.unquote(provider)
|
||||||
apobj = apprise.Apprise()
|
apobj = apprise.Apprise()
|
||||||
apobj.add(protocol + "://" + provider)
|
apobj.add(protocol + "://" + provider)
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import apprise
|
import apprise
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
from __future__ import absolute_import
|
||||||
from collections import deque
|
from collections import deque
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
from get_episodes import sync_episodes, update_all_episodes
|
from get_episodes import sync_episodes, update_all_episodes
|
||||||
from get_movies import update_movies, update_all_movies
|
from get_movies import update_movies, update_all_movies
|
||||||
from get_series import update_series
|
from get_series import update_series
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
|
from __future__ import absolute_import
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
import platform
|
import platform
|
||||||
|
|
|
@ -1,797 +0,0 @@
|
||||||
"""Configuration file parser.
|
|
||||||
|
|
||||||
A setup file consists of sections, lead by a "[section]" header,
|
|
||||||
and followed by "name: value" entries, with continuations and such in
|
|
||||||
the style of RFC 822.
|
|
||||||
|
|
||||||
The option values can contain format strings which refer to other values in
|
|
||||||
the same section, or values in a special [DEFAULT] section.
|
|
||||||
|
|
||||||
For example:
|
|
||||||
|
|
||||||
something: %(dir)s/whatever
|
|
||||||
|
|
||||||
would resolve the "%(dir)s" to the value of dir. All reference
|
|
||||||
expansions are done late, on demand.
|
|
||||||
|
|
||||||
Intrinsic defaults can be specified by passing them into the
|
|
||||||
ConfigParser constructor as a dictionary.
|
|
||||||
|
|
||||||
class:
|
|
||||||
|
|
||||||
ConfigParser -- responsible for parsing a list of
|
|
||||||
configuration files, and managing the parsed database.
|
|
||||||
|
|
||||||
methods:
|
|
||||||
|
|
||||||
__init__(defaults=None)
|
|
||||||
create the parser and specify a dictionary of intrinsic defaults. The
|
|
||||||
keys must be strings, the values must be appropriate for %()s string
|
|
||||||
interpolation. Note that `__name__' is always an intrinsic default;
|
|
||||||
its value is the section's name.
|
|
||||||
|
|
||||||
sections()
|
|
||||||
return all the configuration section names, sans DEFAULT
|
|
||||||
|
|
||||||
has_section(section)
|
|
||||||
return whether the given section exists
|
|
||||||
|
|
||||||
has_option(section, option)
|
|
||||||
return whether the given option exists in the given section
|
|
||||||
|
|
||||||
options(section)
|
|
||||||
return list of configuration options for the named section
|
|
||||||
|
|
||||||
read(filenames)
|
|
||||||
read and parse the list of named configuration files, given by
|
|
||||||
name. A single filename is also allowed. Non-existing files
|
|
||||||
are ignored. Return list of successfully read files.
|
|
||||||
|
|
||||||
readfp(fp, filename=None)
|
|
||||||
read and parse one configuration file, given as a file object.
|
|
||||||
The filename defaults to fp.name; it is only used in error
|
|
||||||
messages (if fp has no `name' attribute, the string `<???>' is used).
|
|
||||||
|
|
||||||
get(section, option, raw=False, vars=None)
|
|
||||||
return a string value for the named option. All % interpolations are
|
|
||||||
expanded in the return values, based on the defaults passed into the
|
|
||||||
constructor and the DEFAULT section. Additional substitutions may be
|
|
||||||
provided using the `vars' argument, which must be a dictionary whose
|
|
||||||
contents override any pre-existing defaults.
|
|
||||||
|
|
||||||
getint(section, options)
|
|
||||||
like get(), but convert value to an integer
|
|
||||||
|
|
||||||
getfloat(section, options)
|
|
||||||
like get(), but convert value to a float
|
|
||||||
|
|
||||||
getboolean(section, options)
|
|
||||||
like get(), but convert value to a boolean (currently case
|
|
||||||
insensitively defined as 0, false, no, off for False, and 1, true,
|
|
||||||
yes, on for True). Returns False or True.
|
|
||||||
|
|
||||||
items(section, raw=False, vars=None)
|
|
||||||
return a list of tuples with (name, value) for each option
|
|
||||||
in the section.
|
|
||||||
|
|
||||||
remove_section(section)
|
|
||||||
remove the given file section and all its options
|
|
||||||
|
|
||||||
remove_option(section, option)
|
|
||||||
remove the given option from the given section
|
|
||||||
|
|
||||||
set(section, option, value)
|
|
||||||
set the given option
|
|
||||||
|
|
||||||
write(fp)
|
|
||||||
write the configuration state in .ini format
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
from collections import OrderedDict as _default_dict
|
|
||||||
except ImportError:
|
|
||||||
# fallback for setup.py which hasn't yet built _collections
|
|
||||||
_default_dict = dict
|
|
||||||
|
|
||||||
import re
|
|
||||||
|
|
||||||
__all__ = ["NoSectionError", "DuplicateSectionError", "NoOptionError",
|
|
||||||
"InterpolationError", "InterpolationDepthError",
|
|
||||||
"InterpolationSyntaxError", "ParsingError",
|
|
||||||
"MissingSectionHeaderError",
|
|
||||||
"ConfigParser", "SafeConfigParser", "RawConfigParser",
|
|
||||||
"DEFAULTSECT", "MAX_INTERPOLATION_DEPTH"]
|
|
||||||
|
|
||||||
DEFAULTSECT = "DEFAULT"
|
|
||||||
|
|
||||||
MAX_INTERPOLATION_DEPTH = 10
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# exception classes
|
|
||||||
class Error(Exception):
|
|
||||||
"""Base class for ConfigParser exceptions."""
|
|
||||||
|
|
||||||
def _get_message(self):
|
|
||||||
"""Getter for 'message'; needed only to override deprecation in
|
|
||||||
BaseException."""
|
|
||||||
return self.__message
|
|
||||||
|
|
||||||
def _set_message(self, value):
|
|
||||||
"""Setter for 'message'; needed only to override deprecation in
|
|
||||||
BaseException."""
|
|
||||||
self.__message = value
|
|
||||||
|
|
||||||
# BaseException.message has been deprecated since Python 2.6. To prevent
|
|
||||||
# DeprecationWarning from popping up over this pre-existing attribute, use
|
|
||||||
# a new property that takes lookup precedence.
|
|
||||||
message = property(_get_message, _set_message)
|
|
||||||
|
|
||||||
def __init__(self, msg=''):
|
|
||||||
self.message = msg
|
|
||||||
Exception.__init__(self, msg)
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return self.message
|
|
||||||
|
|
||||||
__str__ = __repr__
|
|
||||||
|
|
||||||
class NoSectionError(Error):
|
|
||||||
"""Raised when no section matches a requested option."""
|
|
||||||
|
|
||||||
def __init__(self, section):
|
|
||||||
Error.__init__(self, 'No section: %r' % (section,))
|
|
||||||
self.section = section
|
|
||||||
self.args = (section, )
|
|
||||||
|
|
||||||
class DuplicateSectionError(Error):
|
|
||||||
"""Raised when a section is multiply-created."""
|
|
||||||
|
|
||||||
def __init__(self, section):
|
|
||||||
Error.__init__(self, "Section %r already exists" % section)
|
|
||||||
self.section = section
|
|
||||||
self.args = (section, )
|
|
||||||
|
|
||||||
class NoOptionError(Error):
|
|
||||||
"""A requested option was not found."""
|
|
||||||
|
|
||||||
def __init__(self, option, section):
|
|
||||||
Error.__init__(self, "No option %r in section: %r" %
|
|
||||||
(option, section))
|
|
||||||
self.option = option
|
|
||||||
self.section = section
|
|
||||||
self.args = (option, section)
|
|
||||||
|
|
||||||
class InterpolationError(Error):
|
|
||||||
"""Base class for interpolation-related exceptions."""
|
|
||||||
|
|
||||||
def __init__(self, option, section, msg):
|
|
||||||
Error.__init__(self, msg)
|
|
||||||
self.option = option
|
|
||||||
self.section = section
|
|
||||||
self.args = (option, section, msg)
|
|
||||||
|
|
||||||
class InterpolationMissingOptionError(InterpolationError):
|
|
||||||
"""A string substitution required a setting which was not available."""
|
|
||||||
|
|
||||||
def __init__(self, option, section, rawval, reference):
|
|
||||||
msg = ("Bad value substitution:\n"
|
|
||||||
"\tsection: [%s]\n"
|
|
||||||
"\toption : %s\n"
|
|
||||||
"\tkey : %s\n"
|
|
||||||
"\trawval : %s\n"
|
|
||||||
% (section, option, reference, rawval))
|
|
||||||
InterpolationError.__init__(self, option, section, msg)
|
|
||||||
self.reference = reference
|
|
||||||
self.args = (option, section, rawval, reference)
|
|
||||||
|
|
||||||
class InterpolationSyntaxError(InterpolationError):
|
|
||||||
"""Raised when the source text into which substitutions are made
|
|
||||||
does not conform to the required syntax."""
|
|
||||||
|
|
||||||
class InterpolationDepthError(InterpolationError):
|
|
||||||
"""Raised when substitutions are nested too deeply."""
|
|
||||||
|
|
||||||
def __init__(self, option, section, rawval):
|
|
||||||
msg = ("Value interpolation too deeply recursive:\n"
|
|
||||||
"\tsection: [%s]\n"
|
|
||||||
"\toption : %s\n"
|
|
||||||
"\trawval : %s\n"
|
|
||||||
% (section, option, rawval))
|
|
||||||
InterpolationError.__init__(self, option, section, msg)
|
|
||||||
self.args = (option, section, rawval)
|
|
||||||
|
|
||||||
class ParsingError(Error):
|
|
||||||
"""Raised when a configuration file does not follow legal syntax."""
|
|
||||||
|
|
||||||
def __init__(self, filename):
|
|
||||||
Error.__init__(self, 'File contains parsing errors: %s' % filename)
|
|
||||||
self.filename = filename
|
|
||||||
self.errors = []
|
|
||||||
self.args = (filename, )
|
|
||||||
|
|
||||||
def append(self, lineno, line):
|
|
||||||
self.errors.append((lineno, line))
|
|
||||||
self.message += '\n\t[line %2d]: %s' % (lineno, line)
|
|
||||||
|
|
||||||
class MissingSectionHeaderError(ParsingError):
|
|
||||||
"""Raised when a key-value pair is found before any section header."""
|
|
||||||
|
|
||||||
def __init__(self, filename, lineno, line):
|
|
||||||
Error.__init__(
|
|
||||||
self,
|
|
||||||
'File contains no section headers.\nfile: %s, line: %d\n%r' %
|
|
||||||
(filename, lineno, line))
|
|
||||||
self.filename = filename
|
|
||||||
self.lineno = lineno
|
|
||||||
self.line = line
|
|
||||||
self.args = (filename, lineno, line)
|
|
||||||
|
|
||||||
|
|
||||||
class RawConfigParser:
|
|
||||||
def __init__(self, defaults=None, dict_type=_default_dict,
|
|
||||||
allow_no_value=False):
|
|
||||||
self._dict = dict_type
|
|
||||||
self._sections = self._dict()
|
|
||||||
self._defaults = self._dict()
|
|
||||||
if allow_no_value:
|
|
||||||
self._optcre = self.OPTCRE_NV
|
|
||||||
else:
|
|
||||||
self._optcre = self.OPTCRE
|
|
||||||
if defaults:
|
|
||||||
for key, value in defaults.items():
|
|
||||||
self._defaults[self.optionxform(key)] = value
|
|
||||||
self.comment_store = None ## used for storing comments in ini
|
|
||||||
|
|
||||||
|
|
||||||
def defaults(self):
|
|
||||||
return self._defaults
|
|
||||||
|
|
||||||
def sections(self):
|
|
||||||
"""Return a list of section names, excluding [DEFAULT]"""
|
|
||||||
# self._sections will never have [DEFAULT] in it
|
|
||||||
return self._sections.keys()
|
|
||||||
|
|
||||||
def add_section(self, section):
|
|
||||||
"""Create a new section in the configuration.
|
|
||||||
|
|
||||||
Raise DuplicateSectionError if a section by the specified name
|
|
||||||
already exists. Raise ValueError if name is DEFAULT or any of it's
|
|
||||||
case-insensitive variants.
|
|
||||||
"""
|
|
||||||
if section.lower() == "default":
|
|
||||||
raise ValueError, 'Invalid section name: %s' % section
|
|
||||||
|
|
||||||
if section in self._sections:
|
|
||||||
raise DuplicateSectionError(section)
|
|
||||||
self._sections[section] = self._dict()
|
|
||||||
|
|
||||||
def has_section(self, section):
|
|
||||||
"""Indicate whether the named section is present in the configuration.
|
|
||||||
|
|
||||||
The DEFAULT section is not acknowledged.
|
|
||||||
"""
|
|
||||||
return section in self._sections
|
|
||||||
|
|
||||||
def options(self, section):
|
|
||||||
"""Return a list of option names for the given section name."""
|
|
||||||
try:
|
|
||||||
opts = self._sections[section].copy()
|
|
||||||
except KeyError:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
opts.update(self._defaults)
|
|
||||||
if '__name__' in opts:
|
|
||||||
del opts['__name__']
|
|
||||||
return opts.keys()
|
|
||||||
|
|
||||||
def read(self, filenames):
|
|
||||||
"""Read and parse a filename or a list of filenames.
|
|
||||||
|
|
||||||
Files that cannot be opened are silently ignored; this is
|
|
||||||
designed so that you can specify a list of potential
|
|
||||||
configuration file locations (e.g. current directory, user's
|
|
||||||
home directory, systemwide directory), and all existing
|
|
||||||
configuration files in the list will be read. A single
|
|
||||||
filename may also be given.
|
|
||||||
|
|
||||||
Return list of successfully read files.
|
|
||||||
"""
|
|
||||||
if isinstance(filenames, basestring):
|
|
||||||
filenames = [filenames]
|
|
||||||
read_ok = []
|
|
||||||
for filename in filenames:
|
|
||||||
try:
|
|
||||||
fp = open(filename)
|
|
||||||
except IOError:
|
|
||||||
continue
|
|
||||||
self._read(fp, filename)
|
|
||||||
fp.close()
|
|
||||||
read_ok.append(filename)
|
|
||||||
return read_ok
|
|
||||||
|
|
||||||
def readfp(self, fp, filename=None):
|
|
||||||
"""Like read() but the argument must be a file-like object.
|
|
||||||
|
|
||||||
The `fp' argument must have a `readline' method. Optional
|
|
||||||
second argument is the `filename', which if not given, is
|
|
||||||
taken from fp.name. If fp has no `name' attribute, `<???>' is
|
|
||||||
used.
|
|
||||||
|
|
||||||
"""
|
|
||||||
if filename is None:
|
|
||||||
try:
|
|
||||||
filename = fp.name
|
|
||||||
except AttributeError:
|
|
||||||
filename = '<???>'
|
|
||||||
self._read(fp, filename)
|
|
||||||
|
|
||||||
def get(self, section, option):
|
|
||||||
opt = self.optionxform(option)
|
|
||||||
if section not in self._sections:
|
|
||||||
if section != DEFAULTSECT:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
if opt in self._defaults:
|
|
||||||
return self._defaults[opt]
|
|
||||||
else:
|
|
||||||
raise NoOptionError(option, section)
|
|
||||||
elif opt in self._sections[section]:
|
|
||||||
return self._sections[section][opt]
|
|
||||||
elif opt in self._defaults:
|
|
||||||
return self._defaults[opt]
|
|
||||||
else:
|
|
||||||
raise NoOptionError(option, section)
|
|
||||||
|
|
||||||
def items(self, section):
|
|
||||||
try:
|
|
||||||
d2 = self._sections[section]
|
|
||||||
except KeyError:
|
|
||||||
if section != DEFAULTSECT:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
d2 = self._dict()
|
|
||||||
d = self._defaults.copy()
|
|
||||||
d.update(d2)
|
|
||||||
if "__name__" in d:
|
|
||||||
del d["__name__"]
|
|
||||||
return d.items()
|
|
||||||
|
|
||||||
def _get(self, section, conv, option):
|
|
||||||
return conv(self.get(section, option))
|
|
||||||
|
|
||||||
def getint(self, section, option):
|
|
||||||
return self._get(section, int, option)
|
|
||||||
|
|
||||||
def getfloat(self, section, option):
|
|
||||||
return self._get(section, float, option)
|
|
||||||
|
|
||||||
_boolean_states = {'1': True, 'yes': True, 'true': True, 'on': True,
|
|
||||||
'0': False, 'no': False, 'false': False, 'off': False}
|
|
||||||
|
|
||||||
def getboolean(self, section, option):
|
|
||||||
v = self.get(section, option)
|
|
||||||
if v.lower() not in self._boolean_states:
|
|
||||||
raise ValueError, 'Not a boolean: %s' % v
|
|
||||||
return self._boolean_states[v.lower()]
|
|
||||||
|
|
||||||
def optionxform(self, optionstr):
|
|
||||||
return optionstr.lower()
|
|
||||||
|
|
||||||
def has_option(self, section, option):
|
|
||||||
"""Check for the existence of a given option in a given section."""
|
|
||||||
if not section or section == DEFAULTSECT:
|
|
||||||
option = self.optionxform(option)
|
|
||||||
return option in self._defaults
|
|
||||||
elif section not in self._sections:
|
|
||||||
return False
|
|
||||||
else:
|
|
||||||
option = self.optionxform(option)
|
|
||||||
return (option in self._sections[section]
|
|
||||||
or option in self._defaults)
|
|
||||||
|
|
||||||
def set(self, section, option, value=None):
|
|
||||||
"""Set an option."""
|
|
||||||
if not section or section == DEFAULTSECT:
|
|
||||||
sectdict = self._defaults
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
sectdict = self._sections[section]
|
|
||||||
except KeyError:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
sectdict[self.optionxform(option)] = value
|
|
||||||
|
|
||||||
def write(self, fp):
|
|
||||||
"""Write an .ini-format representation of the configuration state."""
|
|
||||||
if self._defaults:
|
|
||||||
fp.write("[%s]\n" % DEFAULTSECT)
|
|
||||||
for (key, value) in self._defaults.items():
|
|
||||||
fp.write("%s = %s\n" % (key, str(value).replace('\n', '\n\t')))
|
|
||||||
fp.write("\n")
|
|
||||||
for section in self._sections:
|
|
||||||
fp.write("[%s]\n" % section)
|
|
||||||
for (key, value) in self._sections[section].items():
|
|
||||||
if key == "__name__":
|
|
||||||
continue
|
|
||||||
if (value is not None) or (self._optcre == self.OPTCRE):
|
|
||||||
key = " = ".join((key, str(value).replace('\n', '\n\t')))
|
|
||||||
fp.write("%s\n" % (key))
|
|
||||||
fp.write("\n")
|
|
||||||
|
|
||||||
def remove_option(self, section, option):
|
|
||||||
"""Remove an option."""
|
|
||||||
if not section or section == DEFAULTSECT:
|
|
||||||
sectdict = self._defaults
|
|
||||||
else:
|
|
||||||
try:
|
|
||||||
sectdict = self._sections[section]
|
|
||||||
except KeyError:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
option = self.optionxform(option)
|
|
||||||
existed = option in sectdict
|
|
||||||
if existed:
|
|
||||||
del sectdict[option]
|
|
||||||
return existed
|
|
||||||
|
|
||||||
def remove_section(self, section):
|
|
||||||
"""Remove a file section."""
|
|
||||||
existed = section in self._sections
|
|
||||||
if existed:
|
|
||||||
del self._sections[section]
|
|
||||||
return existed
|
|
||||||
|
|
||||||
#
|
|
||||||
# Regular expressions for parsing section headers and options.
|
|
||||||
#
|
|
||||||
SECTCRE = re.compile(
|
|
||||||
r'\[' # [
|
|
||||||
r'(?P<header>[^]]+)' # very permissive!
|
|
||||||
r'\]' # ]
|
|
||||||
)
|
|
||||||
OPTCRE = re.compile(
|
|
||||||
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
|
|
||||||
r'\s*(?P<vi>[:=])\s*' # any number of space/tab,
|
|
||||||
# followed by separator
|
|
||||||
# (either : or =), followed
|
|
||||||
# by any # space/tab
|
|
||||||
r'(?P<value>.*)$' # everything up to eol
|
|
||||||
)
|
|
||||||
OPTCRE_NV = re.compile(
|
|
||||||
r'(?P<option>[^:=\s][^:=]*)' # very permissive!
|
|
||||||
r'\s*(?:' # any number of space/tab,
|
|
||||||
r'(?P<vi>[:=])\s*' # optionally followed by
|
|
||||||
# separator (either : or
|
|
||||||
# =), followed by any #
|
|
||||||
# space/tab
|
|
||||||
r'(?P<value>.*))?$' # everything up to eol
|
|
||||||
)
|
|
||||||
|
|
||||||
def _read(self, fp, fpname):
|
|
||||||
"""Parse a sectioned setup file.
|
|
||||||
|
|
||||||
The sections in setup file contains a title line at the top,
|
|
||||||
indicated by a name in square brackets (`[]'), plus key/value
|
|
||||||
options lines, indicated by `name: value' format lines.
|
|
||||||
Continuations are represented by an embedded newline then
|
|
||||||
leading whitespace. Blank lines, lines beginning with a '#',
|
|
||||||
and just about everything else are ignored.
|
|
||||||
"""
|
|
||||||
|
|
||||||
comment_store = {}
|
|
||||||
cursect = None # None, or a dictionary
|
|
||||||
optname = None
|
|
||||||
lineno = 0
|
|
||||||
e = None # None, or an exception
|
|
||||||
while True:
|
|
||||||
line = fp.readline()
|
|
||||||
if not line:
|
|
||||||
break
|
|
||||||
lineno = lineno + 1
|
|
||||||
# comment or blank line?
|
|
||||||
if line.strip() == '' :
|
|
||||||
continue
|
|
||||||
### store comments for doc purposes
|
|
||||||
### Deal with cases of sections and options being there or not
|
|
||||||
if line[0] in '#;' and cursect is not None:
|
|
||||||
if optname is None:
|
|
||||||
comment_store.setdefault(cursect['__name__'] +
|
|
||||||
"::" + "global",[]).append(line)
|
|
||||||
else:
|
|
||||||
comment_store.setdefault(cursect['__name__'] +
|
|
||||||
"::" + optname,[]).append(line)
|
|
||||||
continue
|
|
||||||
elif line[0] in '#;' and cursect is None:
|
|
||||||
comment_store.setdefault("global" +
|
|
||||||
"::" + optname,[]).append(line)
|
|
||||||
continue
|
|
||||||
|
|
||||||
if line.split(None, 1)[0].lower() == 'rem' and line[0] in "rR":
|
|
||||||
# no leading whitespace
|
|
||||||
continue
|
|
||||||
# continuation line?
|
|
||||||
if line[0].isspace() and cursect is not None and optname:
|
|
||||||
value = line.strip()
|
|
||||||
if value:
|
|
||||||
cursect[optname].append(value)
|
|
||||||
# a section header or option header?
|
|
||||||
else:
|
|
||||||
# is it a section header?
|
|
||||||
mo = self.SECTCRE.match(line)
|
|
||||||
if mo:
|
|
||||||
sectname = mo.group('header')
|
|
||||||
if sectname in self._sections:
|
|
||||||
cursect = self._sections[sectname]
|
|
||||||
elif sectname == DEFAULTSECT:
|
|
||||||
cursect = self._defaults
|
|
||||||
else:
|
|
||||||
cursect = self._dict()
|
|
||||||
cursect['__name__'] = sectname
|
|
||||||
self._sections[sectname] = cursect
|
|
||||||
# So sections can't start with a continuation line
|
|
||||||
optname = None
|
|
||||||
# no section header in the file?
|
|
||||||
elif cursect is None:
|
|
||||||
raise MissingSectionHeaderError(fpname, lineno, line)
|
|
||||||
# an option line?
|
|
||||||
else:
|
|
||||||
mo = self._optcre.match(line)
|
|
||||||
if mo:
|
|
||||||
optname, vi, optval = mo.group('option', 'vi', 'value')
|
|
||||||
optname = self.optionxform(optname.rstrip())
|
|
||||||
# This check is fine because the OPTCRE cannot
|
|
||||||
# match if it would set optval to None
|
|
||||||
if optval is not None:
|
|
||||||
if vi in ('=', ':') and ';' in optval:
|
|
||||||
# ';' is a comment delimiter only if it follows
|
|
||||||
# a spacing character
|
|
||||||
pos = optval.find(';')
|
|
||||||
if pos != -1 and optval[pos-1].isspace():
|
|
||||||
optval = optval[:pos]
|
|
||||||
optval = optval.strip()
|
|
||||||
# allow empty values
|
|
||||||
if optval == '""':
|
|
||||||
optval = ''
|
|
||||||
cursect[optname] = [optval]
|
|
||||||
else:
|
|
||||||
# valueless option handling
|
|
||||||
cursect[optname] = optval
|
|
||||||
else:
|
|
||||||
# a non-fatal parsing error occurred. set up the
|
|
||||||
# exception but keep going. the exception will be
|
|
||||||
# raised at the end of the file and will contain a
|
|
||||||
# list of all bogus lines
|
|
||||||
if not e:
|
|
||||||
e = ParsingError(fpname)
|
|
||||||
e.append(lineno, repr(line))
|
|
||||||
# if any parsing errors occurred, raise an exception
|
|
||||||
if e:
|
|
||||||
raise e
|
|
||||||
|
|
||||||
# join the multi-line values collected while reading
|
|
||||||
all_sections = [self._defaults]
|
|
||||||
all_sections.extend(self._sections.values())
|
|
||||||
for options in all_sections:
|
|
||||||
for name, val in options.items():
|
|
||||||
if isinstance(val, list):
|
|
||||||
options[name] = '\n'.join(val)
|
|
||||||
self.comment_store = comment_store
|
|
||||||
|
|
||||||
def ini_as_rst(self):
|
|
||||||
"""trivial helper function to putput comment_stroe as rest
|
|
||||||
|
|
||||||
.. todo:: write actual doctests with string input
|
|
||||||
>> p = ConfigParser2.SafeConfigParser()
|
|
||||||
>> p.read(f)
|
|
||||||
['/usr/home/pbrian/src/public/configparser2/example.ini']
|
|
||||||
>> open("/tmp/foo.rst", "w").write(p.ini_as_rst())
|
|
||||||
|
|
||||||
"""
|
|
||||||
outstr = ".. rst version of ini file\n\n"
|
|
||||||
_cursectname = None
|
|
||||||
for item in sorted(self.comment_store.keys()):
|
|
||||||
_sect, _opt = item.split("::")
|
|
||||||
if _sect != _cursectname:
|
|
||||||
outstr += "\n%s\n%s\n" % (_sect, "-"* len(_sect))
|
|
||||||
_cursectname = _sect
|
|
||||||
txt = " ".join(self.comment_store[item])
|
|
||||||
txt = txt.replace("#", "").replace(";","")
|
|
||||||
outstr += ":%s: %s" % (_opt, txt)
|
|
||||||
return outstr
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import UserDict as _UserDict
|
|
||||||
|
|
||||||
class _Chainmap(_UserDict.DictMixin):
|
|
||||||
"""Combine multiple mappings for successive lookups.
|
|
||||||
|
|
||||||
For example, to emulate Python's normal lookup sequence:
|
|
||||||
|
|
||||||
import __builtin__
|
|
||||||
pylookup = _Chainmap(locals(), globals(), vars(__builtin__))
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *maps):
|
|
||||||
self._maps = maps
|
|
||||||
|
|
||||||
def __getitem__(self, key):
|
|
||||||
for mapping in self._maps:
|
|
||||||
try:
|
|
||||||
return mapping[key]
|
|
||||||
except KeyError:
|
|
||||||
pass
|
|
||||||
raise KeyError(key)
|
|
||||||
|
|
||||||
def keys(self):
|
|
||||||
result = []
|
|
||||||
seen = set()
|
|
||||||
for mapping in self._maps:
|
|
||||||
for key in mapping:
|
|
||||||
if key not in seen:
|
|
||||||
result.append(key)
|
|
||||||
seen.add(key)
|
|
||||||
return result
|
|
||||||
|
|
||||||
class ConfigParser(RawConfigParser):
|
|
||||||
|
|
||||||
def get(self, section, option, raw=False, vars=None):
|
|
||||||
"""Get an option value for a given section.
|
|
||||||
|
|
||||||
If `vars' is provided, it must be a dictionary. The option is looked up
|
|
||||||
in `vars' (if provided), `section', and in `defaults' in that order.
|
|
||||||
|
|
||||||
All % interpolations are expanded in the return values, unless the
|
|
||||||
optional argument `raw' is true. Values for interpolation keys are
|
|
||||||
looked up in the same manner as the option.
|
|
||||||
|
|
||||||
The section DEFAULT is special.
|
|
||||||
"""
|
|
||||||
sectiondict = {}
|
|
||||||
try:
|
|
||||||
sectiondict = self._sections[section]
|
|
||||||
except KeyError:
|
|
||||||
if section != DEFAULTSECT:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
# Update with the entry specific variables
|
|
||||||
vardict = {}
|
|
||||||
if vars:
|
|
||||||
for key, value in vars.items():
|
|
||||||
vardict[self.optionxform(key)] = value
|
|
||||||
d = _Chainmap(vardict, sectiondict, self._defaults)
|
|
||||||
option = self.optionxform(option)
|
|
||||||
try:
|
|
||||||
value = d[option]
|
|
||||||
except KeyError:
|
|
||||||
raise NoOptionError(option, section)
|
|
||||||
|
|
||||||
if raw or value is None:
|
|
||||||
return value
|
|
||||||
else:
|
|
||||||
return self._interpolate(section, option, value, d)
|
|
||||||
|
|
||||||
def items(self, section, raw=False, vars=None):
|
|
||||||
"""Return a list of tuples with (name, value) for each option
|
|
||||||
in the section.
|
|
||||||
|
|
||||||
All % interpolations are expanded in the return values, based on the
|
|
||||||
defaults passed into the constructor, unless the optional argument
|
|
||||||
`raw' is true. Additional substitutions may be provided using the
|
|
||||||
`vars' argument, which must be a dictionary whose contents overrides
|
|
||||||
any pre-existing defaults.
|
|
||||||
|
|
||||||
The section DEFAULT is special.
|
|
||||||
"""
|
|
||||||
d = self._defaults.copy()
|
|
||||||
try:
|
|
||||||
d.update(self._sections[section])
|
|
||||||
except KeyError:
|
|
||||||
if section != DEFAULTSECT:
|
|
||||||
raise NoSectionError(section)
|
|
||||||
# Update with the entry specific variables
|
|
||||||
if vars:
|
|
||||||
for key, value in vars.items():
|
|
||||||
d[self.optionxform(key)] = value
|
|
||||||
options = d.keys()
|
|
||||||
if "__name__" in options:
|
|
||||||
options.remove("__name__")
|
|
||||||
if raw:
|
|
||||||
return [(option, d[option])
|
|
||||||
for option in options]
|
|
||||||
else:
|
|
||||||
return [(option, self._interpolate(section, option, d[option], d))
|
|
||||||
for option in options]
|
|
||||||
|
|
||||||
def _interpolate(self, section, option, rawval, vars):
|
|
||||||
# do the string interpolation
|
|
||||||
value = rawval
|
|
||||||
depth = MAX_INTERPOLATION_DEPTH
|
|
||||||
while depth: # Loop through this until it's done
|
|
||||||
depth -= 1
|
|
||||||
if value and "%(" in value:
|
|
||||||
value = self._KEYCRE.sub(self._interpolation_replace, value)
|
|
||||||
try:
|
|
||||||
value = value % vars
|
|
||||||
except KeyError, e:
|
|
||||||
raise InterpolationMissingOptionError(
|
|
||||||
option, section, rawval, e.args[0])
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
if value and "%(" in value:
|
|
||||||
raise InterpolationDepthError(option, section, rawval)
|
|
||||||
return value
|
|
||||||
|
|
||||||
_KEYCRE = re.compile(r"%\(([^)]*)\)s|.")
|
|
||||||
|
|
||||||
def _interpolation_replace(self, match):
|
|
||||||
s = match.group(1)
|
|
||||||
if s is None:
|
|
||||||
return match.group()
|
|
||||||
else:
|
|
||||||
return "%%(%s)s" % self.optionxform(s)
|
|
||||||
|
|
||||||
|
|
||||||
class SafeConfigParser(ConfigParser):
|
|
||||||
|
|
||||||
def _interpolate(self, section, option, rawval, vars):
|
|
||||||
# do the string interpolation
|
|
||||||
L = []
|
|
||||||
self._interpolate_some(option, L, rawval, section, vars, 1)
|
|
||||||
return ''.join(L)
|
|
||||||
|
|
||||||
_interpvar_re = re.compile(r"%\(([^)]+)\)s")
|
|
||||||
|
|
||||||
def _interpolate_some(self, option, accum, rest, section, map, depth):
|
|
||||||
if depth > MAX_INTERPOLATION_DEPTH:
|
|
||||||
raise InterpolationDepthError(option, section, rest)
|
|
||||||
while rest:
|
|
||||||
p = rest.find("%")
|
|
||||||
if p < 0:
|
|
||||||
accum.append(rest)
|
|
||||||
return
|
|
||||||
if p > 0:
|
|
||||||
accum.append(rest[:p])
|
|
||||||
rest = rest[p:]
|
|
||||||
# p is no longer used
|
|
||||||
c = rest[1:2]
|
|
||||||
if c == "%":
|
|
||||||
accum.append("%")
|
|
||||||
rest = rest[2:]
|
|
||||||
elif c == "(":
|
|
||||||
m = self._interpvar_re.match(rest)
|
|
||||||
if m is None:
|
|
||||||
raise InterpolationSyntaxError(option, section,
|
|
||||||
"bad interpolation variable reference %r" % rest)
|
|
||||||
var = self.optionxform(m.group(1))
|
|
||||||
rest = rest[m.end():]
|
|
||||||
try:
|
|
||||||
v = map[var]
|
|
||||||
except KeyError:
|
|
||||||
raise InterpolationMissingOptionError(
|
|
||||||
option, section, rest, var)
|
|
||||||
if "%" in v:
|
|
||||||
self._interpolate_some(option, accum, v,
|
|
||||||
section, map, depth + 1)
|
|
||||||
else:
|
|
||||||
accum.append(v)
|
|
||||||
else:
|
|
||||||
raise InterpolationSyntaxError(
|
|
||||||
option, section,
|
|
||||||
"'%%' must be followed by '%%' or '(', found: %r" % (rest,))
|
|
||||||
|
|
||||||
def set(self, section, option, value=None):
|
|
||||||
"""Set an option. Extend ConfigParser.set: check for string values."""
|
|
||||||
# The only legal non-string value if we allow valueless
|
|
||||||
# options is None, so we need to check if the value is a
|
|
||||||
# string if:
|
|
||||||
# - we do not allow valueless options, or
|
|
||||||
# - we allow valueless options but the value is not None
|
|
||||||
if self._optcre is self.OPTCRE or value:
|
|
||||||
if not isinstance(value, basestring):
|
|
||||||
raise TypeError("option values must be strings")
|
|
||||||
if value is not None:
|
|
||||||
# check for bad percent signs:
|
|
||||||
# first, replace all "good" interpolations
|
|
||||||
tmp_value = value.replace('%%', '')
|
|
||||||
tmp_value = self._interpvar_re.sub('', tmp_value)
|
|
||||||
# then, check if there's a lone percent sign left
|
|
||||||
if '%' in tmp_value:
|
|
||||||
raise ValueError("invalid interpolation syntax in %r at "
|
|
||||||
"position %d" % (value, tmp_value.find('%')))
|
|
||||||
ConfigParser.set(self, section, option, value)
|
|
|
@ -1,43 +0,0 @@
|
||||||
Behold, mortal, the origins of Beautiful Soup...
|
|
||||||
================================================
|
|
||||||
|
|
||||||
Leonard Richardson is the primary programmer.
|
|
||||||
|
|
||||||
Aaron DeVore is awesome.
|
|
||||||
|
|
||||||
Mark Pilgrim provided the encoding detection code that forms the base
|
|
||||||
of UnicodeDammit.
|
|
||||||
|
|
||||||
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
|
||||||
Soup 4 working under Python 3.
|
|
||||||
|
|
||||||
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
|
||||||
support CSS selectors.
|
|
||||||
|
|
||||||
Sam Ruby helped with a lot of edge cases.
|
|
||||||
|
|
||||||
Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
|
|
||||||
work in solving the nestable tags conundrum.
|
|
||||||
|
|
||||||
An incomplete list of people have contributed patches to Beautiful
|
|
||||||
Soup:
|
|
||||||
|
|
||||||
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
|
|
||||||
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
|
|
||||||
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
|
|
||||||
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
|
|
||||||
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
|
|
||||||
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
|
|
||||||
Webster, Paul Wright, Danny Yoo
|
|
||||||
|
|
||||||
An incomplete list of people who made suggestions or found bugs or
|
|
||||||
found ways to break Beautiful Soup:
|
|
||||||
|
|
||||||
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
|
||||||
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
|
||||||
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
|
||||||
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
|
||||||
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
|
||||||
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
|
|
||||||
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
|
||||||
Sousa Rocha, Yichun Wei, Per Vognsen
|
|
|
@ -1,27 +0,0 @@
|
||||||
Beautiful Soup is made available under the MIT license:
|
|
||||||
|
|
||||||
Copyright (c) 2004-2015 Leonard Richardson
|
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining
|
|
||||||
a copy of this software and associated documentation files (the
|
|
||||||
"Software"), to deal in the Software without restriction, including
|
|
||||||
without limitation the rights to use, copy, modify, merge, publish,
|
|
||||||
distribute, sublicense, and/or sell copies of the Software, and to
|
|
||||||
permit persons to whom the Software is furnished to do so, subject to
|
|
||||||
the following conditions:
|
|
||||||
|
|
||||||
The above copyright notice and this permission notice shall be
|
|
||||||
included in all copies or substantial portions of the Software.
|
|
||||||
|
|
||||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
||||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
|
||||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
||||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
|
||||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
|
||||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
||||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
||||||
SOFTWARE.
|
|
||||||
|
|
||||||
Beautiful Soup incorporates code from the html5lib library, which is
|
|
||||||
also made available under the MIT license. Copyright (c) 2006-2013
|
|
||||||
James Graham and other contributors
|
|
1190
libs/bs4/NEWS.txt
1190
libs/bs4/NEWS.txt
File diff suppressed because it is too large
Load Diff
|
@ -1,63 +0,0 @@
|
||||||
= Introduction =
|
|
||||||
|
|
||||||
>>> from bs4 import BeautifulSoup
|
|
||||||
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
|
|
||||||
>>> print soup.prettify()
|
|
||||||
<html>
|
|
||||||
<body>
|
|
||||||
<p>
|
|
||||||
Some
|
|
||||||
<b>
|
|
||||||
bad
|
|
||||||
<i>
|
|
||||||
HTML
|
|
||||||
</i>
|
|
||||||
</b>
|
|
||||||
</p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
>>> soup.find(text="bad")
|
|
||||||
u'bad'
|
|
||||||
|
|
||||||
>>> soup.i
|
|
||||||
<i>HTML</i>
|
|
||||||
|
|
||||||
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
|
|
||||||
>>> print soup.prettify()
|
|
||||||
<?xml version="1.0" encoding="utf-8">
|
|
||||||
<tag1>
|
|
||||||
Some
|
|
||||||
<tag2 />
|
|
||||||
bad
|
|
||||||
<tag3>
|
|
||||||
XML
|
|
||||||
</tag3>
|
|
||||||
</tag1>
|
|
||||||
|
|
||||||
= Full documentation =
|
|
||||||
|
|
||||||
The bs4/doc/ directory contains full documentation in Sphinx
|
|
||||||
format. Run "make html" in that directory to create HTML
|
|
||||||
documentation.
|
|
||||||
|
|
||||||
= Running the unit tests =
|
|
||||||
|
|
||||||
Beautiful Soup supports unit test discovery from the project root directory:
|
|
||||||
|
|
||||||
$ nosetests
|
|
||||||
|
|
||||||
$ python -m unittest discover -s bs4 # Python 2.7 and up
|
|
||||||
|
|
||||||
If you checked out the source tree, you should see a script in the
|
|
||||||
home directory called test-all-versions. This script will run the unit
|
|
||||||
tests under Python 2.7, then create a temporary Python 3 conversion of
|
|
||||||
the source and run the unit tests again under Python 3.
|
|
||||||
|
|
||||||
= Links =
|
|
||||||
|
|
||||||
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
|
|
||||||
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
|
||||||
http://readthedocs.org/docs/beautiful-soup-4/
|
|
||||||
Discussion group: http://groups.google.com/group/beautifulsoup/
|
|
||||||
Development: https://code.launchpad.net/beautifulsoup/
|
|
||||||
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
|
|
|
@ -1,31 +0,0 @@
|
||||||
Additions
|
|
||||||
---------
|
|
||||||
|
|
||||||
More of the jQuery API: nextUntil?
|
|
||||||
|
|
||||||
Optimizations
|
|
||||||
-------------
|
|
||||||
|
|
||||||
The html5lib tree builder doesn't use the standard tree-building API,
|
|
||||||
which worries me and has resulted in a number of bugs.
|
|
||||||
|
|
||||||
markup_attr_map can be optimized since it's always a map now.
|
|
||||||
|
|
||||||
Upon encountering UTF-16LE data or some other uncommon serialization
|
|
||||||
of Unicode, UnicodeDammit will convert the data to Unicode, then
|
|
||||||
encode it at UTF-8. This is wasteful because it will just get decoded
|
|
||||||
back to Unicode.
|
|
||||||
|
|
||||||
CDATA
|
|
||||||
-----
|
|
||||||
|
|
||||||
The elementtree XMLParser has a strip_cdata argument that, when set to
|
|
||||||
False, should allow Beautiful Soup to preserve CDATA sections instead
|
|
||||||
of treating them as text. Except it doesn't. (This argument is also
|
|
||||||
present for HTMLParser, and also does nothing there.)
|
|
||||||
|
|
||||||
Currently, htm5lib converts CDATA sections into comments. An
|
|
||||||
as-yet-unreleased version of html5lib changes the parser's handling of
|
|
||||||
CDATA sections to allow CDATA sections in tags like <svg> and
|
|
||||||
<math>. The HTML5TreeBuilder will need to be updated to create CData
|
|
||||||
objects instead of Comment objects in this situation.
|
|
|
@ -17,18 +17,17 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
|
||||||
# found in the LICENSE file.
|
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.6.0"
|
__version__ = "4.8.0"
|
||||||
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
||||||
|
# Use of this source code is governed by the MIT license.
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
import traceback
|
import traceback
|
||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
|
@ -50,7 +49,7 @@ from .element import (
|
||||||
|
|
||||||
# The very first thing we do is give a useful error if someone is
|
# The very first thing we do is give a useful error if someone is
|
||||||
# running this code under Python 3 without converting it.
|
# running this code under Python 3 without converting it.
|
||||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||||
|
|
||||||
class BeautifulSoup(Tag):
|
class BeautifulSoup(Tag):
|
||||||
"""
|
"""
|
||||||
|
@ -74,7 +73,7 @@ class BeautifulSoup(Tag):
|
||||||
like HTML's <br> tag), call handle_starttag and then
|
like HTML's <br> tag), call handle_starttag and then
|
||||||
handle_endtag.
|
handle_endtag.
|
||||||
"""
|
"""
|
||||||
ROOT_TAG_NAME = u'[document]'
|
ROOT_TAG_NAME = '[document]'
|
||||||
|
|
||||||
# If the end-user gives no indication which tree builder they
|
# If the end-user gives no indication which tree builder they
|
||||||
# want, look for one with these features.
|
# want, look for one with these features.
|
||||||
|
@ -82,16 +81,56 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
|
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
**kwargs):
|
**kwargs):
|
||||||
"""The Soup object is initialized as the 'root tag', and the
|
"""Constructor.
|
||||||
provided markup (which can be a string or a file-like object)
|
|
||||||
is fed into the underlying parser."""
|
:param markup: A string or a file-like object representing
|
||||||
|
markup to be parsed.
|
||||||
|
|
||||||
|
:param features: Desirable features of the parser to be used. This
|
||||||
|
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||||
|
"html.parser", or "html5lib") or it may be the type of markup
|
||||||
|
to be used ("html", "html5", "xml"). It's recommended that you
|
||||||
|
name a specific parser, so that Beautiful Soup gives you the
|
||||||
|
same results across platforms and virtual environments.
|
||||||
|
|
||||||
|
:param builder: A TreeBuilder subclass to instantiate (or
|
||||||
|
instance to use) instead of looking one up based on
|
||||||
|
`features`. You only need to use this if you've implemented a
|
||||||
|
custom TreeBuilder.
|
||||||
|
|
||||||
|
:param parse_only: A SoupStrainer. Only parts of the document
|
||||||
|
matching the SoupStrainer will be considered. This is useful
|
||||||
|
when parsing part of a document that would otherwise be too
|
||||||
|
large to fit into memory.
|
||||||
|
|
||||||
|
:param from_encoding: A string indicating the encoding of the
|
||||||
|
document to be parsed. Pass this in if Beautiful Soup is
|
||||||
|
guessing wrongly about the document's encoding.
|
||||||
|
|
||||||
|
:param exclude_encodings: A list of strings indicating
|
||||||
|
encodings known to be wrong. Pass this in if you don't know
|
||||||
|
the document's encoding but you know Beautiful Soup's guess is
|
||||||
|
wrong.
|
||||||
|
|
||||||
|
:param kwargs: For backwards compatibility purposes, the
|
||||||
|
constructor accepts certain keyword arguments used in
|
||||||
|
Beautiful Soup 3. None of these arguments do anything in
|
||||||
|
Beautiful Soup 4; they will result in a warning and then be ignored.
|
||||||
|
|
||||||
|
Apart from this, any keyword arguments passed into the BeautifulSoup
|
||||||
|
constructor are propagated to the TreeBuilder constructor. This
|
||||||
|
makes it possible to configure a TreeBuilder beyond saying
|
||||||
|
which one to use.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
if 'convertEntities' in kwargs:
|
if 'convertEntities' in kwargs:
|
||||||
|
del kwargs['convertEntities']
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
"BS4 does not respect the convertEntities argument to the "
|
"BS4 does not respect the convertEntities argument to the "
|
||||||
"BeautifulSoup constructor. Entities are always converted "
|
"BeautifulSoup constructor. Entities are always converted "
|
||||||
|
@ -142,18 +181,22 @@ class BeautifulSoup(Tag):
|
||||||
from_encoding = from_encoding or deprecated_argument(
|
from_encoding = from_encoding or deprecated_argument(
|
||||||
"fromEncoding", "from_encoding")
|
"fromEncoding", "from_encoding")
|
||||||
|
|
||||||
if from_encoding and isinstance(markup, unicode):
|
if from_encoding and isinstance(markup, str):
|
||||||
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||||
from_encoding = None
|
from_encoding = None
|
||||||
|
|
||||||
if len(kwargs) > 0:
|
# We need this information to track whether or not the builder
|
||||||
arg = kwargs.keys().pop()
|
# was specified well enough that we can omit the 'you need to
|
||||||
raise TypeError(
|
# specify a parser' warning.
|
||||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
original_builder = builder
|
||||||
|
original_features = features
|
||||||
if builder is None:
|
|
||||||
original_features = features
|
if isinstance(builder, type):
|
||||||
if isinstance(features, basestring):
|
# A builder class was passed in; it needs to be instantiated.
|
||||||
|
builder_class = builder
|
||||||
|
builder = None
|
||||||
|
elif builder is None:
|
||||||
|
if isinstance(features, str):
|
||||||
features = [features]
|
features = [features]
|
||||||
if features is None or len(features) == 0:
|
if features is None or len(features) == 0:
|
||||||
features = self.DEFAULT_BUILDER_FEATURES
|
features = self.DEFAULT_BUILDER_FEATURES
|
||||||
|
@ -163,41 +206,73 @@ class BeautifulSoup(Tag):
|
||||||
"Couldn't find a tree builder with the features you "
|
"Couldn't find a tree builder with the features you "
|
||||||
"requested: %s. Do you need to install a parser library?"
|
"requested: %s. Do you need to install a parser library?"
|
||||||
% ",".join(features))
|
% ",".join(features))
|
||||||
builder = builder_class()
|
|
||||||
if not (original_features == builder.NAME or
|
# At this point either we have a TreeBuilder instance in
|
||||||
original_features in builder.ALTERNATE_NAMES):
|
# builder, or we have a builder_class that we can instantiate
|
||||||
|
# with the remaining **kwargs.
|
||||||
|
if builder is None:
|
||||||
|
builder = builder_class(**kwargs)
|
||||||
|
if not original_builder and not (
|
||||||
|
original_features == builder.NAME or
|
||||||
|
original_features in builder.ALTERNATE_NAMES
|
||||||
|
):
|
||||||
if builder.is_xml:
|
if builder.is_xml:
|
||||||
markup_type = "XML"
|
markup_type = "XML"
|
||||||
else:
|
else:
|
||||||
markup_type = "HTML"
|
markup_type = "HTML"
|
||||||
|
|
||||||
caller = traceback.extract_stack()[0]
|
# This code adapted from warnings.py so that we get the same line
|
||||||
filename = caller[0]
|
# of code as our warnings.warn() call gets, even if the answer is wrong
|
||||||
line_number = caller[1]
|
# (as it may be in a multithreading situation).
|
||||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
caller = None
|
||||||
filename=filename,
|
try:
|
||||||
line_number=line_number,
|
caller = sys._getframe(1)
|
||||||
parser=builder.NAME,
|
except ValueError:
|
||||||
markup_type=markup_type))
|
pass
|
||||||
|
if caller:
|
||||||
|
globals = caller.f_globals
|
||||||
|
line_number = caller.f_lineno
|
||||||
|
else:
|
||||||
|
globals = sys.__dict__
|
||||||
|
line_number= 1
|
||||||
|
filename = globals.get('__file__')
|
||||||
|
if filename:
|
||||||
|
fnl = filename.lower()
|
||||||
|
if fnl.endswith((".pyc", ".pyo")):
|
||||||
|
filename = filename[:-1]
|
||||||
|
if filename:
|
||||||
|
# If there is no filename at all, the user is most likely in a REPL,
|
||||||
|
# and the warning is not necessary.
|
||||||
|
values = dict(
|
||||||
|
filename=filename,
|
||||||
|
line_number=line_number,
|
||||||
|
parser=builder.NAME,
|
||||||
|
markup_type=markup_type
|
||||||
|
)
|
||||||
|
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||||
|
else:
|
||||||
|
if kwargs:
|
||||||
|
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||||
|
|
||||||
self.builder = builder
|
self.builder = builder
|
||||||
self.is_xml = builder.is_xml
|
self.is_xml = builder.is_xml
|
||||||
self.known_xml = self.is_xml
|
self.known_xml = self.is_xml
|
||||||
self.builder.soup = self
|
self._namespaces = dict()
|
||||||
|
|
||||||
self.parse_only = parse_only
|
self.parse_only = parse_only
|
||||||
|
|
||||||
|
self.builder.initialize_soup(self)
|
||||||
|
|
||||||
if hasattr(markup, 'read'): # It's a file-type object.
|
if hasattr(markup, 'read'): # It's a file-type object.
|
||||||
markup = markup.read()
|
markup = markup.read()
|
||||||
elif len(markup) <= 256 and (
|
elif len(markup) <= 256 and (
|
||||||
(isinstance(markup, bytes) and not b'<' in markup)
|
(isinstance(markup, bytes) and not b'<' in markup)
|
||||||
or (isinstance(markup, unicode) and not u'<' in markup)
|
or (isinstance(markup, str) and not '<' in markup)
|
||||||
):
|
):
|
||||||
# Print out warnings for a couple beginner problems
|
# Print out warnings for a couple beginner problems
|
||||||
# involving passing non-markup to Beautiful Soup.
|
# involving passing non-markup to Beautiful Soup.
|
||||||
# Beautiful Soup will still parse the input as markup,
|
# Beautiful Soup will still parse the input as markup,
|
||||||
# just in case that's what the user really wants.
|
# just in case that's what the user really wants.
|
||||||
if (isinstance(markup, unicode)
|
if (isinstance(markup, str)
|
||||||
and not os.path.supports_unicode_filenames):
|
and not os.path.supports_unicode_filenames):
|
||||||
possible_filename = markup.encode("utf8")
|
possible_filename = markup.encode("utf8")
|
||||||
else:
|
else:
|
||||||
|
@ -205,13 +280,13 @@ class BeautifulSoup(Tag):
|
||||||
is_file = False
|
is_file = False
|
||||||
try:
|
try:
|
||||||
is_file = os.path.exists(possible_filename)
|
is_file = os.path.exists(possible_filename)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
# This is almost certainly a problem involving
|
# This is almost certainly a problem involving
|
||||||
# characters not valid in filenames on this
|
# characters not valid in filenames on this
|
||||||
# system. Just let it go.
|
# system. Just let it go.
|
||||||
pass
|
pass
|
||||||
if is_file:
|
if is_file:
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
markup = markup.encode("utf8")
|
markup = markup.encode("utf8")
|
||||||
warnings.warn(
|
warnings.warn(
|
||||||
'"%s" looks like a filename, not markup. You should'
|
'"%s" looks like a filename, not markup. You should'
|
||||||
|
@ -263,9 +338,9 @@ class BeautifulSoup(Tag):
|
||||||
if isinstance(markup, bytes):
|
if isinstance(markup, bytes):
|
||||||
space = b' '
|
space = b' '
|
||||||
cant_start_with = (b"http:", b"https:")
|
cant_start_with = (b"http:", b"https:")
|
||||||
elif isinstance(markup, unicode):
|
elif isinstance(markup, str):
|
||||||
space = u' '
|
space = ' '
|
||||||
cant_start_with = (u"http:", u"https:")
|
cant_start_with = ("http:", "https:")
|
||||||
else:
|
else:
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -302,9 +377,10 @@ class BeautifulSoup(Tag):
|
||||||
self.preserve_whitespace_tag_stack = []
|
self.preserve_whitespace_tag_stack = []
|
||||||
self.pushTag(self)
|
self.pushTag(self)
|
||||||
|
|
||||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
|
||||||
"""Create a new tag associated with this soup."""
|
"""Create a new tag associated with this soup."""
|
||||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
kwattrs.update(attrs)
|
||||||
|
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
|
||||||
|
|
||||||
def new_string(self, s, subclass=NavigableString):
|
def new_string(self, s, subclass=NavigableString):
|
||||||
"""Create a new NavigableString associated with this soup."""
|
"""Create a new NavigableString associated with this soup."""
|
||||||
|
@ -327,7 +403,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def pushTag(self, tag):
|
def pushTag(self, tag):
|
||||||
#print "Push", tag.name
|
#print "Push", tag.name
|
||||||
if self.currentTag:
|
if self.currentTag is not None:
|
||||||
self.currentTag.contents.append(tag)
|
self.currentTag.contents.append(tag)
|
||||||
self.tagStack.append(tag)
|
self.tagStack.append(tag)
|
||||||
self.currentTag = self.tagStack[-1]
|
self.currentTag = self.tagStack[-1]
|
||||||
|
@ -336,7 +412,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def endData(self, containerClass=NavigableString):
|
def endData(self, containerClass=NavigableString):
|
||||||
if self.current_data:
|
if self.current_data:
|
||||||
current_data = u''.join(self.current_data)
|
current_data = ''.join(self.current_data)
|
||||||
# If whitespace is not preserved, and this string contains
|
# If whitespace is not preserved, and this string contains
|
||||||
# nothing but ASCII spaces, replace it with a single space
|
# nothing but ASCII spaces, replace it with a single space
|
||||||
# or newline.
|
# or newline.
|
||||||
|
@ -366,60 +442,71 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||||
"""Add an object to the parse tree."""
|
"""Add an object to the parse tree."""
|
||||||
parent = parent or self.currentTag
|
if parent is None:
|
||||||
previous_element = most_recent_element or self._most_recent_element
|
parent = self.currentTag
|
||||||
|
if most_recent_element is not None:
|
||||||
|
previous_element = most_recent_element
|
||||||
|
else:
|
||||||
|
previous_element = self._most_recent_element
|
||||||
|
|
||||||
next_element = previous_sibling = next_sibling = None
|
next_element = previous_sibling = next_sibling = None
|
||||||
if isinstance(o, Tag):
|
if isinstance(o, Tag):
|
||||||
next_element = o.next_element
|
next_element = o.next_element
|
||||||
next_sibling = o.next_sibling
|
next_sibling = o.next_sibling
|
||||||
previous_sibling = o.previous_sibling
|
previous_sibling = o.previous_sibling
|
||||||
if not previous_element:
|
if previous_element is None:
|
||||||
previous_element = o.previous_element
|
previous_element = o.previous_element
|
||||||
|
|
||||||
|
fix = parent.next_element is not None
|
||||||
|
|
||||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||||
|
|
||||||
self._most_recent_element = o
|
self._most_recent_element = o
|
||||||
parent.contents.append(o)
|
parent.contents.append(o)
|
||||||
|
|
||||||
if parent.next_sibling:
|
# Check if we are inserting into an already parsed node.
|
||||||
# This node is being inserted into an element that has
|
if fix:
|
||||||
# already been parsed. Deal with any dangling references.
|
self._linkage_fixer(parent)
|
||||||
index = len(parent.contents)-1
|
|
||||||
while index >= 0:
|
|
||||||
if parent.contents[index] is o:
|
|
||||||
break
|
|
||||||
index -= 1
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"Error building tree: supposedly %r was inserted "
|
|
||||||
"into %r after the fact, but I don't see it!" % (
|
|
||||||
o, parent
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if index == 0:
|
|
||||||
previous_element = parent
|
|
||||||
previous_sibling = None
|
|
||||||
else:
|
|
||||||
previous_element = previous_sibling = parent.contents[index-1]
|
|
||||||
if index == len(parent.contents)-1:
|
|
||||||
next_element = parent.next_sibling
|
|
||||||
next_sibling = None
|
|
||||||
else:
|
|
||||||
next_element = next_sibling = parent.contents[index+1]
|
|
||||||
|
|
||||||
o.previous_element = previous_element
|
def _linkage_fixer(self, el):
|
||||||
if previous_element:
|
"""Make sure linkage of this fragment is sound."""
|
||||||
previous_element.next_element = o
|
|
||||||
o.next_element = next_element
|
first = el.contents[0]
|
||||||
if next_element:
|
child = el.contents[-1]
|
||||||
next_element.previous_element = o
|
descendant = child
|
||||||
o.next_sibling = next_sibling
|
|
||||||
if next_sibling:
|
if child is first and el.parent is not None:
|
||||||
next_sibling.previous_sibling = o
|
# Parent should be linked to first child
|
||||||
o.previous_sibling = previous_sibling
|
el.next_element = child
|
||||||
if previous_sibling:
|
# We are no longer linked to whatever this element is
|
||||||
previous_sibling.next_sibling = o
|
prev_el = child.previous_element
|
||||||
|
if prev_el is not None and prev_el is not el:
|
||||||
|
prev_el.next_element = None
|
||||||
|
# First child should be linked to the parent, and no previous siblings.
|
||||||
|
child.previous_element = el
|
||||||
|
child.previous_sibling = None
|
||||||
|
|
||||||
|
# We have no sibling as we've been appended as the last.
|
||||||
|
child.next_sibling = None
|
||||||
|
|
||||||
|
# This index is a tag, dig deeper for a "last descendant"
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = child._last_descendant(False)
|
||||||
|
|
||||||
|
# As the final step, link last descendant. It should be linked
|
||||||
|
# to the parent's next sibling (if found), else walk up the chain
|
||||||
|
# and find a parent with a sibling. It should have no next sibling.
|
||||||
|
descendant.next_element = None
|
||||||
|
descendant.next_sibling = None
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
descendant.next_element = target.next_sibling
|
||||||
|
target.next_sibling.previous_element = child
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||||
"""Pops the tag stack up to and including the most recent
|
"""Pops the tag stack up to and including the most recent
|
||||||
|
@ -465,7 +552,7 @@ class BeautifulSoup(Tag):
|
||||||
self.currentTag, self._most_recent_element)
|
self.currentTag, self._most_recent_element)
|
||||||
if tag is None:
|
if tag is None:
|
||||||
return tag
|
return tag
|
||||||
if self._most_recent_element:
|
if self._most_recent_element is not None:
|
||||||
self._most_recent_element.next_element = tag
|
self._most_recent_element.next_element = tag
|
||||||
self._most_recent_element = tag
|
self._most_recent_element = tag
|
||||||
self.pushTag(tag)
|
self.pushTag(tag)
|
||||||
|
@ -490,9 +577,9 @@ class BeautifulSoup(Tag):
|
||||||
encoding_part = ''
|
encoding_part = ''
|
||||||
if eventual_encoding != None:
|
if eventual_encoding != None:
|
||||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||||
else:
|
else:
|
||||||
prefix = u''
|
prefix = ''
|
||||||
if not pretty_print:
|
if not pretty_print:
|
||||||
indent_level = None
|
indent_level = None
|
||||||
else:
|
else:
|
||||||
|
@ -526,4 +613,4 @@ class FeatureNotFound(ValueError):
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import sys
|
import sys
|
||||||
soup = BeautifulSoup(sys.stdin)
|
soup = BeautifulSoup(sys.stdin)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
__license__ = "MIT"
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
import itertools
|
import itertools
|
||||||
|
@ -7,8 +7,7 @@ import sys
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
CharsetMetaAttributeValue,
|
CharsetMetaAttributeValue,
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
HTMLAwareEntitySubstitution,
|
nonwhitespace_re
|
||||||
whitespace_re
|
|
||||||
)
|
)
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
@ -90,18 +89,46 @@ class TreeBuilder(object):
|
||||||
|
|
||||||
is_xml = False
|
is_xml = False
|
||||||
picklable = False
|
picklable = False
|
||||||
preserve_whitespace_tags = set()
|
|
||||||
empty_element_tags = None # A tag will be considered an empty-element
|
empty_element_tags = None # A tag will be considered an empty-element
|
||||||
# tag when and only when it has no contents.
|
# tag when and only when it has no contents.
|
||||||
|
|
||||||
# A value for these tag/attribute combinations is a space- or
|
# A value for these tag/attribute combinations is a space- or
|
||||||
# comma-separated list of CDATA, rather than a single CDATA.
|
# comma-separated list of CDATA, rather than a single CDATA.
|
||||||
cdata_list_attributes = {}
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||||
|
|
||||||
|
USE_DEFAULT = object()
|
||||||
|
|
||||||
|
def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
|
||||||
|
"""Constructor.
|
||||||
|
|
||||||
def __init__(self):
|
:param multi_valued_attributes: If this is set to None, the
|
||||||
|
TreeBuilder will not turn any values for attributes like
|
||||||
|
'class' into lists. Setting this do a dictionary will
|
||||||
|
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
for an example.
|
||||||
|
|
||||||
|
Internally, these are called "CDATA list attributes", but that
|
||||||
|
probably doesn't make sense to an end-user, so the argument name
|
||||||
|
is `multi_valued_attributes`.
|
||||||
|
|
||||||
|
:param preserve_whitespace_tags:
|
||||||
|
"""
|
||||||
self.soup = None
|
self.soup = None
|
||||||
|
if multi_valued_attributes is self.USE_DEFAULT:
|
||||||
|
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||||
|
self.cdata_list_attributes = multi_valued_attributes
|
||||||
|
if preserve_whitespace_tags is self.USE_DEFAULT:
|
||||||
|
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
|
||||||
|
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""The BeautifulSoup object has been initialized and is now
|
||||||
|
being associated with the TreeBuilder.
|
||||||
|
"""
|
||||||
|
self.soup = soup
|
||||||
|
|
||||||
def reset(self):
|
def reset(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -125,7 +152,7 @@ class TreeBuilder(object):
|
||||||
if self.empty_element_tags is None:
|
if self.empty_element_tags is None:
|
||||||
return True
|
return True
|
||||||
return tag_name in self.empty_element_tags
|
return tag_name in self.empty_element_tags
|
||||||
|
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
|
@ -160,14 +187,14 @@ class TreeBuilder(object):
|
||||||
universal = self.cdata_list_attributes.get('*', [])
|
universal = self.cdata_list_attributes.get('*', [])
|
||||||
tag_specific = self.cdata_list_attributes.get(
|
tag_specific = self.cdata_list_attributes.get(
|
||||||
tag_name.lower(), None)
|
tag_name.lower(), None)
|
||||||
for attr in attrs.keys():
|
for attr in list(attrs.keys()):
|
||||||
if attr in universal or (tag_specific and attr in tag_specific):
|
if attr in universal or (tag_specific and attr in tag_specific):
|
||||||
# We have a "class"-type attribute whose string
|
# We have a "class"-type attribute whose string
|
||||||
# value is a whitespace-separated list of
|
# value is a whitespace-separated list of
|
||||||
# values. Split it into a list.
|
# values. Split it into a list.
|
||||||
value = attrs[attr]
|
value = attrs[attr]
|
||||||
if isinstance(value, basestring):
|
if isinstance(value, str):
|
||||||
values = whitespace_re.split(value)
|
values = nonwhitespace_re.findall(value)
|
||||||
else:
|
else:
|
||||||
# html5lib sometimes calls setAttributes twice
|
# html5lib sometimes calls setAttributes twice
|
||||||
# for the same tag when rearranging the parse
|
# for the same tag when rearranging the parse
|
||||||
|
@ -231,15 +258,20 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
Such as which tags are empty-element tags.
|
Such as which tags are empty-element tags.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
|
||||||
empty_element_tags = set([
|
empty_element_tags = set([
|
||||||
# These are from HTML5.
|
# These are from HTML5.
|
||||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
|
||||||
# These are from HTML4, removed in HTML5.
|
# These are from earlier versions of HTML and are removed in HTML5.
|
||||||
'spacer', 'frame'
|
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||||
])
|
])
|
||||||
|
|
||||||
|
# The HTML standard defines these as block-level elements. Beautiful
|
||||||
|
# Soup does not treat these elements differently from other elements,
|
||||||
|
# but it may do so eventually, and this information is available if
|
||||||
|
# you need to use it.
|
||||||
|
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
# class="foo bar" means that the 'class' attribute has two values,
|
# class="foo bar" means that the 'class' attribute has two values,
|
||||||
|
@ -247,7 +279,7 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
# encounter one of these attributes, we will parse its value into
|
# encounter one of these attributes, we will parse its value into
|
||||||
# a list of values if possible. Upon output, the list will be
|
# a list of values if possible. Upon output, the list will be
|
||||||
# converted back into a string.
|
# converted back into a string.
|
||||||
cdata_list_attributes = {
|
DEFAULT_CDATA_LIST_ATTRIBUTES = {
|
||||||
"*" : ['class', 'accesskey', 'dropzone'],
|
"*" : ['class', 'accesskey', 'dropzone'],
|
||||||
"a" : ['rel', 'rev'],
|
"a" : ['rel', 'rev'],
|
||||||
"link" : ['rel', 'rev'],
|
"link" : ['rel', 'rev'],
|
||||||
|
@ -264,6 +296,8 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
"output" : ["for"],
|
"output" : ["for"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||||
|
|
||||||
def set_up_substitutions(self, tag):
|
def set_up_substitutions(self, tag):
|
||||||
# We are only interested in <meta> tags
|
# We are only interested in <meta> tags
|
||||||
if tag.name != 'meta':
|
if tag.name != 'meta':
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTML5TreeBuilder',
|
'HTML5TreeBuilder',
|
||||||
|
@ -15,7 +15,7 @@ from bs4.builder import (
|
||||||
)
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
NamespacedAttribute,
|
NamespacedAttribute,
|
||||||
whitespace_re,
|
nonwhitespace_re,
|
||||||
)
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import (
|
from html5lib.constants import (
|
||||||
|
@ -33,7 +33,7 @@ try:
|
||||||
# Pre-0.99999999
|
# Pre-0.99999999
|
||||||
from html5lib.treebuilders import _base as treebuilder_base
|
from html5lib.treebuilders import _base as treebuilder_base
|
||||||
new_html5lib = False
|
new_html5lib = False
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
# 0.99999999 and up
|
# 0.99999999 and up
|
||||||
from html5lib.treebuilders import base as treebuilder_base
|
from html5lib.treebuilders import base as treebuilder_base
|
||||||
new_html5lib = True
|
new_html5lib = True
|
||||||
|
@ -64,7 +64,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||||
|
|
||||||
extra_kwargs = dict()
|
extra_kwargs = dict()
|
||||||
if not isinstance(markup, unicode):
|
if not isinstance(markup, str):
|
||||||
if new_html5lib:
|
if new_html5lib:
|
||||||
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||||
else:
|
else:
|
||||||
|
@ -72,13 +72,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
doc = parser.parse(markup, **extra_kwargs)
|
doc = parser.parse(markup, **extra_kwargs)
|
||||||
|
|
||||||
# Set the character encoding detected by the tokenizer.
|
# Set the character encoding detected by the tokenizer.
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
# We need to special-case this because html5lib sets
|
# We need to special-case this because html5lib sets
|
||||||
# charEncoding to UTF-8 if it gets Unicode input.
|
# charEncoding to UTF-8 if it gets Unicode input.
|
||||||
doc.original_encoding = None
|
doc.original_encoding = None
|
||||||
else:
|
else:
|
||||||
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||||
if not isinstance(original_encoding, basestring):
|
if not isinstance(original_encoding, str):
|
||||||
# In 0.99999999 and up, the encoding is an html5lib
|
# In 0.99999999 and up, the encoding is an html5lib
|
||||||
# Encoding object. We want to use a string for compatibility
|
# Encoding object. We want to use a string for compatibility
|
||||||
# with other tree builders.
|
# with other tree builders.
|
||||||
|
@ -92,7 +92,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
return '<html><head></head><body>%s</body></html>' % fragment
|
||||||
|
|
||||||
|
|
||||||
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
|
@ -174,7 +174,7 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
if element.attrs:
|
if element.attrs:
|
||||||
attributes = []
|
attributes = []
|
||||||
for name, value in element.attrs.items():
|
for name, value in list(element.attrs.items()):
|
||||||
if isinstance(name, NamespacedAttribute):
|
if isinstance(name, NamespacedAttribute):
|
||||||
name = "%s %s" % (prefixes[name.namespace], name.name)
|
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||||
if isinstance(value, list):
|
if isinstance(value, list):
|
||||||
|
@ -199,14 +199,14 @@ class AttrList(object):
|
||||||
def __setitem__(self, name, value):
|
def __setitem__(self, name, value):
|
||||||
# If this attribute is a multi-valued attribute for this element,
|
# If this attribute is a multi-valued attribute for this element,
|
||||||
# turn its value into a list.
|
# turn its value into a list.
|
||||||
list_attr = HTML5TreeBuilder.cdata_list_attributes
|
list_attr = self.element.cdata_list_attributes
|
||||||
if (name in list_attr['*']
|
if (name in list_attr['*']
|
||||||
or (self.element.name in list_attr
|
or (self.element.name in list_attr
|
||||||
and name in list_attr[self.element.name])):
|
and name in list_attr[self.element.name])):
|
||||||
# A node that is being cloned may have already undergone
|
# A node that is being cloned may have already undergone
|
||||||
# this procedure.
|
# this procedure.
|
||||||
if not isinstance(value, list):
|
if not isinstance(value, list):
|
||||||
value = whitespace_re.split(value)
|
value = nonwhitespace_re.findall(value)
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
def items(self):
|
def items(self):
|
||||||
return list(self.attrs.items())
|
return list(self.attrs.items())
|
||||||
|
@ -229,7 +229,7 @@ class Element(treebuilder_base.Node):
|
||||||
|
|
||||||
def appendChild(self, node):
|
def appendChild(self, node):
|
||||||
string_child = child = None
|
string_child = child = None
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Some other piece of code decided to pass in a string
|
# Some other piece of code decided to pass in a string
|
||||||
# instead of creating a TextElement object to contain the
|
# instead of creating a TextElement object to contain the
|
||||||
# string.
|
# string.
|
||||||
|
@ -246,10 +246,10 @@ class Element(treebuilder_base.Node):
|
||||||
child = node.element
|
child = node.element
|
||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
if not isinstance(child, basestring) and child.parent is not None:
|
if not isinstance(child, str) and child.parent is not None:
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
|
|
||||||
if (string_child and self.element.contents
|
if (string_child is not None and self.element.contents
|
||||||
and self.element.contents[-1].__class__ == NavigableString):
|
and self.element.contents[-1].__class__ == NavigableString):
|
||||||
# We are appending a string onto another string.
|
# We are appending a string onto another string.
|
||||||
# TODO This has O(n^2) performance, for input like
|
# TODO This has O(n^2) performance, for input like
|
||||||
|
@ -259,7 +259,7 @@ class Element(treebuilder_base.Node):
|
||||||
old_element.replace_with(new_element)
|
old_element.replace_with(new_element)
|
||||||
self.soup._most_recent_element = new_element
|
self.soup._most_recent_element = new_element
|
||||||
else:
|
else:
|
||||||
if isinstance(node, basestring):
|
if isinstance(node, str):
|
||||||
# Create a brand new NavigableString from this string.
|
# Create a brand new NavigableString from this string.
|
||||||
child = self.soup.new_string(node)
|
child = self.soup.new_string(node)
|
||||||
|
|
||||||
|
@ -299,7 +299,7 @@ class Element(treebuilder_base.Node):
|
||||||
|
|
||||||
self.soup.builder._replace_cdata_list_attribute_values(
|
self.soup.builder._replace_cdata_list_attribute_values(
|
||||||
self.name, attributes)
|
self.name, attributes)
|
||||||
for name, value in attributes.items():
|
for name, value in list(attributes.items()):
|
||||||
self.element[name] = value
|
self.element[name] = value
|
||||||
|
|
||||||
# The attributes may contain variables that need substitution.
|
# The attributes may contain variables that need substitution.
|
||||||
|
@ -360,16 +360,16 @@ class Element(treebuilder_base.Node):
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
first_child = to_append[0]
|
first_child = to_append[0]
|
||||||
if new_parents_last_descendant:
|
if new_parents_last_descendant is not None:
|
||||||
first_child.previous_element = new_parents_last_descendant
|
first_child.previous_element = new_parents_last_descendant
|
||||||
else:
|
else:
|
||||||
first_child.previous_element = new_parent_element
|
first_child.previous_element = new_parent_element
|
||||||
first_child.previous_sibling = new_parents_last_child
|
first_child.previous_sibling = new_parents_last_child
|
||||||
if new_parents_last_descendant:
|
if new_parents_last_descendant is not None:
|
||||||
new_parents_last_descendant.next_element = first_child
|
new_parents_last_descendant.next_element = first_child
|
||||||
else:
|
else:
|
||||||
new_parent_element.next_element = first_child
|
new_parent_element.next_element = first_child
|
||||||
if new_parents_last_child:
|
if new_parents_last_child is not None:
|
||||||
new_parents_last_child.next_sibling = first_child
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Find the very last element being moved. It is now the
|
# Find the very last element being moved. It is now the
|
||||||
|
@ -379,7 +379,7 @@ class Element(treebuilder_base.Node):
|
||||||
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||||
|
|
||||||
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||||
if new_parents_last_descendant_next_element:
|
if new_parents_last_descendant_next_element is not None:
|
||||||
# TODO: This code has no test coverage and I'm not sure
|
# TODO: This code has no test coverage and I'm not sure
|
||||||
# how to get html5lib to go through this path, but it's
|
# how to get html5lib to go through this path, but it's
|
||||||
# just the other side of the previous line.
|
# just the other side of the previous line.
|
||||||
|
|
|
@ -1,17 +1,18 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||||
|
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'HTMLParserTreeBuilder',
|
'HTMLParserTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
from HTMLParser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from HTMLParser import HTMLParseError
|
from html.parser import HTMLParseError
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||||
class HTMLParseError(Exception):
|
class HTMLParseError(Exception):
|
||||||
|
@ -64,7 +65,18 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
# order. It's a list of closing tags we've already handled and
|
# order. It's a list of closing tags we've already handled and
|
||||||
# will ignore, assuming they ever show up.
|
# will ignore, assuming they ever show up.
|
||||||
self.already_closed_empty_element = []
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
|
def error(self, msg):
|
||||||
|
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
||||||
|
requirement doesn't appear to be documented.
|
||||||
|
|
||||||
|
In Python 2, HTMLParser implements error() as raising an exception.
|
||||||
|
|
||||||
|
In any event, this method is called only on very strange markup and our best strategy
|
||||||
|
is to pretend it didn't happen and keep going.
|
||||||
|
"""
|
||||||
|
warnings.warn(msg)
|
||||||
|
|
||||||
def handle_startendtag(self, name, attrs):
|
def handle_startendtag(self, name, attrs):
|
||||||
# This is only called when the markup looks like
|
# This is only called when the markup looks like
|
||||||
# <tag/>.
|
# <tag/>.
|
||||||
|
@ -129,11 +141,26 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
else:
|
else:
|
||||||
real_name = int(name)
|
real_name = int(name)
|
||||||
|
|
||||||
try:
|
data = None
|
||||||
data = unichr(real_name)
|
if real_name < 256:
|
||||||
except (ValueError, OverflowError), e:
|
# HTML numeric entities are supposed to reference Unicode
|
||||||
data = u"\N{REPLACEMENT CHARACTER}"
|
# code points, but sometimes they reference code points in
|
||||||
|
# some other encoding (ahem, Windows-1252). E.g. “
|
||||||
|
# instead of É for LEFT DOUBLE QUOTATION MARK. This
|
||||||
|
# code tries to detect this situation and compensate.
|
||||||
|
for encoding in (self.soup.original_encoding, 'windows-1252'):
|
||||||
|
if not encoding:
|
||||||
|
continue
|
||||||
|
try:
|
||||||
|
data = bytearray([real_name]).decode(encoding)
|
||||||
|
except UnicodeDecodeError as e:
|
||||||
|
pass
|
||||||
|
if not data:
|
||||||
|
try:
|
||||||
|
data = chr(real_name)
|
||||||
|
except (ValueError, OverflowError) as e:
|
||||||
|
pass
|
||||||
|
data = data or "\N{REPLACEMENT CHARACTER}"
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_entityref(self, name):
|
def handle_entityref(self, name):
|
||||||
|
@ -141,7 +168,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
if character is not None:
|
if character is not None:
|
||||||
data = character
|
data = character
|
||||||
else:
|
else:
|
||||||
data = "&%s;" % name
|
# If this were XML, it would be ambiguous whether "&foo"
|
||||||
|
# was an character entity reference with a missing
|
||||||
|
# semicolon or the literal string "&foo". Since this is
|
||||||
|
# HTML, we have a complete list of all character entity references,
|
||||||
|
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||||
|
data = "&%s" % name
|
||||||
self.handle_data(data)
|
self.handle_data(data)
|
||||||
|
|
||||||
def handle_comment(self, data):
|
def handle_comment(self, data):
|
||||||
|
@ -182,12 +214,15 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
NAME = HTMLPARSER
|
NAME = HTMLPARSER
|
||||||
features = [NAME, HTML, STRICT]
|
features = [NAME, HTML, STRICT]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||||
|
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||||
|
parser_args = parser_args or []
|
||||||
|
parser_kwargs = parser_kwargs or {}
|
||||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||||
kwargs['strict'] = False
|
parser_kwargs['strict'] = False
|
||||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||||
kwargs['convert_charrefs'] = False
|
parser_kwargs['convert_charrefs'] = False
|
||||||
self.parser_args = (args, kwargs)
|
self.parser_args = (parser_args, parser_kwargs)
|
||||||
|
|
||||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||||
document_declared_encoding=None, exclude_encodings=None):
|
document_declared_encoding=None, exclude_encodings=None):
|
||||||
|
@ -196,7 +231,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
declared within markup, whether any characters had to be
|
declared within markup, whether any characters had to be
|
||||||
replaced with REPLACEMENT CHARACTER).
|
replaced with REPLACEMENT CHARACTER).
|
||||||
"""
|
"""
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
yield (markup, None, None, False)
|
yield (markup, None, None, False)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -213,7 +248,8 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
parser.soup = self.soup
|
parser.soup = self.soup
|
||||||
try:
|
try:
|
||||||
parser.feed(markup)
|
parser.feed(markup)
|
||||||
except HTMLParseError, e:
|
parser.close()
|
||||||
|
except HTMLParseError as e:
|
||||||
warnings.warn(RuntimeWarning(
|
warnings.warn(RuntimeWarning(
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
raise e
|
raise e
|
||||||
|
|
|
@ -1,13 +1,18 @@
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'LXMLTreeBuilderForXML',
|
'LXMLTreeBuilderForXML',
|
||||||
'LXMLTreeBuilder',
|
'LXMLTreeBuilder',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from collections.abc import Callable # Python 3.6
|
||||||
|
except ImportError as e:
|
||||||
|
from collections import Callable
|
||||||
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
import collections
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
|
@ -28,6 +33,10 @@ from bs4.dammit import EncodingDetector
|
||||||
|
|
||||||
LXML = 'lxml'
|
LXML = 'lxml'
|
||||||
|
|
||||||
|
def _invert(d):
|
||||||
|
"Invert a dictionary."
|
||||||
|
return dict((v,k) for k, v in list(d.items()))
|
||||||
|
|
||||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||||
|
|
||||||
|
@ -44,7 +53,29 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
# This namespace mapping is specified in the XML Namespace
|
# This namespace mapping is specified in the XML Namespace
|
||||||
# standard.
|
# standard.
|
||||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
|
||||||
|
|
||||||
|
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
"""Let the BeautifulSoup object know about the standard namespace
|
||||||
|
mapping.
|
||||||
|
"""
|
||||||
|
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||||
|
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||||
|
|
||||||
|
def _register_namespaces(self, mapping):
|
||||||
|
"""Let the BeautifulSoup object know about namespaces encountered
|
||||||
|
while parsing the document.
|
||||||
|
|
||||||
|
This might be useful later on when creating CSS selectors.
|
||||||
|
"""
|
||||||
|
for key, value in list(mapping.items()):
|
||||||
|
if key and key not in self.soup._namespaces:
|
||||||
|
# Let the BeautifulSoup object know about a new namespace.
|
||||||
|
# If there are multiple namespaces defined with the same
|
||||||
|
# prefix, the first one in the document takes precedence.
|
||||||
|
self.soup._namespaces[key] = value
|
||||||
|
|
||||||
def default_parser(self, encoding):
|
def default_parser(self, encoding):
|
||||||
# This can either return a parser object or a class, which
|
# This can either return a parser object or a class, which
|
||||||
|
@ -58,12 +89,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# Use the default parser.
|
# Use the default parser.
|
||||||
parser = self.default_parser(encoding)
|
parser = self.default_parser(encoding)
|
||||||
|
|
||||||
if isinstance(parser, collections.Callable):
|
if isinstance(parser, Callable):
|
||||||
# Instantiate the parser with default arguments
|
# Instantiate the parser with default arguments
|
||||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
def __init__(self, parser=None, empty_element_tags=None):
|
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
||||||
# TODO: Issue a warning if parser is present but not a
|
# TODO: Issue a warning if parser is present but not a
|
||||||
# callable, since that means there's no way to create new
|
# callable, since that means there's no way to create new
|
||||||
# parsers for different encodings.
|
# parsers for different encodings.
|
||||||
|
@ -71,8 +102,9 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if empty_element_tags is not None:
|
if empty_element_tags is not None:
|
||||||
self.empty_element_tags = set(empty_element_tags)
|
self.empty_element_tags = set(empty_element_tags)
|
||||||
self.soup = None
|
self.soup = None
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||||
|
|
||||||
def _getNsTag(self, tag):
|
def _getNsTag(self, tag):
|
||||||
# Split the namespace URL out of a fully-qualified lxml tag
|
# Split the namespace URL out of a fully-qualified lxml tag
|
||||||
# name. Copied from lxml's src/lxml/sax.py.
|
# name. Copied from lxml's src/lxml/sax.py.
|
||||||
|
@ -101,12 +133,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
else:
|
else:
|
||||||
self.processing_instruction_class = XMLProcessingInstruction
|
self.processing_instruction_class = XMLProcessingInstruction
|
||||||
|
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||||
# this system?
|
# this system?
|
||||||
yield markup, None, document_declared_encoding, False
|
yield markup, None, document_declared_encoding, False
|
||||||
|
|
||||||
if isinstance(markup, unicode):
|
if isinstance(markup, str):
|
||||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||||
# tell lxml to parse it as UTF-8.
|
# tell lxml to parse it as UTF-8.
|
||||||
yield (markup.encode("utf8"), "utf8",
|
yield (markup.encode("utf8"), "utf8",
|
||||||
|
@ -121,7 +153,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
def feed(self, markup):
|
def feed(self, markup):
|
||||||
if isinstance(markup, bytes):
|
if isinstance(markup, bytes):
|
||||||
markup = BytesIO(markup)
|
markup = BytesIO(markup)
|
||||||
elif isinstance(markup, unicode):
|
elif isinstance(markup, str):
|
||||||
markup = StringIO(markup)
|
markup = StringIO(markup)
|
||||||
|
|
||||||
# Call feed() at least once, even if the markup is empty,
|
# Call feed() at least once, even if the markup is empty,
|
||||||
|
@ -136,30 +168,36 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
if len(data) != 0:
|
if len(data) != 0:
|
||||||
self.parser.feed(data)
|
self.parser.feed(data)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(str(e))
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||||
|
|
||||||
def start(self, name, attrs, nsmap={}):
|
def start(self, name, attrs, nsmap={}):
|
||||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||||
attrs = dict(attrs)
|
attrs = dict(attrs)
|
||||||
nsprefix = None
|
nsprefix = None
|
||||||
# Invert each namespace map as it comes in.
|
# Invert each namespace map as it comes in.
|
||||||
if len(self.nsmaps) > 1:
|
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||||
# There are no new namespaces for this tag, but
|
# There are no new namespaces for this tag, but
|
||||||
# non-default namespaces are in play, so we need a
|
# non-default namespaces are in play, so we need a
|
||||||
# separate tag stack to know when they end.
|
# separate tag stack to know when they end.
|
||||||
self.nsmaps.append(None)
|
self.nsmaps.append(None)
|
||||||
elif len(nsmap) > 0:
|
elif len(nsmap) > 0:
|
||||||
# A new namespace mapping has come into play.
|
# A new namespace mapping has come into play.
|
||||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
|
||||||
self.nsmaps.append(inverted_nsmap)
|
# First, Let the BeautifulSoup object know about it.
|
||||||
|
self._register_namespaces(nsmap)
|
||||||
|
|
||||||
|
# Then, add it to our running list of inverted namespace
|
||||||
|
# mappings.
|
||||||
|
self.nsmaps.append(_invert(nsmap))
|
||||||
|
|
||||||
# Also treat the namespace mapping as a set of attributes on the
|
# Also treat the namespace mapping as a set of attributes on the
|
||||||
# tag, so we can recreate it later.
|
# tag, so we can recreate it later.
|
||||||
attrs = attrs.copy()
|
attrs = attrs.copy()
|
||||||
for prefix, namespace in nsmap.items():
|
for prefix, namespace in list(nsmap.items()):
|
||||||
attribute = NamespacedAttribute(
|
attribute = NamespacedAttribute(
|
||||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||||
attrs[attribute] = namespace
|
attrs[attribute] = namespace
|
||||||
|
@ -168,7 +206,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
# from lxml with namespaces attached to their names, and
|
# from lxml with namespaces attached to their names, and
|
||||||
# turn then into NamespacedAttribute objects.
|
# turn then into NamespacedAttribute objects.
|
||||||
new_attrs = {}
|
new_attrs = {}
|
||||||
for attr, value in attrs.items():
|
for attr, value in list(attrs.items()):
|
||||||
namespace, attr = self._getNsTag(attr)
|
namespace, attr = self._getNsTag(attr)
|
||||||
if namespace is None:
|
if namespace is None:
|
||||||
new_attrs[attr] = value
|
new_attrs[attr] = value
|
||||||
|
@ -228,7 +266,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||||
|
|
||||||
|
|
||||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
|
@ -249,10 +287,10 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||||
self.parser = self.parser_for(encoding)
|
self.parser = self.parser_for(encoding)
|
||||||
self.parser.feed(markup)
|
self.parser.feed(markup)
|
||||||
self.parser.close()
|
self.parser.close()
|
||||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||||
raise ParserRejectedMarkup(str(e))
|
raise ParserRejectedMarkup(str(e))
|
||||||
|
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
"""See `TreeBuilder`."""
|
"""See `TreeBuilder`."""
|
||||||
return u'<html><body>%s</body></html>' % fragment
|
return '<html><body>%s</body></html>' % fragment
|
||||||
|
|
|
@ -6,12 +6,11 @@ necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||||
"""
|
"""
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
import codecs
|
import codecs
|
||||||
from htmlentitydefs import codepoint2name
|
from html.entities import codepoint2name
|
||||||
import re
|
import re
|
||||||
import logging
|
import logging
|
||||||
import string
|
import string
|
||||||
|
@ -46,9 +45,9 @@ except ImportError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
xml_encoding_re = re.compile(
|
xml_encoding_re = re.compile(
|
||||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
|
||||||
html_meta_re = re.compile(
|
html_meta_re = re.compile(
|
||||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||||
|
|
||||||
class EntitySubstitution(object):
|
class EntitySubstitution(object):
|
||||||
|
|
||||||
|
@ -58,15 +57,24 @@ class EntitySubstitution(object):
|
||||||
lookup = {}
|
lookup = {}
|
||||||
reverse_lookup = {}
|
reverse_lookup = {}
|
||||||
characters_for_re = []
|
characters_for_re = []
|
||||||
for codepoint, name in list(codepoint2name.items()):
|
|
||||||
character = unichr(codepoint)
|
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
|
||||||
if codepoint != 34:
|
# entity. We don't want to use it, but we want to recognize it on the way in.
|
||||||
|
#
|
||||||
|
# TODO: Ideally we would be able to recognize all HTML 5 named
|
||||||
|
# entities, but that's a little tricky.
|
||||||
|
extra = [(39, 'apos')]
|
||||||
|
for codepoint, name in list(codepoint2name.items()) + extra:
|
||||||
|
character = chr(codepoint)
|
||||||
|
if codepoint not in (34, 39):
|
||||||
# There's no point in turning the quotation mark into
|
# There's no point in turning the quotation mark into
|
||||||
# ", unless it happens within an attribute value, which
|
# " or the single quote into ', unless it
|
||||||
# is handled elsewhere.
|
# happens within an attribute value, which is handled
|
||||||
|
# elsewhere.
|
||||||
characters_for_re.append(character)
|
characters_for_re.append(character)
|
||||||
lookup[character] = name
|
lookup[character] = name
|
||||||
# But we do want to turn " into the quotation mark.
|
# But we do want to recognize those entities on the way in and
|
||||||
|
# convert them to Unicode characters.
|
||||||
reverse_lookup[name] = character
|
reverse_lookup[name] = character
|
||||||
re_definition = "[%s]" % "".join(characters_for_re)
|
re_definition = "[%s]" % "".join(characters_for_re)
|
||||||
return lookup, reverse_lookup, re.compile(re_definition)
|
return lookup, reverse_lookup, re.compile(re_definition)
|
||||||
|
@ -82,7 +90,7 @@ class EntitySubstitution(object):
|
||||||
}
|
}
|
||||||
|
|
||||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
|
||||||
")")
|
")")
|
||||||
|
|
||||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||||
|
@ -274,7 +282,7 @@ class EncodingDetector:
|
||||||
def strip_byte_order_mark(cls, data):
|
def strip_byte_order_mark(cls, data):
|
||||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||||
encoding = None
|
encoding = None
|
||||||
if isinstance(data, unicode):
|
if isinstance(data, str):
|
||||||
# Unicode data cannot have a byte-order mark.
|
# Unicode data cannot have a byte-order mark.
|
||||||
return data, encoding
|
return data, encoding
|
||||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||||
|
@ -352,9 +360,9 @@ class UnicodeDammit:
|
||||||
markup, override_encodings, is_html, exclude_encodings)
|
markup, override_encodings, is_html, exclude_encodings)
|
||||||
|
|
||||||
# Short-circuit if the data is in Unicode to begin with.
|
# Short-circuit if the data is in Unicode to begin with.
|
||||||
if isinstance(markup, unicode) or markup == '':
|
if isinstance(markup, str) or markup == '':
|
||||||
self.markup = markup
|
self.markup = markup
|
||||||
self.unicode_markup = unicode(markup)
|
self.unicode_markup = str(markup)
|
||||||
self.original_encoding = None
|
self.original_encoding = None
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -438,7 +446,7 @@ class UnicodeDammit:
|
||||||
def _to_unicode(self, data, encoding, errors="strict"):
|
def _to_unicode(self, data, encoding, errors="strict"):
|
||||||
'''Given a string and its encoding, decodes the string into Unicode.
|
'''Given a string and its encoding, decodes the string into Unicode.
|
||||||
%encoding is a string recognized by encodings.aliases'''
|
%encoding is a string recognized by encodings.aliases'''
|
||||||
return unicode(data, encoding, errors)
|
return str(data, encoding, errors)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def declared_html_encoding(self):
|
def declared_html_encoding(self):
|
||||||
|
|
|
@ -1,12 +1,11 @@
|
||||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||||
|
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
from StringIO import StringIO
|
from io import StringIO
|
||||||
from HTMLParser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
import bs4
|
import bs4
|
||||||
from bs4 import BeautifulSoup, __version__
|
from bs4 import BeautifulSoup, __version__
|
||||||
from bs4.builder import builder_registry
|
from bs4.builder import builder_registry
|
||||||
|
@ -22,8 +21,8 @@ import cProfile
|
||||||
|
|
||||||
def diagnose(data):
|
def diagnose(data):
|
||||||
"""Diagnostic suite for isolating common problems."""
|
"""Diagnostic suite for isolating common problems."""
|
||||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||||
print "Python version %s" % sys.version
|
print("Python version %s" % sys.version)
|
||||||
|
|
||||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||||
for name in basic_parsers:
|
for name in basic_parsers:
|
||||||
|
@ -32,16 +31,16 @@ def diagnose(data):
|
||||||
break
|
break
|
||||||
else:
|
else:
|
||||||
basic_parsers.remove(name)
|
basic_parsers.remove(name)
|
||||||
print (
|
print((
|
||||||
"I noticed that %s is not installed. Installing it may help." %
|
"I noticed that %s is not installed. Installing it may help." %
|
||||||
name)
|
name))
|
||||||
|
|
||||||
if 'lxml' in basic_parsers:
|
if 'lxml' in basic_parsers:
|
||||||
basic_parsers.append(["lxml", "xml"])
|
basic_parsers.append("lxml-xml")
|
||||||
try:
|
try:
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
print (
|
print (
|
||||||
"lxml is not installed or couldn't be imported.")
|
"lxml is not installed or couldn't be imported.")
|
||||||
|
|
||||||
|
@ -49,37 +48,43 @@ def diagnose(data):
|
||||||
if 'html5lib' in basic_parsers:
|
if 'html5lib' in basic_parsers:
|
||||||
try:
|
try:
|
||||||
import html5lib
|
import html5lib
|
||||||
print "Found html5lib version %s" % html5lib.__version__
|
print("Found html5lib version %s" % html5lib.__version__)
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
print (
|
print (
|
||||||
"html5lib is not installed or couldn't be imported.")
|
"html5lib is not installed or couldn't be imported.")
|
||||||
|
|
||||||
if hasattr(data, 'read'):
|
if hasattr(data, 'read'):
|
||||||
data = data.read()
|
data = data.read()
|
||||||
elif os.path.exists(data):
|
|
||||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
|
||||||
with open(data) as fp:
|
|
||||||
data = fp.read()
|
|
||||||
elif data.startswith("http:") or data.startswith("https:"):
|
elif data.startswith("http:") or data.startswith("https:"):
|
||||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||||
return
|
return
|
||||||
print
|
else:
|
||||||
|
try:
|
||||||
|
if os.path.exists(data):
|
||||||
|
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||||
|
with open(data) as fp:
|
||||||
|
data = fp.read()
|
||||||
|
except ValueError:
|
||||||
|
# This can happen on some platforms when the 'filename' is
|
||||||
|
# too long. Assume it's data and not a filename.
|
||||||
|
pass
|
||||||
|
print()
|
||||||
|
|
||||||
for parser in basic_parsers:
|
for parser in basic_parsers:
|
||||||
print "Trying to parse your markup with %s" % parser
|
print("Trying to parse your markup with %s" % parser)
|
||||||
success = False
|
success = False
|
||||||
try:
|
try:
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, features=parser)
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "Here's what %s did with the markup:" % parser
|
print("Here's what %s did with the markup:" % parser)
|
||||||
print soup.prettify()
|
print(soup.prettify())
|
||||||
|
|
||||||
print "-" * 80
|
print("-" * 80)
|
||||||
|
|
||||||
def lxml_trace(data, html=True, **kwargs):
|
def lxml_trace(data, html=True, **kwargs):
|
||||||
"""Print out the lxml events that occur during parsing.
|
"""Print out the lxml events that occur during parsing.
|
||||||
|
@ -89,7 +94,7 @@ def lxml_trace(data, html=True, **kwargs):
|
||||||
"""
|
"""
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||||
|
|
||||||
class AnnouncingParser(HTMLParser):
|
class AnnouncingParser(HTMLParser):
|
||||||
"""Announces HTMLParser parse events, without doing anything else."""
|
"""Announces HTMLParser parse events, without doing anything else."""
|
||||||
|
@ -149,7 +154,7 @@ def rword(length=5):
|
||||||
|
|
||||||
def rsentence(length=4):
|
def rsentence(length=4):
|
||||||
"Generate a random sentence-like string."
|
"Generate a random sentence-like string."
|
||||||
return " ".join(rword(random.randint(4,9)) for i in range(length))
|
return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
|
||||||
|
|
||||||
def rdoc(num_elements=1000):
|
def rdoc(num_elements=1000):
|
||||||
"""Randomly generate an invalid HTML document."""
|
"""Randomly generate an invalid HTML document."""
|
||||||
|
@ -171,9 +176,9 @@ def rdoc(num_elements=1000):
|
||||||
|
|
||||||
def benchmark_parsers(num_elements=100000):
|
def benchmark_parsers(num_elements=100000):
|
||||||
"""Very basic head-to-head performance benchmark."""
|
"""Very basic head-to-head performance benchmark."""
|
||||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||||
data = rdoc(num_elements)
|
data = rdoc(num_elements)
|
||||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||||
|
|
||||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||||
success = False
|
success = False
|
||||||
|
@ -182,24 +187,24 @@ def benchmark_parsers(num_elements=100000):
|
||||||
soup = BeautifulSoup(data, parser)
|
soup = BeautifulSoup(data, parser)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
success = True
|
success = True
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
print "%s could not parse the markup." % parser
|
print("%s could not parse the markup." % parser)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
if success:
|
if success:
|
||||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
a = time.time()
|
a = time.time()
|
||||||
etree.HTML(data)
|
etree.HTML(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
import html5lib
|
import html5lib
|
||||||
parser = html5lib.HTMLParser()
|
parser = html5lib.HTMLParser()
|
||||||
a = time.time()
|
a = time.time()
|
||||||
parser.parse(data)
|
parser.parse(data)
|
||||||
b = time.time()
|
b = time.time()
|
||||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||||
|
|
||||||
def profile(num_elements=100000, parser="lxml"):
|
def profile(num_elements=100000, parser="lxml"):
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1,7 +1,7 @@
|
||||||
|
# encoding: utf-8
|
||||||
"""Helper classes for tests."""
|
"""Helper classes for tests."""
|
||||||
|
|
||||||
# Use of this source code is governed by a BSD-style license that can be
|
# Use of this source code is governed by the MIT license.
|
||||||
# found in the LICENSE file.
|
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
|
@ -16,29 +16,66 @@ from bs4.element import (
|
||||||
ContentMetaAttributeValue,
|
ContentMetaAttributeValue,
|
||||||
Doctype,
|
Doctype,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
|
Tag
|
||||||
)
|
)
|
||||||
|
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
default_builder = HTMLParserTreeBuilder
|
default_builder = HTMLParserTreeBuilder
|
||||||
|
|
||||||
|
BAD_DOCUMENT = """A bare string
|
||||||
|
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
|
||||||
|
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
|
||||||
|
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
|
||||||
|
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
|
||||||
|
<div>A <meta> tag</div>
|
||||||
|
<div>A <br> tag that supposedly has contents.</br></div>
|
||||||
|
<div>AT&T</div>
|
||||||
|
<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
|
||||||
|
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
|
||||||
|
<div>This numeric entity is missing the final semicolon: <x t="piñata"></div>
|
||||||
|
<div><a href="http://example.com/</a> that attribute value never got closed</div>
|
||||||
|
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
|
||||||
|
<! This document starts with a bogus declaration ><div>a</div>
|
||||||
|
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
|
||||||
|
<div>This document ends with <!an incomplete declaration
|
||||||
|
<div><a style={height:21px;}>That attribute value was bogus</a></div>
|
||||||
|
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
|
||||||
|
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
|
||||||
|
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
|
||||||
|
<div>This document ends before the entity finishes: >
|
||||||
|
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
|
||||||
|
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
|
||||||
|
<div><table><tr><td>Here's a table</td></tr></table></div>
|
||||||
|
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
|
||||||
|
<div>This tag contains nothing but whitespace: <b> </b></div>
|
||||||
|
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
|
||||||
|
<div><table><div>This table contains bare markup</div></table></div>
|
||||||
|
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
|
||||||
|
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
|
||||||
|
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
|
||||||
|
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
|
||||||
|
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
class SoupTest(unittest.TestCase):
|
class SoupTest(unittest.TestCase):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return default_builder()
|
return default_builder
|
||||||
|
|
||||||
def soup(self, markup, **kwargs):
|
def soup(self, markup, **kwargs):
|
||||||
"""Build a Beautiful Soup object from markup."""
|
"""Build a Beautiful Soup object from markup."""
|
||||||
builder = kwargs.pop('builder', self.default_builder)
|
builder = kwargs.pop('builder', self.default_builder)
|
||||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||||
|
|
||||||
def document_for(self, markup):
|
def document_for(self, markup, **kwargs):
|
||||||
"""Turn an HTML fragment into a document.
|
"""Turn an HTML fragment into a document.
|
||||||
|
|
||||||
The details depend on the builder.
|
The details depend on the builder.
|
||||||
"""
|
"""
|
||||||
return self.default_builder.test_fragment_to_document(markup)
|
return self.default_builder(**kwargs).test_fragment_to_document(markup)
|
||||||
|
|
||||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||||
builder = self.default_builder
|
builder = self.default_builder
|
||||||
|
@ -59,6 +96,121 @@ class SoupTest(unittest.TestCase):
|
||||||
self.assertEqual(earlier, e.previous_element)
|
self.assertEqual(earlier, e.previous_element)
|
||||||
earlier = e
|
earlier = e
|
||||||
|
|
||||||
|
def linkage_validator(self, el, _recursive_call=False):
|
||||||
|
"""Ensure proper linkage throughout the document."""
|
||||||
|
descendant = None
|
||||||
|
# Document element should have no previous element or previous sibling.
|
||||||
|
# It also shouldn't have a next sibling.
|
||||||
|
if el.parent is None:
|
||||||
|
assert el.previous_element is None,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_element, None
|
||||||
|
)
|
||||||
|
assert el.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.previous_sibling, None
|
||||||
|
)
|
||||||
|
assert el.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
idx = 0
|
||||||
|
child = None
|
||||||
|
last_child = None
|
||||||
|
last_idx = len(el.contents) - 1
|
||||||
|
for child in el.contents:
|
||||||
|
descendant = None
|
||||||
|
|
||||||
|
# Parent should link next element to their first child
|
||||||
|
# That child should have no previous sibling
|
||||||
|
if idx == 0:
|
||||||
|
if el.parent is not None:
|
||||||
|
assert el.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||||
|
el, el.next_element, child
|
||||||
|
)
|
||||||
|
assert child.previous_element is el,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_element, el
|
||||||
|
)
|
||||||
|
assert child.previous_sibling is None,\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
|
||||||
|
child, child.previous_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# If not the first child, previous index should link as sibling to this index
|
||||||
|
# Previous element should match the last index or the last bubbled up descendant
|
||||||
|
else:
|
||||||
|
assert child.previous_sibling is el.contents[idx - 1],\
|
||||||
|
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
|
||||||
|
child, child.previous_sibling, el.contents[idx - 1]
|
||||||
|
)
|
||||||
|
assert el.contents[idx - 1].next_sibling is child,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if last_child is not None:
|
||||||
|
assert child.previous_element is last_child,\
|
||||||
|
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
|
||||||
|
child, child.previous_element, last_child, child.parent.contents
|
||||||
|
)
|
||||||
|
assert last_child.next_element is child,\
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
last_child, last_child.next_element, child
|
||||||
|
)
|
||||||
|
|
||||||
|
if isinstance(child, Tag) and child.contents:
|
||||||
|
descendant = self.linkage_validator(child, True)
|
||||||
|
# A bubbled up descendant should have no next siblings
|
||||||
|
assert descendant.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
descendant, descendant.next_sibling, None
|
||||||
|
)
|
||||||
|
|
||||||
|
# Mark last child as either the bubbled up descendant or the current child
|
||||||
|
if descendant is not None:
|
||||||
|
last_child = descendant
|
||||||
|
else:
|
||||||
|
last_child = child
|
||||||
|
|
||||||
|
# If last child, there are non next siblings
|
||||||
|
if idx == last_idx:
|
||||||
|
assert child.next_sibling is None,\
|
||||||
|
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_sibling, None
|
||||||
|
)
|
||||||
|
idx += 1
|
||||||
|
|
||||||
|
child = descendant if descendant is not None else child
|
||||||
|
if child is None:
|
||||||
|
child = el
|
||||||
|
|
||||||
|
if not _recursive_call and child is not None:
|
||||||
|
target = el
|
||||||
|
while True:
|
||||||
|
if target is None:
|
||||||
|
assert child.next_element is None, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, None
|
||||||
|
)
|
||||||
|
break
|
||||||
|
elif target.next_sibling is not None:
|
||||||
|
assert child.next_element is target.next_sibling, \
|
||||||
|
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||||
|
child, child.next_element, target.next_sibling
|
||||||
|
)
|
||||||
|
break
|
||||||
|
target = target.parent
|
||||||
|
|
||||||
|
# We are done, so nothing to return
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
# Return the child to the recursive caller
|
||||||
|
return child
|
||||||
|
|
||||||
|
|
||||||
class HTMLTreeBuilderSmokeTest(object):
|
class HTMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
"""A basic test of a treebuilder's competence.
|
"""A basic test of a treebuilder's competence.
|
||||||
|
@ -80,7 +232,7 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup("")
|
soup = self.soup("")
|
||||||
new_tag = soup.new_tag(name)
|
new_tag = soup.new_tag(name)
|
||||||
self.assertEqual(True, new_tag.is_empty_element)
|
self.assertEqual(True, new_tag.is_empty_element)
|
||||||
|
|
||||||
def test_pickle_and_unpickle_identity(self):
|
def test_pickle_and_unpickle_identity(self):
|
||||||
# Pickling a tree, then unpickling it, yields a tree identical
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
# to the original.
|
# to the original.
|
||||||
|
@ -150,12 +302,20 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
soup.encode("utf-8").replace(b"\n", b""),
|
soup.encode("utf-8").replace(b"\n", b""),
|
||||||
markup.replace(b"\n", b""))
|
markup.replace(b"\n", b""))
|
||||||
|
|
||||||
|
def test_namespaced_html(self):
|
||||||
|
"""When a namespaced XML document is parsed as HTML it should
|
||||||
|
be treated as HTML with weird tag names.
|
||||||
|
"""
|
||||||
|
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(2, len(soup.find_all("ns1:foo")))
|
||||||
|
|
||||||
def test_processing_instruction(self):
|
def test_processing_instruction(self):
|
||||||
# We test both Unicode and bytestring to verify that
|
# We test both Unicode and bytestring to verify that
|
||||||
# process_markup correctly sets processing_instruction_class
|
# process_markup correctly sets processing_instruction_class
|
||||||
# even when the markup is already Unicode and there is no
|
# even when the markup is already Unicode and there is no
|
||||||
# need to process anything.
|
# need to process anything.
|
||||||
markup = u"""<?PITarget PIContent?>"""
|
markup = """<?PITarget PIContent?>"""
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(markup, soup.decode())
|
self.assertEqual(markup, soup.decode())
|
||||||
|
|
||||||
|
@ -292,6 +452,18 @@ Hello, world!
|
||||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||||
|
|
||||||
|
def test_multivalued_attribute_with_whitespace(self):
|
||||||
|
# Whitespace separating the values of a multi-valued attribute
|
||||||
|
# should be ignored.
|
||||||
|
|
||||||
|
markup = '<div class=" foo bar "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(['foo', 'bar'], soup.div['class'])
|
||||||
|
|
||||||
|
# If you search by the literal name of the class it's like the whitespace
|
||||||
|
# wasn't there.
|
||||||
|
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
|
||||||
|
|
||||||
def test_deeply_nested_multivalued_attribute(self):
|
def test_deeply_nested_multivalued_attribute(self):
|
||||||
# html5lib can set the attributes of the same tag many times
|
# html5lib can set the attributes of the same tag many times
|
||||||
# as it rearranges the tree. This has caused problems with
|
# as it rearranges the tree. This has caused problems with
|
||||||
|
@ -311,15 +483,41 @@ Hello, world!
|
||||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||||
|
|
||||||
|
def test_strings_resembling_character_entity_references(self):
|
||||||
|
# "&T" and "&p" look like incomplete character entities, but they are
|
||||||
|
# not.
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>• AT&T is in the s&p 500</p>",
|
||||||
|
"<p>\u2022 AT&T is in the s&p 500</p>"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_apos_entity(self):
|
||||||
|
self.assertSoupEquals(
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
"<p>Bob's Bar</p>",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# “ and ” are invalid numeric entities referencing
|
||||||
|
# Windows-1252 characters. - references a character common
|
||||||
|
# to Windows-1252 and Unicode, and ☃ references a
|
||||||
|
# character only found in Unicode.
|
||||||
|
#
|
||||||
|
# All of these entities should be converted to Unicode
|
||||||
|
# characters.
|
||||||
|
markup = "<p>“Hello” -☃</p>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual("“Hello” -☃", soup.p.string)
|
||||||
|
|
||||||
def test_entities_in_attributes_converted_to_unicode(self):
|
def test_entities_in_attributes_converted_to_unicode(self):
|
||||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||||
|
|
||||||
def test_entities_in_text_converted_to_unicode(self):
|
def test_entities_in_text_converted_to_unicode(self):
|
||||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||||
|
@ -330,11 +528,11 @@ Hello, world!
|
||||||
'<p>I said "good day!"</p>')
|
'<p>I said "good day!"</p>')
|
||||||
|
|
||||||
def test_out_of_range_entity(self):
|
def test_out_of_range_entity(self):
|
||||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
expect = "\N{REPLACEMENT CHARACTER}"
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
self.assertSoupEquals("�", expect)
|
self.assertSoupEquals("�", expect)
|
||||||
|
|
||||||
def test_multipart_strings(self):
|
def test_multipart_strings(self):
|
||||||
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||||
|
@ -408,9 +606,9 @@ Hello, world!
|
||||||
# A seemingly innocuous document... but it's in Unicode! And
|
# A seemingly innocuous document... but it's in Unicode! And
|
||||||
# it contains characters that can't be represented in the
|
# it contains characters that can't be represented in the
|
||||||
# encoding found in the declaration! The horror!
|
# encoding found in the declaration! The horror!
|
||||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
|
||||||
|
|
||||||
def test_soupstrainer(self):
|
def test_soupstrainer(self):
|
||||||
"""Parsers should be able to work with SoupStrainers."""
|
"""Parsers should be able to work with SoupStrainers."""
|
||||||
|
@ -450,7 +648,7 @@ Hello, world!
|
||||||
# Both XML and HTML entities are converted to Unicode characters
|
# Both XML and HTML entities are converted to Unicode characters
|
||||||
# during parsing.
|
# during parsing.
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||||
self.assertSoupEquals(text, expected)
|
self.assertSoupEquals(text, expected)
|
||||||
|
|
||||||
def test_smart_quotes_converted_on_the_way_in(self):
|
def test_smart_quotes_converted_on_the_way_in(self):
|
||||||
|
@ -460,15 +658,15 @@ Hello, world!
|
||||||
soup = self.soup(quote)
|
soup = self.soup(quote)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.p.string,
|
soup.p.string,
|
||||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||||
|
|
||||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||||
soup = self.soup("<a> </a>")
|
soup = self.soup("<a> </a>")
|
||||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
|
||||||
|
|
||||||
def test_entities_converted_on_the_way_out(self):
|
def test_entities_converted_on_the_way_out(self):
|
||||||
text = "<p><<sacré bleu!>></p>"
|
text = "<p><<sacré bleu!>></p>"
|
||||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||||
soup = self.soup(text)
|
soup = self.soup(text)
|
||||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||||
|
|
||||||
|
@ -477,7 +675,7 @@ Hello, world!
|
||||||
# easy-to-understand document.
|
# easy-to-understand document.
|
||||||
|
|
||||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||||
|
|
||||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||||
# that to test.
|
# that to test.
|
||||||
|
@ -586,6 +784,13 @@ Hello, world!
|
||||||
data.a['foo'] = 'bar'
|
data.a['foo'] = 'bar'
|
||||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class XMLTreeBuilderSmokeTest(object):
|
class XMLTreeBuilderSmokeTest(object):
|
||||||
|
|
||||||
def test_pickle_and_unpickle_identity(self):
|
def test_pickle_and_unpickle_identity(self):
|
||||||
|
@ -624,6 +829,17 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.encode("utf-8"), markup)
|
soup.encode("utf-8"), markup)
|
||||||
|
|
||||||
|
def test_nested_namespaces(self):
|
||||||
|
doc = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||||
|
<parent xmlns="http://ns1/">
|
||||||
|
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
|
||||||
|
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
|
||||||
|
</child>
|
||||||
|
</parent>"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
self.assertEqual(doc, soup.encode())
|
||||||
|
|
||||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||||
doc = """
|
doc = """
|
||||||
<script type="text/javascript">
|
<script type="text/javascript">
|
||||||
|
@ -637,15 +853,15 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
self.assertTrue(b"< < hey > >" in encoded)
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
def test_can_parse_unicode_document(self):
|
def test_can_parse_unicode_document(self):
|
||||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
|
||||||
|
|
||||||
def test_popping_namespaced_tag(self):
|
def test_popping_namespaced_tag(self):
|
||||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
unicode(soup.rss), markup)
|
str(soup.rss), markup)
|
||||||
|
|
||||||
def test_docstring_includes_correct_encoding(self):
|
def test_docstring_includes_correct_encoding(self):
|
||||||
soup = self.soup("<root/>")
|
soup = self.soup("<root/>")
|
||||||
|
@ -676,17 +892,17 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
def test_closing_namespaced_tag(self):
|
def test_closing_namespaced_tag(self):
|
||||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.p), markup)
|
self.assertEqual(str(soup.p), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes(self):
|
def test_namespaced_attributes(self):
|
||||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
def test_namespaced_attributes_xml_namespace(self):
|
def test_namespaced_attributes_xml_namespace(self):
|
||||||
markup = '<foo xml:lang="fr">bar</foo>'
|
markup = '<foo xml:lang="fr">bar</foo>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(str(soup.foo), markup)
|
||||||
|
|
||||||
def test_find_by_prefixed_name(self):
|
def test_find_by_prefixed_name(self):
|
||||||
doc = """<?xml version="1.0" encoding="utf-8"?>
|
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
@ -721,6 +937,12 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
# The two tags have the same namespace prefix.
|
# The two tags have the same namespace prefix.
|
||||||
self.assertEqual(tag.prefix, duplicate.prefix)
|
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||||
|
|
||||||
|
def test_worst_case(self):
|
||||||
|
"""Test the worst case (currently) for linking issues."""
|
||||||
|
|
||||||
|
soup = self.soup(BAD_DOCUMENT)
|
||||||
|
self.linkage_validator(soup)
|
||||||
|
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
"""Smoke test for a tree builder that supports HTML5."""
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
|
@ -5,7 +5,7 @@ import warnings
|
||||||
try:
|
try:
|
||||||
from bs4.builder import HTML5TreeBuilder
|
from bs4.builder import HTML5TreeBuilder
|
||||||
HTML5LIB_PRESENT = True
|
HTML5LIB_PRESENT = True
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
HTML5LIB_PRESENT = False
|
HTML5LIB_PRESENT = False
|
||||||
from bs4.element import SoupStrainer
|
from bs4.element import SoupStrainer
|
||||||
from bs4.testing import (
|
from bs4.testing import (
|
||||||
|
@ -22,7 +22,7 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return HTML5TreeBuilder()
|
return HTML5TreeBuilder
|
||||||
|
|
||||||
def test_soupstrainer(self):
|
def test_soupstrainer(self):
|
||||||
# The html5lib tree builder does not support SoupStrainers.
|
# The html5lib tree builder does not support SoupStrainers.
|
||||||
|
@ -74,14 +74,14 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
def test_reparented_markup(self):
|
def test_reparented_markup(self):
|
||||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||||
self.assertEqual(2, len(soup.find_all('p')))
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
|
|
||||||
def test_reparented_markup_ends_with_whitespace(self):
|
def test_reparented_markup_ends_with_whitespace(self):
|
||||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||||
self.assertEqual(2, len(soup.find_all('p')))
|
self.assertEqual(2, len(soup.find_all('p')))
|
||||||
|
|
||||||
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||||
|
@ -127,4 +127,44 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
def test_foster_parenting(self):
|
def test_foster_parenting(self):
|
||||||
markup = b"""<table><td></tbody>A"""
|
markup = b"""<table><td></tbody>A"""
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||||
|
|
||||||
|
def test_extraction(self):
|
||||||
|
"""
|
||||||
|
Test that extraction does not destroy the tree.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html><head></head>
|
||||||
|
<style>
|
||||||
|
</style><script></script><body><p>hello</p></body></html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
[s.extract() for s in soup('script')]
|
||||||
|
[s.extract() for s in soup('style')]
|
||||||
|
|
||||||
|
self.assertEqual(len(soup.find_all("p")), 1)
|
||||||
|
|
||||||
|
def test_empty_comment(self):
|
||||||
|
"""
|
||||||
|
Test that empty comment does not break structure.
|
||||||
|
|
||||||
|
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
|
||||||
|
"""
|
||||||
|
|
||||||
|
markup = """
|
||||||
|
<html>
|
||||||
|
<body>
|
||||||
|
<form>
|
||||||
|
<!----><input type="text">
|
||||||
|
</form>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
inputs = []
|
||||||
|
for form in soup.find_all('form'):
|
||||||
|
inputs.extend(form.find_all('input'))
|
||||||
|
self.assertEqual(len(inputs), 1)
|
||||||
|
|
|
@ -5,12 +5,11 @@ from pdb import set_trace
|
||||||
import pickle
|
import pickle
|
||||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||||
from bs4.builder import HTMLParserTreeBuilder
|
from bs4.builder import HTMLParserTreeBuilder
|
||||||
|
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||||
|
|
||||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
@property
|
default_builder = HTMLParserTreeBuilder
|
||||||
def default_builder(self):
|
|
||||||
return HTMLParserTreeBuilder()
|
|
||||||
|
|
||||||
def test_namespaced_system_doctype(self):
|
def test_namespaced_system_doctype(self):
|
||||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||||
|
@ -32,3 +31,17 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
def test_redundant_empty_element_closing_tags(self):
|
def test_redundant_empty_element_closing_tags(self):
|
||||||
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||||
self.assertSoupEquals('</br></br></br>', "")
|
self.assertSoupEquals('</br></br></br>', "")
|
||||||
|
|
||||||
|
def test_empty_element(self):
|
||||||
|
# This verifies that any buffered data present when the parser
|
||||||
|
# finishes working is handled.
|
||||||
|
self.assertSoupEquals("foo &# bar", "foo &# bar")
|
||||||
|
|
||||||
|
|
||||||
|
class TestHTMLParserSubclass(SoupTest):
|
||||||
|
def test_error(self):
|
||||||
|
"""Verify that our HTMLParser subclass implements error() in a way
|
||||||
|
that doesn't cause a crash.
|
||||||
|
"""
|
||||||
|
parser = BeautifulSoupHTMLParser()
|
||||||
|
parser.error("don't crash")
|
||||||
|
|
|
@ -7,7 +7,7 @@ try:
|
||||||
import lxml.etree
|
import lxml.etree
|
||||||
LXML_PRESENT = True
|
LXML_PRESENT = True
|
||||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
LXML_PRESENT = False
|
LXML_PRESENT = False
|
||||||
LXML_VERSION = (0,)
|
LXML_VERSION = (0,)
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return LXMLTreeBuilder()
|
return LXMLTreeBuilder
|
||||||
|
|
||||||
def test_out_of_range_entity(self):
|
def test_out_of_range_entity(self):
|
||||||
self.assertSoupEquals(
|
self.assertSoupEquals(
|
||||||
|
@ -46,6 +46,12 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
self.assertSoupEquals(
|
self.assertSoupEquals(
|
||||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||||
|
|
||||||
|
def test_entities_in_foreign_document_encoding(self):
|
||||||
|
# We can't implement this case correctly because by the time we
|
||||||
|
# hear about markup like "“", it's been (incorrectly) converted into
|
||||||
|
# a string like u'\x93'
|
||||||
|
pass
|
||||||
|
|
||||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||||
# test if an old version of lxml is installed.
|
# test if an old version of lxml is installed.
|
||||||
|
|
||||||
|
@ -62,7 +68,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
# if one is installed.
|
# if one is installed.
|
||||||
with warnings.catch_warnings(record=True) as w:
|
with warnings.catch_warnings(record=True) as w:
|
||||||
soup = BeautifulStoneSoup("<b />")
|
soup = BeautifulStoneSoup("<b />")
|
||||||
self.assertEqual(u"<b/>", unicode(soup.b))
|
self.assertEqual("<b/>", str(soup.b))
|
||||||
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||||
|
|
||||||
@skipIf(
|
@skipIf(
|
||||||
|
@ -73,4 +79,22 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def default_builder(self):
|
def default_builder(self):
|
||||||
return LXMLTreeBuilderForXML()
|
return LXMLTreeBuilderForXML
|
||||||
|
|
||||||
|
def test_namespace_indexing(self):
|
||||||
|
# We should not track un-prefixed namespaces as we can only hold one
|
||||||
|
# and it will be recognized as the default namespace by soupsieve,
|
||||||
|
# which may be confusing in some situations. When no namespace is provided
|
||||||
|
# for a selector, the default namespace (if defined) is assumed.
|
||||||
|
|
||||||
|
soup = self.soup(
|
||||||
|
'<?xml version="1.1"?>\n'
|
||||||
|
'<root>'
|
||||||
|
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
|
||||||
|
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
|
||||||
|
'</root>'
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
soup._namespaces,
|
||||||
|
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
|
||||||
|
)
|
||||||
|
|
|
@ -24,6 +24,7 @@ from bs4.dammit import (
|
||||||
EncodingDetector,
|
EncodingDetector,
|
||||||
)
|
)
|
||||||
from bs4.testing import (
|
from bs4.testing import (
|
||||||
|
default_builder,
|
||||||
SoupTest,
|
SoupTest,
|
||||||
skipIf,
|
skipIf,
|
||||||
)
|
)
|
||||||
|
@ -32,7 +33,7 @@ import warnings
|
||||||
try:
|
try:
|
||||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||||
LXML_PRESENT = True
|
LXML_PRESENT = True
|
||||||
except ImportError, e:
|
except ImportError as e:
|
||||||
LXML_PRESENT = False
|
LXML_PRESENT = False
|
||||||
|
|
||||||
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||||
|
@ -40,21 +41,86 @@ PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||||
class TestConstructor(SoupTest):
|
class TestConstructor(SoupTest):
|
||||||
|
|
||||||
def test_short_unicode_input(self):
|
def test_short_unicode_input(self):
|
||||||
data = u"<h1>éé</h1>"
|
data = "<h1>éé</h1>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
self.assertEqual(u"éé", soup.h1.string)
|
self.assertEqual("éé", soup.h1.string)
|
||||||
|
|
||||||
def test_embedded_null(self):
|
def test_embedded_null(self):
|
||||||
data = u"<h1>foo\0bar</h1>"
|
data = "<h1>foo\0bar</h1>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
self.assertEqual(u"foo\0bar", soup.h1.string)
|
self.assertEqual("foo\0bar", soup.h1.string)
|
||||||
|
|
||||||
def test_exclude_encodings(self):
|
def test_exclude_encodings(self):
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||||
self.assertEqual("windows-1252", soup.original_encoding)
|
self.assertEqual("windows-1252", soup.original_encoding)
|
||||||
|
|
||||||
|
def test_custom_builder_class(self):
|
||||||
|
# Verify that you can pass in a custom Builder class and
|
||||||
|
# it'll be instantiated with the appropriate keyword arguments.
|
||||||
|
class Mock(object):
|
||||||
|
def __init__(self, **kwargs):
|
||||||
|
self.called_with = kwargs
|
||||||
|
self.is_xml = True
|
||||||
|
def initialize_soup(self, soup):
|
||||||
|
pass
|
||||||
|
def prepare_markup(self, *args, **kwargs):
|
||||||
|
return ''
|
||||||
|
|
||||||
|
kwargs = dict(
|
||||||
|
var="value",
|
||||||
|
# This is a deprecated BS3-era keyword argument, which
|
||||||
|
# will be stripped out.
|
||||||
|
convertEntities=True,
|
||||||
|
)
|
||||||
|
with warnings.catch_warnings(record=True):
|
||||||
|
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
||||||
|
assert isinstance(soup.builder, Mock)
|
||||||
|
self.assertEqual(dict(var="value"), soup.builder.called_with)
|
||||||
|
|
||||||
|
# You can also instantiate the TreeBuilder yourself. In this
|
||||||
|
# case, that specific object is used and any keyword arguments
|
||||||
|
# to the BeautifulSoup constructor are ignored.
|
||||||
|
builder = Mock(**kwargs)
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
soup = BeautifulSoup(
|
||||||
|
'', builder=builder, ignored_value=True,
|
||||||
|
)
|
||||||
|
msg = str(w[0].message)
|
||||||
|
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
||||||
|
self.assertEqual(builder, soup.builder)
|
||||||
|
self.assertEqual(kwargs, builder.called_with)
|
||||||
|
|
||||||
|
def test_cdata_list_attributes(self):
|
||||||
|
# Most attribute values are represented as scalars, but the
|
||||||
|
# HTML standard says that some attributes, like 'class' have
|
||||||
|
# space-separated lists as values.
|
||||||
|
markup = '<a id=" an id " class=" a class "></a>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
|
||||||
|
# Note that the spaces are stripped for 'class' but not for 'id'.
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(" an id ", a['id'])
|
||||||
|
self.assertEqual(["a", "class"], a['class'])
|
||||||
|
|
||||||
|
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
||||||
|
# you customize or disable this. As always, you can customize the TreeBuilder
|
||||||
|
# by passing in a keyword argument to the BeautifulSoup constructor.
|
||||||
|
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
||||||
|
self.assertEqual(" a class ", soup.a['class'])
|
||||||
|
|
||||||
|
# Here are two ways of saying that `id` is a multi-valued
|
||||||
|
# attribute in this context, but 'class' is not.
|
||||||
|
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
|
||||||
|
with warnings.catch_warnings(record=True) as w:
|
||||||
|
# This will create a warning about not explicitly
|
||||||
|
# specifying a parser, but we'll ignore it.
|
||||||
|
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
||||||
|
a = soup.a
|
||||||
|
self.assertEqual(["an", "id"], a['id'])
|
||||||
|
self.assertEqual(" a class ", a['class'])
|
||||||
|
|
||||||
|
|
||||||
class TestWarnings(SoupTest):
|
class TestWarnings(SoupTest):
|
||||||
|
|
||||||
def _no_parser_specified(self, s, is_there=True):
|
def _no_parser_specified(self, s, is_there=True):
|
||||||
|
@ -129,7 +195,7 @@ class TestWarnings(SoupTest):
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
# note - this url must differ from the bytes one otherwise
|
# note - this url must differ from the bytes one otherwise
|
||||||
# python's warnings system swallows the second warning
|
# python's warnings system swallows the second warning
|
||||||
soup = self.soup(u"http://www.crummyunicode.com/")
|
soup = self.soup("http://www.crummyunicode.com/")
|
||||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||||
for w in warning_list))
|
for w in warning_list))
|
||||||
|
|
||||||
|
@ -141,7 +207,7 @@ class TestWarnings(SoupTest):
|
||||||
|
|
||||||
def test_url_warning_with_unicode_and_space(self):
|
def test_url_warning_with_unicode_and_space(self):
|
||||||
with warnings.catch_warnings(record=True) as warning_list:
|
with warnings.catch_warnings(record=True) as warning_list:
|
||||||
soup = self.soup(u"http://www.crummyuncode.com/ is great")
|
soup = self.soup("http://www.crummyuncode.com/ is great")
|
||||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||||
for w in warning_list))
|
for w in warning_list))
|
||||||
|
|
||||||
|
@ -163,9 +229,9 @@ class TestEntitySubstitution(unittest.TestCase):
|
||||||
def test_simple_html_substitution(self):
|
def test_simple_html_substitution(self):
|
||||||
# Unicode characters corresponding to named HTML entites
|
# Unicode characters corresponding to named HTML entites
|
||||||
# are substituted, and no others.
|
# are substituted, and no others.
|
||||||
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
|
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
|
||||||
self.assertEqual(self.sub.substitute_html(s),
|
self.assertEqual(self.sub.substitute_html(s),
|
||||||
u"foo∀\N{SNOWMAN}õbar")
|
"foo∀\N{SNOWMAN}õbar")
|
||||||
|
|
||||||
def test_smart_quote_substitution(self):
|
def test_smart_quote_substitution(self):
|
||||||
# MS smart quotes are a common source of frustration, so we
|
# MS smart quotes are a common source of frustration, so we
|
||||||
|
@ -217,7 +283,7 @@ class TestEntitySubstitution(unittest.TestCase):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||||
"ÁT&T")
|
"ÁT&T")
|
||||||
|
|
||||||
def test_quotes_not_html_substituted(self):
|
def test_quotes_not_html_substituted(self):
|
||||||
"""There's no need to do this except inside attribute values."""
|
"""There's no need to do this except inside attribute values."""
|
||||||
text = 'Bob\'s "bar"'
|
text = 'Bob\'s "bar"'
|
||||||
|
@ -230,7 +296,7 @@ class TestEncodingConversion(SoupTest):
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(TestEncodingConversion, self).setUp()
|
super(TestEncodingConversion, self).setUp()
|
||||||
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||||
self.utf8_data = self.unicode_data.encode("utf-8")
|
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||||
# Just so you know what it looks like.
|
# Just so you know what it looks like.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
|
@ -250,7 +316,7 @@ class TestEncodingConversion(SoupTest):
|
||||||
ascii = b"<foo>a</foo>"
|
ascii = b"<foo>a</foo>"
|
||||||
soup_from_ascii = self.soup(ascii)
|
soup_from_ascii = self.soup(ascii)
|
||||||
unicode_output = soup_from_ascii.decode()
|
unicode_output = soup_from_ascii.decode()
|
||||||
self.assertTrue(isinstance(unicode_output, unicode))
|
self.assertTrue(isinstance(unicode_output, str))
|
||||||
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
||||||
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
||||||
finally:
|
finally:
|
||||||
|
@ -262,7 +328,7 @@ class TestEncodingConversion(SoupTest):
|
||||||
# is not set.
|
# is not set.
|
||||||
soup_from_unicode = self.soup(self.unicode_data)
|
soup_from_unicode = self.soup(self.unicode_data)
|
||||||
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
||||||
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
|
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
|
||||||
self.assertEqual(soup_from_unicode.original_encoding, None)
|
self.assertEqual(soup_from_unicode.original_encoding, None)
|
||||||
|
|
||||||
def test_utf8_in_unicode_out(self):
|
def test_utf8_in_unicode_out(self):
|
||||||
|
@ -270,7 +336,7 @@ class TestEncodingConversion(SoupTest):
|
||||||
# attribute is set.
|
# attribute is set.
|
||||||
soup_from_utf8 = self.soup(self.utf8_data)
|
soup_from_utf8 = self.soup(self.utf8_data)
|
||||||
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
||||||
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
|
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
|
||||||
|
|
||||||
def test_utf8_out(self):
|
def test_utf8_out(self):
|
||||||
# The internal data structures can be encoded as UTF-8.
|
# The internal data structures can be encoded as UTF-8.
|
||||||
|
@ -281,14 +347,14 @@ class TestEncodingConversion(SoupTest):
|
||||||
PYTHON_3_PRE_3_2,
|
PYTHON_3_PRE_3_2,
|
||||||
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
||||||
def test_attribute_name_containing_unicode_characters(self):
|
def test_attribute_name_containing_unicode_characters(self):
|
||||||
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
|
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||||
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||||
|
|
||||||
class TestUnicodeDammit(unittest.TestCase):
|
class TestUnicodeDammit(unittest.TestCase):
|
||||||
"""Standalone tests of UnicodeDammit."""
|
"""Standalone tests of UnicodeDammit."""
|
||||||
|
|
||||||
def test_unicode_input(self):
|
def test_unicode_input(self):
|
||||||
markup = u"I'm already Unicode! \N{SNOWMAN}"
|
markup = "I'm already Unicode! \N{SNOWMAN}"
|
||||||
dammit = UnicodeDammit(markup)
|
dammit = UnicodeDammit(markup)
|
||||||
self.assertEqual(dammit.unicode_markup, markup)
|
self.assertEqual(dammit.unicode_markup, markup)
|
||||||
|
|
||||||
|
@ -296,7 +362,7 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
dammit = UnicodeDammit(markup)
|
dammit = UnicodeDammit(markup)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
|
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
|
||||||
|
|
||||||
def test_smart_quotes_to_xml_entities(self):
|
def test_smart_quotes_to_xml_entities(self):
|
||||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||||
|
@ -320,14 +386,14 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
||||||
dammit = UnicodeDammit(utf8)
|
dammit = UnicodeDammit(utf8)
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||||
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
|
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
|
||||||
|
|
||||||
|
|
||||||
def test_convert_hebrew(self):
|
def test_convert_hebrew(self):
|
||||||
hebrew = b"\xed\xe5\xec\xf9"
|
hebrew = b"\xed\xe5\xec\xf9"
|
||||||
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
||||||
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
|
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
|
||||||
|
|
||||||
def test_dont_see_smart_quotes_where_there_are_none(self):
|
def test_dont_see_smart_quotes_where_there_are_none(self):
|
||||||
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
||||||
|
@ -336,19 +402,19 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
||||||
|
|
||||||
def test_ignore_inappropriate_codecs(self):
|
def test_ignore_inappropriate_codecs(self):
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||||
|
|
||||||
def test_ignore_invalid_codecs(self):
|
def test_ignore_invalid_codecs(self):
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
||||||
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
||||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||||
|
|
||||||
def test_exclude_encodings(self):
|
def test_exclude_encodings(self):
|
||||||
# This is UTF-8.
|
# This is UTF-8.
|
||||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
utf8_data = "Räksmörgås".encode("utf-8")
|
||||||
|
|
||||||
# But if we exclude UTF-8 from consideration, the guess is
|
# But if we exclude UTF-8 from consideration, the guess is
|
||||||
# Windows-1252.
|
# Windows-1252.
|
||||||
|
@ -364,7 +430,7 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
detected = EncodingDetector(
|
detected = EncodingDetector(
|
||||||
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
|
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
|
||||||
encodings = list(detected.encodings)
|
encodings = list(detected.encodings)
|
||||||
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
|
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
|
||||||
|
|
||||||
def test_detect_html5_style_meta_tag(self):
|
def test_detect_html5_style_meta_tag(self):
|
||||||
|
|
||||||
|
@ -404,7 +470,7 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
bs4.dammit.chardet_dammit = noop
|
bs4.dammit.chardet_dammit = noop
|
||||||
dammit = UnicodeDammit(doc)
|
dammit = UnicodeDammit(doc)
|
||||||
self.assertEqual(True, dammit.contains_replacement_characters)
|
self.assertEqual(True, dammit.contains_replacement_characters)
|
||||||
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
|
self.assertTrue("\ufffd" in dammit.unicode_markup)
|
||||||
|
|
||||||
soup = BeautifulSoup(doc, "html.parser")
|
soup = BeautifulSoup(doc, "html.parser")
|
||||||
self.assertTrue(soup.contains_replacement_characters)
|
self.assertTrue(soup.contains_replacement_characters)
|
||||||
|
@ -416,17 +482,17 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
# A document written in UTF-16LE will have its byte order marker stripped.
|
# A document written in UTF-16LE will have its byte order marker stripped.
|
||||||
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
||||||
dammit = UnicodeDammit(data)
|
dammit = UnicodeDammit(data)
|
||||||
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
|
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
|
||||||
self.assertEqual("utf-16le", dammit.original_encoding)
|
self.assertEqual("utf-16le", dammit.original_encoding)
|
||||||
|
|
||||||
def test_detwingle(self):
|
def test_detwingle(self):
|
||||||
# Here's a UTF8 document.
|
# Here's a UTF8 document.
|
||||||
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
|
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
|
||||||
|
|
||||||
# Here's a Windows-1252 document.
|
# Here's a Windows-1252 document.
|
||||||
windows_1252 = (
|
windows_1252 = (
|
||||||
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
||||||
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
||||||
|
|
||||||
# Through some unholy alchemy, they've been stuck together.
|
# Through some unholy alchemy, they've been stuck together.
|
||||||
doc = utf8 + windows_1252 + utf8
|
doc = utf8 + windows_1252 + utf8
|
||||||
|
@ -441,7 +507,7 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
|
|
||||||
fixed = UnicodeDammit.detwingle(doc)
|
fixed = UnicodeDammit.detwingle(doc)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
||||||
|
|
||||||
def test_detwingle_ignores_multibyte_characters(self):
|
def test_detwingle_ignores_multibyte_characters(self):
|
||||||
# Each of these characters has a UTF-8 representation ending
|
# Each of these characters has a UTF-8 representation ending
|
||||||
|
@ -449,9 +515,9 @@ class TestUnicodeDammit(unittest.TestCase):
|
||||||
# Windows-1252. But our code knows to skip over multibyte
|
# Windows-1252. But our code knows to skip over multibyte
|
||||||
# UTF-8 characters, so they'll survive the process unscathed.
|
# UTF-8 characters, so they'll survive the process unscathed.
|
||||||
for tricky_unicode_char in (
|
for tricky_unicode_char in (
|
||||||
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
||||||
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
||||||
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
||||||
):
|
):
|
||||||
input = tricky_unicode_char.encode("utf8")
|
input = tricky_unicode_char.encode("utf8")
|
||||||
self.assertTrue(input.endswith(b'\x93'))
|
self.assertTrue(input.endswith(b'\x93'))
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""Tests for Beautiful Soup's tree traversal methods.
|
"""Tests for Beautiful Soup's tree traversal methods.
|
||||||
|
|
||||||
|
@ -26,6 +25,7 @@ from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
Declaration,
|
Declaration,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
Formatter,
|
||||||
NavigableString,
|
NavigableString,
|
||||||
SoupStrainer,
|
SoupStrainer,
|
||||||
Tag,
|
Tag,
|
||||||
|
@ -71,13 +71,13 @@ class TestFind(TreeTest):
|
||||||
self.assertEqual(soup.find("b").string, "2")
|
self.assertEqual(soup.find("b").string, "2")
|
||||||
|
|
||||||
def test_unicode_text_find(self):
|
def test_unicode_text_find(self):
|
||||||
soup = self.soup(u'<h1>Räksmörgås</h1>')
|
soup = self.soup('<h1>Räksmörgås</h1>')
|
||||||
self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
|
self.assertEqual(soup.find(string='Räksmörgås'), 'Räksmörgås')
|
||||||
|
|
||||||
def test_unicode_attribute_find(self):
|
def test_unicode_attribute_find(self):
|
||||||
soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
|
soup = self.soup('<h1 id="Räksmörgås">here it is</h1>')
|
||||||
str(soup)
|
str(soup)
|
||||||
self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
|
self.assertEqual("here it is", soup.find(id='Räksmörgås').text)
|
||||||
|
|
||||||
|
|
||||||
def test_find_everything(self):
|
def test_find_everything(self):
|
||||||
|
@ -97,17 +97,17 @@ class TestFindAll(TreeTest):
|
||||||
"""You can search the tree for text nodes."""
|
"""You can search the tree for text nodes."""
|
||||||
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
|
soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
|
||||||
# Exact match.
|
# Exact match.
|
||||||
self.assertEqual(soup.find_all(string="bar"), [u"bar"])
|
self.assertEqual(soup.find_all(string="bar"), ["bar"])
|
||||||
self.assertEqual(soup.find_all(text="bar"), [u"bar"])
|
self.assertEqual(soup.find_all(text="bar"), ["bar"])
|
||||||
# Match any of a number of strings.
|
# Match any of a number of strings.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.find_all(text=["Foo", "bar"]), [u"Foo", u"bar"])
|
soup.find_all(text=["Foo", "bar"]), ["Foo", "bar"])
|
||||||
# Match a regular expression.
|
# Match a regular expression.
|
||||||
self.assertEqual(soup.find_all(text=re.compile('.*')),
|
self.assertEqual(soup.find_all(text=re.compile('.*')),
|
||||||
[u"Foo", u"bar", u'\xbb'])
|
["Foo", "bar", '\xbb'])
|
||||||
# Match anything.
|
# Match anything.
|
||||||
self.assertEqual(soup.find_all(text=True),
|
self.assertEqual(soup.find_all(text=True),
|
||||||
[u"Foo", u"bar", u'\xbb'])
|
["Foo", "bar", '\xbb'])
|
||||||
|
|
||||||
def test_find_all_limit(self):
|
def test_find_all_limit(self):
|
||||||
"""You can limit the number of items returned by find_all."""
|
"""You can limit the number of items returned by find_all."""
|
||||||
|
@ -250,8 +250,8 @@ class TestFindAllByAttribute(TreeTest):
|
||||||
["Matching a.", "Matching b."])
|
["Matching a.", "Matching b."])
|
||||||
|
|
||||||
def test_find_all_by_utf8_attribute_value(self):
|
def test_find_all_by_utf8_attribute_value(self):
|
||||||
peace = u"םולש".encode("utf8")
|
peace = "םולש".encode("utf8")
|
||||||
data = u'<a title="םולש"></a>'.encode("utf8")
|
data = '<a title="םולש"></a>'.encode("utf8")
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
self.assertEqual([soup.a], soup.find_all(title=peace))
|
self.assertEqual([soup.a], soup.find_all(title=peace))
|
||||||
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
|
self.assertEqual([soup.a], soup.find_all(title=peace.decode("utf8")))
|
||||||
|
@ -417,6 +417,48 @@ class TestFindAllByAttribute(TreeTest):
|
||||||
self.assertEqual([], soup.find_all(id=1, text="bar"))
|
self.assertEqual([], soup.find_all(id=1, text="bar"))
|
||||||
|
|
||||||
|
|
||||||
|
class TestSmooth(TreeTest):
|
||||||
|
"""Test Tag.smooth."""
|
||||||
|
|
||||||
|
def test_smooth(self):
|
||||||
|
soup = self.soup("<div>a</div>")
|
||||||
|
div = soup.div
|
||||||
|
div.append("b")
|
||||||
|
div.append("c")
|
||||||
|
div.append(Comment("Comment 1"))
|
||||||
|
div.append(Comment("Comment 2"))
|
||||||
|
div.append("d")
|
||||||
|
builder = self.default_builder()
|
||||||
|
span = Tag(soup, builder, 'span')
|
||||||
|
span.append('1')
|
||||||
|
span.append('2')
|
||||||
|
div.append(span)
|
||||||
|
|
||||||
|
# At this point the tree has a bunch of adjacent
|
||||||
|
# NavigableStrings. This is normal, but it has no meaning in
|
||||||
|
# terms of HTML, so we may want to smooth things out for
|
||||||
|
# output.
|
||||||
|
|
||||||
|
# Since the <span> tag has two children, its .string is None.
|
||||||
|
self.assertEqual(None, div.span.string)
|
||||||
|
|
||||||
|
self.assertEqual(7, len(div.contents))
|
||||||
|
div.smooth()
|
||||||
|
self.assertEqual(5, len(div.contents))
|
||||||
|
|
||||||
|
# The three strings at the beginning of div.contents have been
|
||||||
|
# merged into on string.
|
||||||
|
#
|
||||||
|
self.assertEqual('abc', div.contents[0])
|
||||||
|
|
||||||
|
# The call is recursive -- the <span> tag was also smoothed.
|
||||||
|
self.assertEqual('12', div.span.string)
|
||||||
|
|
||||||
|
# The two comments have _not_ been merged, even though
|
||||||
|
# comments are strings. Merging comments would change the
|
||||||
|
# meaning of the HTML.
|
||||||
|
self.assertEqual('Comment 1', div.contents[1])
|
||||||
|
self.assertEqual('Comment 2', div.contents[2])
|
||||||
|
|
||||||
|
|
||||||
class TestIndex(TreeTest):
|
class TestIndex(TreeTest):
|
||||||
|
@ -605,7 +647,7 @@ class SiblingTest(TreeTest):
|
||||||
</html>'''
|
</html>'''
|
||||||
# All that whitespace looks good but makes the tests more
|
# All that whitespace looks good but makes the tests more
|
||||||
# difficult. Get rid of it.
|
# difficult. Get rid of it.
|
||||||
markup = re.compile("\n\s*").sub("", markup)
|
markup = re.compile(r"\n\s*").sub("", markup)
|
||||||
self.tree = self.soup(markup)
|
self.tree = self.soup(markup)
|
||||||
|
|
||||||
|
|
||||||
|
@ -703,12 +745,12 @@ class TestTagCreation(SoupTest):
|
||||||
"""Test the ability to create new tags."""
|
"""Test the ability to create new tags."""
|
||||||
def test_new_tag(self):
|
def test_new_tag(self):
|
||||||
soup = self.soup("")
|
soup = self.soup("")
|
||||||
new_tag = soup.new_tag("foo", bar="baz")
|
new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
|
||||||
self.assertTrue(isinstance(new_tag, Tag))
|
self.assertTrue(isinstance(new_tag, Tag))
|
||||||
self.assertEqual("foo", new_tag.name)
|
self.assertEqual("foo", new_tag.name)
|
||||||
self.assertEqual(dict(bar="baz"), new_tag.attrs)
|
self.assertEqual(dict(bar="baz", name="a name"), new_tag.attrs)
|
||||||
self.assertEqual(None, new_tag.parent)
|
self.assertEqual(None, new_tag.parent)
|
||||||
|
|
||||||
def test_tag_inherits_self_closing_rules_from_builder(self):
|
def test_tag_inherits_self_closing_rules_from_builder(self):
|
||||||
if XML_BUILDER_PRESENT:
|
if XML_BUILDER_PRESENT:
|
||||||
xml_soup = BeautifulSoup("", "lxml-xml")
|
xml_soup = BeautifulSoup("", "lxml-xml")
|
||||||
|
@ -821,6 +863,26 @@ class TestTreeModification(SoupTest):
|
||||||
soup = self.soup(text)
|
soup = self.soup(text)
|
||||||
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
|
self.assertRaises(ValueError, soup.a.insert, 0, soup.a)
|
||||||
|
|
||||||
|
def test_insert_beautifulsoup_object_inserts_children(self):
|
||||||
|
"""Inserting one BeautifulSoup object into another actually inserts all
|
||||||
|
of its children -- you'll never combine BeautifulSoup objects.
|
||||||
|
"""
|
||||||
|
soup = self.soup("<p>And now, a word:</p><p>And we're back.</p>")
|
||||||
|
|
||||||
|
text = "<p>p2</p><p>p3</p>"
|
||||||
|
to_insert = self.soup(text)
|
||||||
|
soup.insert(1, to_insert)
|
||||||
|
|
||||||
|
for i in soup.descendants:
|
||||||
|
assert not isinstance(i, BeautifulSoup)
|
||||||
|
|
||||||
|
p1, p2, p3, p4 = list(soup.children)
|
||||||
|
self.assertEqual("And now, a word:", p1.string)
|
||||||
|
self.assertEqual("p2", p2.string)
|
||||||
|
self.assertEqual("p3", p3.string)
|
||||||
|
self.assertEqual("And we're back.", p4.string)
|
||||||
|
|
||||||
|
|
||||||
def test_replace_with_maintains_next_element_throughout(self):
|
def test_replace_with_maintains_next_element_throughout(self):
|
||||||
soup = self.soup('<p><a>one</a><b>three</b></p>')
|
soup = self.soup('<p><a>one</a><b>three</b></p>')
|
||||||
a = soup.a
|
a = soup.a
|
||||||
|
@ -877,7 +939,7 @@ class TestTreeModification(SoupTest):
|
||||||
self.assertEqual(soup.a.contents[0].next_element, "bar")
|
self.assertEqual(soup.a.contents[0].next_element, "bar")
|
||||||
|
|
||||||
def test_insert_tag(self):
|
def test_insert_tag(self):
|
||||||
builder = self.default_builder
|
builder = self.default_builder()
|
||||||
soup = self.soup(
|
soup = self.soup(
|
||||||
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
|
"<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
|
||||||
magic_tag = Tag(soup, builder, 'magictag')
|
magic_tag = Tag(soup, builder, 'magictag')
|
||||||
|
@ -912,6 +974,13 @@ class TestTreeModification(SoupTest):
|
||||||
soup.a.append(soup.b)
|
soup.a.append(soup.b)
|
||||||
self.assertEqual(data, soup.decode())
|
self.assertEqual(data, soup.decode())
|
||||||
|
|
||||||
|
def test_extend(self):
|
||||||
|
data = "<a><b><c><d><e><f><g></g></f></e></d></c></b></a>"
|
||||||
|
soup = self.soup(data)
|
||||||
|
l = [soup.g, soup.f, soup.e, soup.d, soup.c, soup.b]
|
||||||
|
soup.a.extend(l)
|
||||||
|
self.assertEqual("<a><g></g><f></f><e></e><d></d><c></c><b></b></a>", soup.decode())
|
||||||
|
|
||||||
def test_move_tag_to_beginning_of_parent(self):
|
def test_move_tag_to_beginning_of_parent(self):
|
||||||
data = "<a><b></b><c></c><d></d></a>"
|
data = "<a><b></b><c></c><d></d></a>"
|
||||||
soup = self.soup(data)
|
soup = self.soup(data)
|
||||||
|
@ -938,6 +1007,29 @@ class TestTreeModification(SoupTest):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
|
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
|
||||||
|
|
||||||
|
# Can't insert an element before itself.
|
||||||
|
b = soup.b
|
||||||
|
self.assertRaises(ValueError, b.insert_before, b)
|
||||||
|
|
||||||
|
# Can't insert before if an element has no parent.
|
||||||
|
b.extract()
|
||||||
|
self.assertRaises(ValueError, b.insert_before, "nope")
|
||||||
|
|
||||||
|
# Can insert an identical element
|
||||||
|
soup = self.soup("<a>")
|
||||||
|
soup.a.insert_before(soup.new_tag("a"))
|
||||||
|
|
||||||
|
def test_insert_multiple_before(self):
|
||||||
|
soup = self.soup("<a>foo</a><b>bar</b>")
|
||||||
|
soup.b.insert_before("BAZ", " ", "QUUX")
|
||||||
|
soup.a.insert_before("QUUX", " ", "BAZ")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for("QUUX BAZ<a>foo</a>BAZ QUUX<b>bar</b>"))
|
||||||
|
|
||||||
|
soup.a.insert_before(soup.b, "FOO")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for("QUUX BAZ<b>bar</b>FOO<a>foo</a>BAZ QUUX"))
|
||||||
|
|
||||||
def test_insert_after(self):
|
def test_insert_after(self):
|
||||||
soup = self.soup("<a>foo</a><b>bar</b>")
|
soup = self.soup("<a>foo</a><b>bar</b>")
|
||||||
soup.b.insert_after("BAZ")
|
soup.b.insert_after("BAZ")
|
||||||
|
@ -948,6 +1040,28 @@ class TestTreeModification(SoupTest):
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
|
soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))
|
||||||
|
|
||||||
|
# Can't insert an element after itself.
|
||||||
|
b = soup.b
|
||||||
|
self.assertRaises(ValueError, b.insert_after, b)
|
||||||
|
|
||||||
|
# Can't insert after if an element has no parent.
|
||||||
|
b.extract()
|
||||||
|
self.assertRaises(ValueError, b.insert_after, "nope")
|
||||||
|
|
||||||
|
# Can insert an identical element
|
||||||
|
soup = self.soup("<a>")
|
||||||
|
soup.a.insert_before(soup.new_tag("a"))
|
||||||
|
|
||||||
|
def test_insert_multiple_after(self):
|
||||||
|
soup = self.soup("<a>foo</a><b>bar</b>")
|
||||||
|
soup.b.insert_after("BAZ", " ", "QUUX")
|
||||||
|
soup.a.insert_after("QUUX", " ", "BAZ")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for("<a>foo</a>QUUX BAZ<b>bar</b>BAZ QUUX"))
|
||||||
|
soup.b.insert_after(soup.a, "FOO ")
|
||||||
|
self.assertEqual(
|
||||||
|
soup.decode(), self.document_for("QUUX BAZ<b>bar</b><a>foo</a>FOO BAZ QUUX"))
|
||||||
|
|
||||||
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
|
def test_insert_after_raises_exception_if_after_has_no_meaning(self):
|
||||||
soup = self.soup("")
|
soup = self.soup("")
|
||||||
tag = soup.new_tag("a")
|
tag = soup.new_tag("a")
|
||||||
|
@ -1111,7 +1225,7 @@ class TestTreeModification(SoupTest):
|
||||||
<script>baz</script>
|
<script>baz</script>
|
||||||
</html>""")
|
</html>""")
|
||||||
[soup.script.extract() for i in soup.find_all("script")]
|
[soup.script.extract() for i in soup.find_all("script")]
|
||||||
self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
|
self.assertEqual("<body>\n\n<a></a>\n</body>", str(soup.body))
|
||||||
|
|
||||||
|
|
||||||
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
|
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
|
||||||
|
@ -1186,7 +1300,7 @@ class TestElementObjects(SoupTest):
|
||||||
tag = soup.bTag
|
tag = soup.bTag
|
||||||
self.assertEqual(soup.b, tag)
|
self.assertEqual(soup.b, tag)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
'.bTag is deprecated, use .find("b") instead.',
|
'.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")',
|
||||||
str(w[0].message))
|
str(w[0].message))
|
||||||
|
|
||||||
def test_has_attr(self):
|
def test_has_attr(self):
|
||||||
|
@ -1349,19 +1463,19 @@ class TestPersistence(SoupTest):
|
||||||
soup = BeautifulSoup(b'<p> </p>', 'html.parser')
|
soup = BeautifulSoup(b'<p> </p>', 'html.parser')
|
||||||
encoding = soup.original_encoding
|
encoding = soup.original_encoding
|
||||||
copy = soup.__copy__()
|
copy = soup.__copy__()
|
||||||
self.assertEqual(u"<p> </p>", unicode(copy))
|
self.assertEqual("<p> </p>", str(copy))
|
||||||
self.assertEqual(encoding, copy.original_encoding)
|
self.assertEqual(encoding, copy.original_encoding)
|
||||||
|
|
||||||
def test_unicode_pickle(self):
|
def test_unicode_pickle(self):
|
||||||
# A tree containing Unicode characters can be pickled.
|
# A tree containing Unicode characters can be pickled.
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
|
dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
|
||||||
loaded = pickle.loads(dumped)
|
loaded = pickle.loads(dumped)
|
||||||
self.assertEqual(loaded.decode(), soup.decode())
|
self.assertEqual(loaded.decode(), soup.decode())
|
||||||
|
|
||||||
def test_copy_navigablestring_is_not_attached_to_tree(self):
|
def test_copy_navigablestring_is_not_attached_to_tree(self):
|
||||||
html = u"<b>Foo<a></a></b><b>Bar</b>"
|
html = "<b>Foo<a></a></b><b>Bar</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
s1 = soup.find(string="Foo")
|
s1 = soup.find(string="Foo")
|
||||||
s2 = copy.copy(s1)
|
s2 = copy.copy(s1)
|
||||||
|
@ -1373,7 +1487,7 @@ class TestPersistence(SoupTest):
|
||||||
self.assertEqual(None, s2.previous_element)
|
self.assertEqual(None, s2.previous_element)
|
||||||
|
|
||||||
def test_copy_navigablestring_subclass_has_same_type(self):
|
def test_copy_navigablestring_subclass_has_same_type(self):
|
||||||
html = u"<b><!--Foo--></b>"
|
html = "<b><!--Foo--></b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
s1 = soup.string
|
s1 = soup.string
|
||||||
s2 = copy.copy(s1)
|
s2 = copy.copy(s1)
|
||||||
|
@ -1381,19 +1495,19 @@ class TestPersistence(SoupTest):
|
||||||
self.assertTrue(isinstance(s2, Comment))
|
self.assertTrue(isinstance(s2, Comment))
|
||||||
|
|
||||||
def test_copy_entire_soup(self):
|
def test_copy_entire_soup(self):
|
||||||
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
|
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
soup_copy = copy.copy(soup)
|
soup_copy = copy.copy(soup)
|
||||||
self.assertEqual(soup, soup_copy)
|
self.assertEqual(soup, soup_copy)
|
||||||
|
|
||||||
def test_copy_tag_copies_contents(self):
|
def test_copy_tag_copies_contents(self):
|
||||||
html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
|
html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
div = soup.div
|
div = soup.div
|
||||||
div_copy = copy.copy(div)
|
div_copy = copy.copy(div)
|
||||||
|
|
||||||
# The two tags look the same, and evaluate to equal.
|
# The two tags look the same, and evaluate to equal.
|
||||||
self.assertEqual(unicode(div), unicode(div_copy))
|
self.assertEqual(str(div), str(div_copy))
|
||||||
self.assertEqual(div, div_copy)
|
self.assertEqual(div, div_copy)
|
||||||
|
|
||||||
# But they're not the same object.
|
# But they're not the same object.
|
||||||
|
@ -1409,67 +1523,75 @@ class TestPersistence(SoupTest):
|
||||||
class TestSubstitutions(SoupTest):
|
class TestSubstitutions(SoupTest):
|
||||||
|
|
||||||
def test_default_formatter_is_minimal(self):
|
def test_default_formatter_is_minimal(self):
|
||||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
decoded = soup.decode(formatter="minimal")
|
decoded = soup.decode(formatter="minimal")
|
||||||
# The < is converted back into < but the e-with-acute is left alone.
|
# The < is converted back into < but the e-with-acute is left alone.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded,
|
decoded,
|
||||||
self.document_for(
|
self.document_for(
|
||||||
u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||||
|
|
||||||
def test_formatter_html(self):
|
def test_formatter_html(self):
|
||||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
decoded = soup.decode(formatter="html")
|
decoded = soup.decode(formatter="html")
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded,
|
decoded,
|
||||||
self.document_for("<b><<Sacré bleu!>></b>"))
|
self.document_for("<br/><b><<Sacré bleu!>></b>"))
|
||||||
|
|
||||||
|
def test_formatter_html5(self):
|
||||||
|
markup = "<br><b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||||
|
soup = self.soup(markup)
|
||||||
|
decoded = soup.decode(formatter="html5")
|
||||||
|
self.assertEqual(
|
||||||
|
decoded,
|
||||||
|
self.document_for("<br><b><<Sacré bleu!>></b>"))
|
||||||
|
|
||||||
def test_formatter_minimal(self):
|
def test_formatter_minimal(self):
|
||||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
decoded = soup.decode(formatter="minimal")
|
decoded = soup.decode(formatter="minimal")
|
||||||
# The < is converted back into < but the e-with-acute is left alone.
|
# The < is converted back into < but the e-with-acute is left alone.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded,
|
decoded,
|
||||||
self.document_for(
|
self.document_for(
|
||||||
u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||||
|
|
||||||
def test_formatter_null(self):
|
def test_formatter_null(self):
|
||||||
markup = u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
markup = "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
decoded = soup.decode(formatter=None)
|
decoded = soup.decode(formatter=None)
|
||||||
# Neither the angle brackets nor the e-with-acute are converted.
|
# Neither the angle brackets nor the e-with-acute are converted.
|
||||||
# This is not valid HTML, but it's what the user wanted.
|
# This is not valid HTML, but it's what the user wanted.
|
||||||
self.assertEqual(decoded,
|
self.assertEqual(decoded,
|
||||||
self.document_for(u"<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
self.document_for("<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"))
|
||||||
|
|
||||||
def test_formatter_custom(self):
|
def test_formatter_custom(self):
|
||||||
markup = u"<b><foo></b><b>bar</b>"
|
markup = "<b><foo></b><b>bar</b><br/>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
decoded = soup.decode(formatter = lambda x: x.upper())
|
decoded = soup.decode(formatter = lambda x: x.upper())
|
||||||
# Instead of normal entity conversion code, the custom
|
# Instead of normal entity conversion code, the custom
|
||||||
# callable is called on every string.
|
# callable is called on every string.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
decoded,
|
decoded,
|
||||||
self.document_for(u"<b><FOO></b><b>BAR</b>"))
|
self.document_for("<b><FOO></b><b>BAR</b><br/>"))
|
||||||
|
|
||||||
def test_formatter_is_run_on_attribute_values(self):
|
def test_formatter_is_run_on_attribute_values(self):
|
||||||
markup = u'<a href="http://a.com?a=b&c=é">e</a>'
|
markup = '<a href="http://a.com?a=b&c=é">e</a>'
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
a = soup.a
|
a = soup.a
|
||||||
|
|
||||||
expect_minimal = u'<a href="http://a.com?a=b&c=é">e</a>'
|
expect_minimal = '<a href="http://a.com?a=b&c=é">e</a>'
|
||||||
|
|
||||||
self.assertEqual(expect_minimal, a.decode())
|
self.assertEqual(expect_minimal, a.decode())
|
||||||
self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
|
self.assertEqual(expect_minimal, a.decode(formatter="minimal"))
|
||||||
|
|
||||||
expect_html = u'<a href="http://a.com?a=b&c=é">e</a>'
|
expect_html = '<a href="http://a.com?a=b&c=é">e</a>'
|
||||||
self.assertEqual(expect_html, a.decode(formatter="html"))
|
self.assertEqual(expect_html, a.decode(formatter="html"))
|
||||||
|
|
||||||
self.assertEqual(markup, a.decode(formatter=None))
|
self.assertEqual(markup, a.decode(formatter=None))
|
||||||
expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
|
||||||
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
|
||||||
|
|
||||||
def test_formatter_skips_script_tag_for_html_documents(self):
|
def test_formatter_skips_script_tag_for_html_documents(self):
|
||||||
|
@ -1491,28 +1613,28 @@ class TestSubstitutions(SoupTest):
|
||||||
self.assertTrue(b"< < hey > >" in encoded)
|
self.assertTrue(b"< < hey > >" in encoded)
|
||||||
|
|
||||||
def test_prettify_leaves_preformatted_text_alone(self):
|
def test_prettify_leaves_preformatted_text_alone(self):
|
||||||
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz ")
|
soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
|
||||||
# Everything outside the <pre> tag is reformatted, but everything
|
# Everything outside the <pre> tag is reformatted, but everything
|
||||||
# inside is left alone.
|
# inside is left alone.
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
u'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n</div>',
|
'<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>',
|
||||||
soup.div.prettify())
|
soup.div.prettify())
|
||||||
|
|
||||||
def test_prettify_accepts_formatter(self):
|
def test_prettify_accepts_formatter_function(self):
|
||||||
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
|
||||||
pretty = soup.prettify(formatter = lambda x: x.upper())
|
pretty = soup.prettify(formatter = lambda x: x.upper())
|
||||||
self.assertTrue("FOO" in pretty)
|
self.assertTrue("FOO" in pretty)
|
||||||
|
|
||||||
def test_prettify_outputs_unicode_by_default(self):
|
def test_prettify_outputs_unicode_by_default(self):
|
||||||
soup = self.soup("<a></a>")
|
soup = self.soup("<a></a>")
|
||||||
self.assertEqual(unicode, type(soup.prettify()))
|
self.assertEqual(str, type(soup.prettify()))
|
||||||
|
|
||||||
def test_prettify_can_encode_data(self):
|
def test_prettify_can_encode_data(self):
|
||||||
soup = self.soup("<a></a>")
|
soup = self.soup("<a></a>")
|
||||||
self.assertEqual(bytes, type(soup.prettify("utf-8")))
|
self.assertEqual(bytes, type(soup.prettify("utf-8")))
|
||||||
|
|
||||||
def test_html_entity_substitution_off_by_default(self):
|
def test_html_entity_substitution_off_by_default(self):
|
||||||
markup = u"<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
|
markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
encoded = soup.b.encode("utf-8")
|
encoded = soup.b.encode("utf-8")
|
||||||
self.assertEqual(encoded, markup.encode('utf-8'))
|
self.assertEqual(encoded, markup.encode('utf-8'))
|
||||||
|
@ -1556,54 +1678,77 @@ class TestEncoding(SoupTest):
|
||||||
"""Test the ability to encode objects into strings."""
|
"""Test the ability to encode objects into strings."""
|
||||||
|
|
||||||
def test_unicode_string_can_be_encoded(self):
|
def test_unicode_string_can_be_encoded(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(soup.b.string.encode("utf-8"),
|
self.assertEqual(soup.b.string.encode("utf-8"),
|
||||||
u"\N{SNOWMAN}".encode("utf-8"))
|
"\N{SNOWMAN}".encode("utf-8"))
|
||||||
|
|
||||||
def test_tag_containing_unicode_string_can_be_encoded(self):
|
def test_tag_containing_unicode_string_can_be_encoded(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
soup.b.encode("utf-8"), html.encode("utf-8"))
|
soup.b.encode("utf-8"), html.encode("utf-8"))
|
||||||
|
|
||||||
def test_encoding_substitutes_unrecognized_characters_by_default(self):
|
def test_encoding_substitutes_unrecognized_characters_by_default(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>")
|
self.assertEqual(soup.b.encode("ascii"), b"<b>☃</b>")
|
||||||
|
|
||||||
def test_encoding_can_be_made_strict(self):
|
def test_encoding_can_be_made_strict(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
|
UnicodeEncodeError, soup.encode, "ascii", errors="strict")
|
||||||
|
|
||||||
def test_decode_contents(self):
|
def test_decode_contents(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(u"\N{SNOWMAN}", soup.b.decode_contents())
|
self.assertEqual("\N{SNOWMAN}", soup.b.decode_contents())
|
||||||
|
|
||||||
def test_encode_contents(self):
|
def test_encode_contents(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
u"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
|
"\N{SNOWMAN}".encode("utf8"), soup.b.encode_contents(
|
||||||
encoding="utf8"))
|
encoding="utf8"))
|
||||||
|
|
||||||
def test_deprecated_renderContents(self):
|
def test_deprecated_renderContents(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
|
"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
|
||||||
|
|
||||||
def test_repr(self):
|
def test_repr(self):
|
||||||
html = u"<b>\N{SNOWMAN}</b>"
|
html = "<b>\N{SNOWMAN}</b>"
|
||||||
soup = self.soup(html)
|
soup = self.soup(html)
|
||||||
if PY3K:
|
if PY3K:
|
||||||
self.assertEqual(html, repr(soup))
|
self.assertEqual(html, repr(soup))
|
||||||
else:
|
else:
|
||||||
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
|
self.assertEqual(b'<b>\\u2603</b>', repr(soup))
|
||||||
|
|
||||||
|
class TestFormatter(SoupTest):
|
||||||
|
|
||||||
|
def test_sort_attributes(self):
|
||||||
|
# Test the ability to override Formatter.attributes() to,
|
||||||
|
# e.g., disable the normal sorting of attributes.
|
||||||
|
class UnsortedFormatter(Formatter):
|
||||||
|
def attributes(self, tag):
|
||||||
|
self.called_with = tag
|
||||||
|
for k, v in sorted(tag.attrs.items()):
|
||||||
|
if k == 'ignore':
|
||||||
|
continue
|
||||||
|
yield k,v
|
||||||
|
|
||||||
|
soup = self.soup('<p cval="1" aval="2" ignore="ignored"></p>')
|
||||||
|
formatter = UnsortedFormatter()
|
||||||
|
decoded = soup.decode(formatter=formatter)
|
||||||
|
|
||||||
|
# attributes() was called on the <p> tag. It filtered out one
|
||||||
|
# attribute and sorted the other two.
|
||||||
|
self.assertEqual(formatter.called_with, soup.p)
|
||||||
|
self.assertEqual('<p aval="2" cval="1"></p>', decoded)
|
||||||
|
|
||||||
|
|
||||||
class TestNavigableStringSubclasses(SoupTest):
|
class TestNavigableStringSubclasses(SoupTest):
|
||||||
|
|
||||||
def test_cdata(self):
|
def test_cdata(self):
|
||||||
|
@ -1720,7 +1865,7 @@ class TestSoupSelector(TreeTest):
|
||||||
els = self.soup.select('title')
|
els = self.soup.select('title')
|
||||||
self.assertEqual(len(els), 1)
|
self.assertEqual(len(els), 1)
|
||||||
self.assertEqual(els[0].name, 'title')
|
self.assertEqual(els[0].name, 'title')
|
||||||
self.assertEqual(els[0].contents, [u'The title'])
|
self.assertEqual(els[0].contents, ['The title'])
|
||||||
|
|
||||||
def test_one_tag_many(self):
|
def test_one_tag_many(self):
|
||||||
els = self.soup.select('div')
|
els = self.soup.select('div')
|
||||||
|
@ -1755,7 +1900,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual(len(self.soup.select('del')), 0)
|
self.assertEqual(len(self.soup.select('del')), 0)
|
||||||
|
|
||||||
def test_invalid_tag(self):
|
def test_invalid_tag(self):
|
||||||
self.assertRaises(ValueError, self.soup.select, 'tag%t')
|
self.assertRaises(SyntaxError, self.soup.select, 'tag%t')
|
||||||
|
|
||||||
def test_select_dashed_tag_ids(self):
|
def test_select_dashed_tag_ids(self):
|
||||||
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
|
self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
|
||||||
|
@ -1766,7 +1911,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual(dashed[0]['id'], 'dash2')
|
self.assertEqual(dashed[0]['id'], 'dash2')
|
||||||
|
|
||||||
def test_dashed_tag_text(self):
|
def test_dashed_tag_text(self):
|
||||||
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
|
self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, 'Hello there.')
|
||||||
|
|
||||||
def test_select_dashed_matches_find_all(self):
|
def test_select_dashed_matches_find_all(self):
|
||||||
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
|
self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
|
||||||
|
@ -1946,32 +2091,31 @@ class TestSoupSelector(TreeTest):
|
||||||
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
|
NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
|
||||||
|
|
||||||
self.assertRaises(
|
self.assertRaises(
|
||||||
NotImplementedError, self.soup.select, "a:nth-of-type(a)")
|
SyntaxError, self.soup.select, "a:nth-of-type(a)")
|
||||||
|
|
||||||
|
|
||||||
def test_nth_of_type(self):
|
def test_nth_of_type(self):
|
||||||
# Try to select first paragraph
|
# Try to select first paragraph
|
||||||
els = self.soup.select('div#inner p:nth-of-type(1)')
|
els = self.soup.select('div#inner p:nth-of-type(1)')
|
||||||
self.assertEqual(len(els), 1)
|
self.assertEqual(len(els), 1)
|
||||||
self.assertEqual(els[0].string, u'Some text')
|
self.assertEqual(els[0].string, 'Some text')
|
||||||
|
|
||||||
# Try to select third paragraph
|
# Try to select third paragraph
|
||||||
els = self.soup.select('div#inner p:nth-of-type(3)')
|
els = self.soup.select('div#inner p:nth-of-type(3)')
|
||||||
self.assertEqual(len(els), 1)
|
self.assertEqual(len(els), 1)
|
||||||
self.assertEqual(els[0].string, u'Another')
|
self.assertEqual(els[0].string, 'Another')
|
||||||
|
|
||||||
# Try to select (non-existent!) fourth paragraph
|
# Try to select (non-existent!) fourth paragraph
|
||||||
els = self.soup.select('div#inner p:nth-of-type(4)')
|
els = self.soup.select('div#inner p:nth-of-type(4)')
|
||||||
self.assertEqual(len(els), 0)
|
self.assertEqual(len(els), 0)
|
||||||
|
|
||||||
# Pass in an invalid value.
|
# Zero will select no tags.
|
||||||
self.assertRaises(
|
els = self.soup.select('div p:nth-of-type(0)')
|
||||||
ValueError, self.soup.select, 'div p:nth-of-type(0)')
|
self.assertEqual(len(els), 0)
|
||||||
|
|
||||||
def test_nth_of_type_direct_descendant(self):
|
def test_nth_of_type_direct_descendant(self):
|
||||||
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
els = self.soup.select('div#inner > p:nth-of-type(1)')
|
||||||
self.assertEqual(len(els), 1)
|
self.assertEqual(len(els), 1)
|
||||||
self.assertEqual(els[0].string, u'Some text')
|
self.assertEqual(els[0].string, 'Some text')
|
||||||
|
|
||||||
def test_id_child_selector_nth_of_type(self):
|
def test_id_child_selector_nth_of_type(self):
|
||||||
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
|
self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
|
||||||
|
@ -2003,7 +2147,7 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
self.assertEqual([], self.soup.select('#inner ~ h2'))
|
||||||
|
|
||||||
def test_dangling_combinator(self):
|
def test_dangling_combinator(self):
|
||||||
self.assertRaises(ValueError, self.soup.select, 'h1 >')
|
self.assertRaises(SyntaxError, self.soup.select, 'h1 >')
|
||||||
|
|
||||||
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
def test_sibling_combinator_wont_select_same_tag_twice(self):
|
||||||
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
|
||||||
|
@ -2034,8 +2178,8 @@ class TestSoupSelector(TreeTest):
|
||||||
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
self.assertSelects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
|
||||||
|
|
||||||
def test_invalid_multiple_select(self):
|
def test_invalid_multiple_select(self):
|
||||||
self.assertRaises(ValueError, self.soup.select, ',x, y')
|
self.assertRaises(SyntaxError, self.soup.select, ',x, y')
|
||||||
self.assertRaises(ValueError, self.soup.select, 'x,,y')
|
self.assertRaises(SyntaxError, self.soup.select, 'x,,y')
|
||||||
|
|
||||||
def test_multiple_select_attrs(self):
|
def test_multiple_select_attrs(self):
|
||||||
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
|
||||||
|
@ -2046,5 +2190,16 @@ class TestSoupSelector(TreeTest):
|
||||||
def test_multiple_select_nested(self):
|
def test_multiple_select_nested(self):
|
||||||
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
|
self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
|
||||||
|
|
||||||
|
def test_select_duplicate_elements(self):
|
||||||
|
# When markup contains duplicate elements, a multiple select
|
||||||
|
# will find all of them.
|
||||||
|
markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
|
||||||
|
soup = BeautifulSoup(markup, 'html.parser')
|
||||||
|
selected = soup.select(".c1, .c2")
|
||||||
|
self.assertEqual(3, len(selected))
|
||||||
|
|
||||||
|
# Verify that find_all finds the same elements, though because
|
||||||
|
# of an implementation detail it finds them in a different
|
||||||
|
# order.
|
||||||
|
for element in soup.find_all(class_=['c1', 'c2']):
|
||||||
|
assert element in selected
|
||||||
|
|
|
@ -1,3 +0,0 @@
|
||||||
from pkgutil import extend_path
|
|
||||||
|
|
||||||
__path__ = extend_path(__path__, __name__)
|
|
|
@ -1,23 +0,0 @@
|
||||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
|
||||||
# Licensed to PSF under a Contributor Agreement.
|
|
||||||
|
|
||||||
"""Execute computations asynchronously using threads or processes."""
|
|
||||||
|
|
||||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
|
||||||
|
|
||||||
from concurrent.futures._base import (FIRST_COMPLETED,
|
|
||||||
FIRST_EXCEPTION,
|
|
||||||
ALL_COMPLETED,
|
|
||||||
CancelledError,
|
|
||||||
TimeoutError,
|
|
||||||
Future,
|
|
||||||
Executor,
|
|
||||||
wait,
|
|
||||||
as_completed)
|
|
||||||
from concurrent.futures.thread import ThreadPoolExecutor
|
|
||||||
|
|
||||||
try:
|
|
||||||
from concurrent.futures.process import ProcessPoolExecutor
|
|
||||||
except ImportError:
|
|
||||||
# some platforms don't have multiprocessing
|
|
||||||
pass
|
|
|
@ -1,607 +0,0 @@
|
||||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
|
||||||
# Licensed to PSF under a Contributor Agreement.
|
|
||||||
|
|
||||||
import collections
|
|
||||||
import logging
|
|
||||||
import threading
|
|
||||||
import itertools
|
|
||||||
import time
|
|
||||||
|
|
||||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
|
||||||
|
|
||||||
FIRST_COMPLETED = 'FIRST_COMPLETED'
|
|
||||||
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
|
|
||||||
ALL_COMPLETED = 'ALL_COMPLETED'
|
|
||||||
_AS_COMPLETED = '_AS_COMPLETED'
|
|
||||||
|
|
||||||
# Possible future states (for internal use by the futures package).
|
|
||||||
PENDING = 'PENDING'
|
|
||||||
RUNNING = 'RUNNING'
|
|
||||||
# The future was cancelled by the user...
|
|
||||||
CANCELLED = 'CANCELLED'
|
|
||||||
# ...and _Waiter.add_cancelled() was called by a worker.
|
|
||||||
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
|
|
||||||
FINISHED = 'FINISHED'
|
|
||||||
|
|
||||||
_FUTURE_STATES = [
|
|
||||||
PENDING,
|
|
||||||
RUNNING,
|
|
||||||
CANCELLED,
|
|
||||||
CANCELLED_AND_NOTIFIED,
|
|
||||||
FINISHED
|
|
||||||
]
|
|
||||||
|
|
||||||
_STATE_TO_DESCRIPTION_MAP = {
|
|
||||||
PENDING: "pending",
|
|
||||||
RUNNING: "running",
|
|
||||||
CANCELLED: "cancelled",
|
|
||||||
CANCELLED_AND_NOTIFIED: "cancelled",
|
|
||||||
FINISHED: "finished"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Logger for internal use by the futures package.
|
|
||||||
LOGGER = logging.getLogger("concurrent.futures")
|
|
||||||
|
|
||||||
class Error(Exception):
|
|
||||||
"""Base class for all future-related exceptions."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class CancelledError(Error):
|
|
||||||
"""The Future was cancelled."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class TimeoutError(Error):
|
|
||||||
"""The operation exceeded the given deadline."""
|
|
||||||
pass
|
|
||||||
|
|
||||||
class _Waiter(object):
|
|
||||||
"""Provides the event that wait() and as_completed() block on."""
|
|
||||||
def __init__(self):
|
|
||||||
self.event = threading.Event()
|
|
||||||
self.finished_futures = []
|
|
||||||
|
|
||||||
def add_result(self, future):
|
|
||||||
self.finished_futures.append(future)
|
|
||||||
|
|
||||||
def add_exception(self, future):
|
|
||||||
self.finished_futures.append(future)
|
|
||||||
|
|
||||||
def add_cancelled(self, future):
|
|
||||||
self.finished_futures.append(future)
|
|
||||||
|
|
||||||
class _AsCompletedWaiter(_Waiter):
|
|
||||||
"""Used by as_completed()."""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
super(_AsCompletedWaiter, self).__init__()
|
|
||||||
self.lock = threading.Lock()
|
|
||||||
|
|
||||||
def add_result(self, future):
|
|
||||||
with self.lock:
|
|
||||||
super(_AsCompletedWaiter, self).add_result(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
def add_exception(self, future):
|
|
||||||
with self.lock:
|
|
||||||
super(_AsCompletedWaiter, self).add_exception(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
def add_cancelled(self, future):
|
|
||||||
with self.lock:
|
|
||||||
super(_AsCompletedWaiter, self).add_cancelled(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
class _FirstCompletedWaiter(_Waiter):
|
|
||||||
"""Used by wait(return_when=FIRST_COMPLETED)."""
|
|
||||||
|
|
||||||
def add_result(self, future):
|
|
||||||
super(_FirstCompletedWaiter, self).add_result(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
def add_exception(self, future):
|
|
||||||
super(_FirstCompletedWaiter, self).add_exception(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
def add_cancelled(self, future):
|
|
||||||
super(_FirstCompletedWaiter, self).add_cancelled(future)
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
class _AllCompletedWaiter(_Waiter):
|
|
||||||
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
|
|
||||||
|
|
||||||
def __init__(self, num_pending_calls, stop_on_exception):
|
|
||||||
self.num_pending_calls = num_pending_calls
|
|
||||||
self.stop_on_exception = stop_on_exception
|
|
||||||
self.lock = threading.Lock()
|
|
||||||
super(_AllCompletedWaiter, self).__init__()
|
|
||||||
|
|
||||||
def _decrement_pending_calls(self):
|
|
||||||
with self.lock:
|
|
||||||
self.num_pending_calls -= 1
|
|
||||||
if not self.num_pending_calls:
|
|
||||||
self.event.set()
|
|
||||||
|
|
||||||
def add_result(self, future):
|
|
||||||
super(_AllCompletedWaiter, self).add_result(future)
|
|
||||||
self._decrement_pending_calls()
|
|
||||||
|
|
||||||
def add_exception(self, future):
|
|
||||||
super(_AllCompletedWaiter, self).add_exception(future)
|
|
||||||
if self.stop_on_exception:
|
|
||||||
self.event.set()
|
|
||||||
else:
|
|
||||||
self._decrement_pending_calls()
|
|
||||||
|
|
||||||
def add_cancelled(self, future):
|
|
||||||
super(_AllCompletedWaiter, self).add_cancelled(future)
|
|
||||||
self._decrement_pending_calls()
|
|
||||||
|
|
||||||
class _AcquireFutures(object):
|
|
||||||
"""A context manager that does an ordered acquire of Future conditions."""
|
|
||||||
|
|
||||||
def __init__(self, futures):
|
|
||||||
self.futures = sorted(futures, key=id)
|
|
||||||
|
|
||||||
def __enter__(self):
|
|
||||||
for future in self.futures:
|
|
||||||
future._condition.acquire()
|
|
||||||
|
|
||||||
def __exit__(self, *args):
|
|
||||||
for future in self.futures:
|
|
||||||
future._condition.release()
|
|
||||||
|
|
||||||
def _create_and_install_waiters(fs, return_when):
|
|
||||||
if return_when == _AS_COMPLETED:
|
|
||||||
waiter = _AsCompletedWaiter()
|
|
||||||
elif return_when == FIRST_COMPLETED:
|
|
||||||
waiter = _FirstCompletedWaiter()
|
|
||||||
else:
|
|
||||||
pending_count = sum(
|
|
||||||
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
|
|
||||||
|
|
||||||
if return_when == FIRST_EXCEPTION:
|
|
||||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
|
|
||||||
elif return_when == ALL_COMPLETED:
|
|
||||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
|
|
||||||
else:
|
|
||||||
raise ValueError("Invalid return condition: %r" % return_when)
|
|
||||||
|
|
||||||
for f in fs:
|
|
||||||
f._waiters.append(waiter)
|
|
||||||
|
|
||||||
return waiter
|
|
||||||
|
|
||||||
def as_completed(fs, timeout=None):
    """An iterator over the given futures that yields each as it completes.

    Args:
        fs: The sequence of Futures (possibly created by different Executors) to
            iterate over.
        timeout: The maximum number of seconds to wait. If None, then there
            is no limit on the wait time.

    Returns:
        An iterator that yields the given Futures as they complete (finished or
        cancelled). If any given Futures are duplicated, they will be returned
        once.

    Raises:
        TimeoutError: If the entire result iterator could not be generated
            before the given timeout.
    """
    if timeout is not None:
        end_time = timeout + time.time()

    # Deduplicate; each future is yielded at most once.
    fs = set(fs)
    # Hold every future's condition while snapshotting so no future can
    # change state between computing 'finished' and installing the waiter.
    with _AcquireFutures(fs):
        finished = set(
                f for f in fs
                if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
        pending = fs - finished
        waiter = _create_and_install_waiters(fs, _AS_COMPLETED)

    try:
        # Anything already done can be yielded immediately.
        for future in finished:
            yield future

        while pending:
            if timeout is None:
                wait_timeout = None
            else:
                wait_timeout = end_time - time.time()
                if wait_timeout < 0:
                    raise TimeoutError(
                            '%d (of %d) futures unfinished' % (
                            len(pending), len(fs)))

            waiter.event.wait(wait_timeout)

            # Atomically drain the batch of newly finished futures and
            # reset the event so the next wait blocks until more complete.
            with waiter.lock:
                finished = waiter.finished_futures
                waiter.finished_futures = []
                waiter.event.clear()

            for future in finished:
                yield future
                pending.remove(future)

    finally:
        # Always detach the waiter, even on timeout or if the caller
        # abandons the generator early.
        for f in fs:
            with f._condition:
                f._waiters.remove(waiter)
# Return type of wait(): 'done' holds completed (finished or cancelled)
# futures, 'not_done' holds the rest.
DoneAndNotDoneFutures = collections.namedtuple(
    'DoneAndNotDoneFutures', 'done not_done')
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
    """Wait for the futures in the given sequence to complete.

    Args:
        fs: The sequence of Futures (possibly created by different Executors) to
            wait upon.
        timeout: The maximum number of seconds to wait. If None, then there
            is no limit on the wait time.
        return_when: Indicates when this function should return. The options
            are:

            FIRST_COMPLETED - Return when any future finishes or is
                              cancelled.
            FIRST_EXCEPTION - Return when any future finishes by raising an
                              exception. If no future raises an exception
                              then it is equivalent to ALL_COMPLETED.
            ALL_COMPLETED - Return when all futures finish or are cancelled.

    Returns:
        A named 2-tuple of sets. The first set, named 'done', contains the
        futures that completed (is finished or cancelled) before the wait
        completed. The second set, named 'not_done', contains uncompleted
        futures.
    """
    # Materialize (and deduplicate) up front: 'fs' may be a one-shot
    # iterator, and this function iterates it several times
    # (_AcquireFutures, the 'done' comprehension, len(fs)) -- without this
    # a generator argument would silently lose futures.
    fs = set(fs)
    with _AcquireFutures(fs):
        done = set(f for f in fs
                   if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
        not_done = fs - done

        if (return_when == FIRST_COMPLETED) and done:
            return DoneAndNotDoneFutures(done, not_done)
        elif (return_when == FIRST_EXCEPTION) and done:
            if any(f for f in done
                   if not f.cancelled() and f.exception() is not None):
                return DoneAndNotDoneFutures(done, not_done)

        # With 'fs' deduplicated, this comparison is exact.
        if len(done) == len(fs):
            return DoneAndNotDoneFutures(done, not_done)

        waiter = _create_and_install_waiters(fs, return_when)

    waiter.event.wait(timeout)
    for f in fs:
        with f._condition:
            f._waiters.remove(waiter)

    done.update(waiter.finished_futures)
    return DoneAndNotDoneFutures(done, fs - done)
class Future(object):
    """Represents the result of an asynchronous computation."""

    def __init__(self):
        """Initializes the future. Should not be called by clients."""
        self._condition = threading.Condition()  # guards all mutable state
        self._state = PENDING
        self._result = None
        self._exception = None
        # Traceback kept separately so Python 2's three-expression raise
        # can re-raise with the original stack (see __get_result).
        self._traceback = None
        self._waiters = []          # waiters from wait()/as_completed()
        self._done_callbacks = []   # callables from add_done_callback()

    def _invoke_callbacks(self):
        # Callback errors are logged, never propagated, so one bad
        # callback cannot break the executor machinery or later callbacks.
        for callback in self._done_callbacks:
            try:
                callback(self)
            except Exception:
                LOGGER.exception('exception calling callback for %r', self)

    def __repr__(self):
        with self._condition:
            if self._state == FINISHED:
                if self._exception:
                    return '<Future at %s state=%s raised %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._exception.__class__.__name__)
                else:
                    return '<Future at %s state=%s returned %s>' % (
                        hex(id(self)),
                        _STATE_TO_DESCRIPTION_MAP[self._state],
                        self._result.__class__.__name__)
            return '<Future at %s state=%s>' % (
                hex(id(self)),
                _STATE_TO_DESCRIPTION_MAP[self._state])

    def cancel(self):
        """Cancel the future if possible.

        Returns True if the future was cancelled, False otherwise. A future
        cannot be cancelled if it is running or has already completed.
        """
        with self._condition:
            if self._state in [RUNNING, FINISHED]:
                return False

            # Already cancelled: idempotent success.
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                return True

            self._state = CANCELLED
            self._condition.notify_all()

        # Callbacks run outside the condition to avoid deadlocks if a
        # callback touches this future again.
        self._invoke_callbacks()
        return True

    def cancelled(self):
        """Return True if the future has cancelled."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]

    def running(self):
        """Return True if the future is currently executing."""
        with self._condition:
            return self._state == RUNNING

    def done(self):
        """Return True of the future was cancelled or finished executing."""
        with self._condition:
            return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]

    def __get_result(self):
        # Python 2 three-expression raise: re-raises the stored exception
        # with its original traceback.  Caller must hold self._condition.
        if self._exception:
            raise type(self._exception), self._exception, self._traceback
        else:
            return self._result

    def add_done_callback(self, fn):
        """Attaches a callable that will be called when the future finishes.

        Args:
            fn: A callable that will be called with this future as its only
                argument when the future completes or is cancelled. The callable
                will always be called by a thread in the same process in which
                it was added. If the future has already completed or been
                cancelled then the callable will be called immediately. These
                callables are called in the order that they were added.
        """
        with self._condition:
            if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
                self._done_callbacks.append(fn)
                return
        # Already done: invoke immediately, outside the condition.
        fn(self)

    def result(self, timeout=None):
        """Return the result of the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the result if the future
                isn't done. If None, then there is no limit on the wait time.

        Returns:
            The result of the call that the future represents.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
            Exception: If the call raised then that exception will be raised.
        """
        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()

            self._condition.wait(timeout)

            # wait() may return on timeout as well as on notify, so the
            # state must be re-checked before deciding the outcome.
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self.__get_result()
            else:
                raise TimeoutError()

    def exception_info(self, timeout=None):
        """Return a tuple of (exception, traceback) raised by the call that the
        future represents.

        Args:
            timeout: The number of seconds to wait for the exception if the
                future isn't done. If None, then there is no limit on the wait
                time.

        Returns:
            The exception raised by the call that the future represents or None
            if the call completed without raising.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
        """
        with self._condition:
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception, self._traceback

            self._condition.wait(timeout)

            # Same double-check as result(): wait() can return on timeout.
            if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
                raise CancelledError()
            elif self._state == FINISHED:
                return self._exception, self._traceback
            else:
                raise TimeoutError()

    def exception(self, timeout=None):
        """Return the exception raised by the call that the future represents.

        Args:
            timeout: The number of seconds to wait for the exception if the
                future isn't done. If None, then there is no limit on the wait
                time.

        Returns:
            The exception raised by the call that the future represents or None
            if the call completed without raising.

        Raises:
            CancelledError: If the future was cancelled.
            TimeoutError: If the future didn't finish executing before the given
                timeout.
        """
        return self.exception_info(timeout)[0]

    # The following methods should only be used by Executors and in tests.
    def set_running_or_notify_cancel(self):
        """Mark the future as running or process any cancel notifications.

        Should only be used by Executor implementations and unit tests.

        If the future has been cancelled (cancel() was called and returned
        True) then any threads waiting on the future completing (though calls
        to as_completed() or wait()) are notified and False is returned.

        If the future was not cancelled then it is put in the running state
        (future calls to running() will return True) and True is returned.

        This method should be called by Executor implementations before
        executing the work associated with this future. If this method returns
        False then the work should not be executed.

        Returns:
            False if the Future was cancelled, True otherwise.

        Raises:
            RuntimeError: if this method was already called or if set_result()
                or set_exception() was called.
        """
        with self._condition:
            if self._state == CANCELLED:
                self._state = CANCELLED_AND_NOTIFIED
                for waiter in self._waiters:
                    waiter.add_cancelled(self)
                # self._condition.notify_all() is not necessary because
                # self.cancel() triggers a notification.
                return False
            elif self._state == PENDING:
                self._state = RUNNING
                return True
            else:
                LOGGER.critical('Future %s in unexpected state: %s',
                                id(self),
                                self._state)
                raise RuntimeError('Future in unexpected state')

    def set_result(self, result):
        """Sets the return value of work associated with the future.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._result = result
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_result(self)
            self._condition.notify_all()
        # Callbacks run outside the condition to avoid deadlocks.
        self._invoke_callbacks()

    def set_exception_info(self, exception, traceback):
        """Sets the result of the future as being the given exception
        and traceback.

        Should only be used by Executor implementations and unit tests.
        """
        with self._condition:
            self._exception = exception
            self._traceback = traceback
            self._state = FINISHED
            for waiter in self._waiters:
                waiter.add_exception(self)
            self._condition.notify_all()
        # Callbacks run outside the condition to avoid deadlocks.
        self._invoke_callbacks()

    def set_exception(self, exception):
        """Sets the result of the future as being the given exception.

        Should only be used by Executor implementations and unit tests.
        """
        self.set_exception_info(exception, None)
class Executor(object):
    """This is an abstract base class for concrete asynchronous executors."""

    def submit(self, fn, *args, **kwargs):
        """Submits a callable to be executed with the given arguments.

        Schedules the callable to be executed as fn(*args, **kwargs) and returns
        a Future instance representing the execution of the callable.

        Returns:
            A Future representing the given call.
        """
        raise NotImplementedError()

    def map(self, fn, *iterables, **kwargs):
        """Returns a iterator equivalent to map(fn, iter).

        Args:
            fn: A callable that will take as many arguments as there are
                passed iterables.
            timeout: The maximum number of seconds to wait. If None, then there
                is no limit on the wait time.

        Returns:
            An iterator equivalent to: map(func, *iterables) but the calls may
            be evaluated out-of-order.

        Raises:
            TimeoutError: If the entire result iterator could not be generated
                before the given timeout.
            Exception: If fn(*args) raises for any values.
        """
        # 'timeout' arrives via **kwargs because Python 2 lacks
        # keyword-only arguments.
        timeout = kwargs.get('timeout')
        if timeout is not None:
            end_time = timeout + time.time()

        # All calls are submitted eagerly; izip is Python 2's lazy zip.
        fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]

        # Yield must be hidden in closure so that the futures are submitted
        # before the first iterator value is required.
        def result_iterator():
            try:
                for future in fs:
                    if timeout is None:
                        yield future.result()
                    else:
                        # Every result shares the single deadline computed
                        # above rather than a fresh per-future timeout.
                        yield future.result(end_time - time.time())
            finally:
                # Cancel what hasn't started if the caller stops iterating
                # early or a result raises.
                for future in fs:
                    future.cancel()
        return result_iterator()

    def shutdown(self, wait=True):
        """Clean-up the resources associated with the Executor.

        It is safe to call this method several times. Otherwise, no other
        methods can be called after this one.

        Args:
            wait: If True then shutdown will not return until all running
                futures have finished executing and the resources used by the
                executor have been reclaimed.
        """
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown(wait=True)
        return False
@ -1,359 +0,0 @@
|
||||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
|
||||||
# Licensed to PSF under a Contributor Agreement.
|
|
||||||
|
|
||||||
"""Implements ProcessPoolExecutor.
|
|
||||||
|
|
||||||
The follow diagram and text describe the data-flow through the system:
|
|
||||||
|
|
||||||
|======================= In-process =====================|== Out-of-process ==|
|
|
||||||
|
|
||||||
+----------+ +----------+ +--------+ +-----------+ +---------+
|
|
||||||
| | => | Work Ids | => | | => | Call Q | => | |
|
|
||||||
| | +----------+ | | +-----------+ | |
|
|
||||||
| | | ... | | | | ... | | |
|
|
||||||
| | | 6 | | | | 5, call() | | |
|
|
||||||
| | | 7 | | | | ... | | |
|
|
||||||
| Process | | ... | | Local | +-----------+ | Process |
|
|
||||||
| Pool | +----------+ | Worker | | #1..n |
|
|
||||||
| Executor | | Thread | | |
|
|
||||||
| | +----------- + | | +-----------+ | |
|
|
||||||
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
|
|
||||||
| | +------------+ | | +-----------+ | |
|
|
||||||
| | | 6: call() | | | | ... | | |
|
|
||||||
| | | future | | | | 4, result | | |
|
|
||||||
| | | ... | | | | 3, except | | |
|
|
||||||
+----------+ +------------+ +--------+ +-----------+ +---------+
|
|
||||||
|
|
||||||
Executor.submit() called:
|
|
||||||
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
|
|
||||||
- adds the id of the _WorkItem to the "Work Ids" queue
|
|
||||||
|
|
||||||
Local worker thread:
|
|
||||||
- reads work ids from the "Work Ids" queue and looks up the corresponding
|
|
||||||
WorkItem from the "Work Items" dict: if the work item has been cancelled then
|
|
||||||
it is simply removed from the dict, otherwise it is repackaged as a
|
|
||||||
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
|
|
||||||
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
|
|
||||||
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
|
|
||||||
- reads _ResultItems from "Result Q", updates the future stored in the
|
|
||||||
"Work Items" dict and deletes the dict entry
|
|
||||||
|
|
||||||
Process #1..n:
|
|
||||||
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
|
|
||||||
_ResultItems in "Request Q"
|
|
||||||
"""
|
|
||||||
|
|
||||||
import atexit
|
|
||||||
from concurrent.futures import _base
|
|
||||||
import Queue as queue
|
|
||||||
import multiprocessing
|
|
||||||
import threading
|
|
||||||
import weakref
|
|
||||||
import sys
|
|
||||||
|
|
||||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
|
||||||
|
|
||||||
# Workers are created as daemon threads and processes. This is done to allow the
|
|
||||||
# interpreter to exit when there are still idle processes in a
|
|
||||||
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
|
|
||||||
# allowing workers to die with the interpreter has two undesirable properties:
|
|
||||||
# - The workers would still be running during interpretor shutdown,
|
|
||||||
# meaning that they would fail in unpredictable ways.
|
|
||||||
# - The workers could be killed while evaluating a work item, which could
|
|
||||||
# be bad if the callable being evaluated has external side-effects e.g.
|
|
||||||
# writing to a file.
|
|
||||||
#
|
|
||||||
# To work around this problem, an exit handler is installed which tells the
|
|
||||||
# workers to exit when their work queues are empty and then waits until the
|
|
||||||
# threads/processes finish.
|
|
||||||
|
|
||||||
# Maps each queue-management thread to the result queue it drains; entries
# vanish automatically when a thread object is garbage collected.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False

def _python_exit():
    """atexit hook: wake every management thread, then wait for them all."""
    global _shutdown
    _shutdown = True
    live = list(_threads_queues.items()) if _threads_queues else ()
    # Post every sentinel first, then join, so no thread is left blocked
    # on an empty queue while we wait on another one.
    for _thread, wake_queue in live:
        wake_queue.put(None)
    for mgmt_thread, _queue in live:
        mgmt_thread.join(sys.maxint)
# Controls how many more calls than processes will be queued in the call queue.
# A smaller number will mean that processes spend more time idle waiting for
# work while a larger number will make Future.cancel() succeed less frequently
# (Futures in the call queue cannot be cancelled).
EXTRA_QUEUED_CALLS = 1
class _WorkItem(object):
    """Pairs a submitted call (fn/args/kwargs) with the Future tracking it."""

    def __init__(self, future, fn, args, kwargs):
        self.future, self.fn = future, fn
        self.args, self.kwargs = args, kwargs
class _ResultItem(object):
    """Outcome of one call, sent from a worker process back to the manager.

    Exactly one of 'exception'/'result' is meaningful for a finished call.
    """

    def __init__(self, work_id, exception=None, result=None):
        self.work_id = work_id
        self.exception, self.result = exception, result
class _CallItem(object):
    """Picklable description of one call, shipped to a worker process."""

    def __init__(self, work_id, fn, args, kwargs):
        self.work_id, self.fn = work_id, fn
        self.args, self.kwargs = args, kwargs
def _process_worker(call_queue, result_queue):
    """Evaluates calls from call_queue and places the results in result_queue.

    This worker is run in a separate process.

    Args:
        call_queue: A multiprocessing.Queue of _CallItems that will be read
            and evaluated by the worker.
        result_queue: A multiprocessing.Queue that _ResultItems will be
            written to by the worker.
    """
    # A None call item is the shutdown sentinel sent by the manager thread;
    # iter() with a sentinel stops the loop when it arrives.
    for call_item in iter(lambda: call_queue.get(block=True), None):
        try:
            outcome = call_item.fn(*call_item.args, **call_item.kwargs)
        except BaseException:
            # sys.exc_info()[1] rather than "except ... as e" keeps the
            # source compatible with very old interpreters.
            err = sys.exc_info()[1]
            result_queue.put(_ResultItem(call_item.work_id, exception=err))
        else:
            result_queue.put(_ResultItem(call_item.work_id, result=outcome))
    # Acknowledge shutdown and wake the queue management thread.
    result_queue.put(None)
def _add_call_item_to_queue(pending_work_items,
                            work_ids,
                            call_queue):
    """Fills call_queue with _WorkItems from pending_work_items.

    This function never blocks.

    Args:
        pending_work_items: A dict mapping work ids to _WorkItems e.g.
            {5: <_WorkItem...>, 6: <_WorkItem...>, ...}
        work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
            are consumed and the corresponding _WorkItems from
            pending_work_items are transformed into _CallItems and put in
            call_queue.
        call_queue: A multiprocessing.Queue that will be filled with _CallItems
            derived from _WorkItems.
    """
    while True:
        # Never over-fill the call queue: items placed there can no longer
        # be cancelled via Future.cancel().
        if call_queue.full():
            return
        try:
            work_id = work_ids.get(block=False)
        except queue.Empty:
            return
        else:
            work_item = pending_work_items[work_id]

            if work_item.future.set_running_or_notify_cancel():
                call_queue.put(_CallItem(work_id,
                                         work_item.fn,
                                         work_item.args,
                                         work_item.kwargs),
                               block=True)
            else:
                # Future was cancelled before it started: drop it and keep
                # draining work ids.
                del pending_work_items[work_id]
                continue
def _queue_management_worker(executor_reference,
                             processes,
                             pending_work_items,
                             work_ids_queue,
                             call_queue,
                             result_queue):
    """Manages the communication between this process and the worker processes.

    This function is run in a local thread.

    Args:
        executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
            this thread. Used to determine if the ProcessPoolExecutor has been
            garbage collected and that this function can exit.
        process: A list of the multiprocessing.Process instances used as
            workers.
        pending_work_items: A dict mapping work ids to _WorkItems e.g.
            {5: <_WorkItem...>, 6: <_WorkItem...>, ...}
        work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
        call_queue: A multiprocessing.Queue that will be filled with _CallItems
            derived from _WorkItems for processing by the process workers.
        result_queue: A multiprocessing.Queue of _ResultItems generated by the
            process workers.
    """
    # One-element list so the nested function can mutate the count
    # (Python 2 has no 'nonlocal').
    nb_shutdown_processes = [0]
    def shutdown_one_process():
        """Tell a worker to terminate, which will in turn wake us again"""
        call_queue.put(None)
        nb_shutdown_processes[0] += 1
    while True:
        _add_call_item_to_queue(pending_work_items,
                                work_ids_queue,
                                call_queue)

        # Blocks until a worker reports a result or something wakes us with
        # a None sentinel (submit(), shutdown(), or the weakref callback).
        result_item = result_queue.get(block=True)
        if result_item is not None:
            work_item = pending_work_items[result_item.work_id]
            del pending_work_items[result_item.work_id]

            if result_item.exception:
                work_item.future.set_exception(result_item.exception)
            else:
                work_item.future.set_result(result_item.result)
            # Delete references to object. See issue16284
            del work_item
        # Check whether we should start shutting down.
        executor = executor_reference()
        # No more work items can be added if:
        #   - The interpreter is shutting down OR
        #   - The executor that owns this worker has been collected OR
        #   - The executor that owns this worker has been shutdown.
        if _shutdown or executor is None or executor._shutdown_thread:
            # Since no new work items can be added, it is safe to shutdown
            # this thread if there are no pending work items.
            if not pending_work_items:
                while nb_shutdown_processes[0] < len(processes):
                    shutdown_one_process()
                # If .join() is not called on the created processes then
                # some multiprocessing.Queue methods may deadlock on Mac OS
                # X.
                for p in processes:
                    p.join()
                call_queue.close()
                return
        # Drop our strong reference so the executor can be collected while
        # this thread blocks on the next result_queue.get().
        del executor
# Cache for _check_system_limits(): checked-once flag and, if the platform
# is unusable, the error message to raise on every subsequent call.
_system_limits_checked = False
_system_limited = None
def _check_system_limits():
    """Raise NotImplementedError if the platform provides too few POSIX
    semaphores for multiprocessing to work; the verdict is cached in the
    two module-level globals above."""
    global _system_limits_checked, _system_limited
    if _system_limits_checked:
        if _system_limited:
            raise NotImplementedError(_system_limited)
    _system_limits_checked = True
    try:
        import os
        nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
    except (AttributeError, ValueError):
        # sysconf not available or setting not available
        return
    if nsems_max == -1:
        # indetermine limit, assume that limit is determined
        # by available memory only
        return
    if nsems_max >= 256:
        # minimum number of semaphores available
        # according to POSIX
        return
    _system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
    raise NotImplementedError(_system_limited)
class ProcessPoolExecutor(_base.Executor):
    """Executor that runs calls in a pool of worker processes."""

    def __init__(self, max_workers=None):
        """Initializes a new ProcessPoolExecutor instance.

        Args:
            max_workers: The maximum number of processes that can be used to
                execute the given calls. If None or not given then as many
                worker processes will be created as the machine has processors.
        """
        _check_system_limits()

        if max_workers is None:
            self._max_workers = multiprocessing.cpu_count()
        else:
            self._max_workers = max_workers

        # Make the call queue slightly larger than the number of processes to
        # prevent the worker processes from idling. But don't make it too big
        # because futures in the call queue cannot be cancelled.
        self._call_queue = multiprocessing.Queue(self._max_workers +
                                                 EXTRA_QUEUED_CALLS)
        self._result_queue = multiprocessing.Queue()
        self._work_ids = queue.Queue()
        self._queue_management_thread = None
        # Worker processes, created lazily by _adjust_process_count().
        self._processes = set()

        # Shutdown is a two-step process.
        self._shutdown_thread = False
        self._shutdown_lock = threading.Lock()
        # Monotonically increasing id assigned to each submitted work item.
        self._queue_count = 0
        self._pending_work_items = {}

    def _start_queue_management_thread(self):
        # When the executor gets lost, the weakref callback will wake up
        # the queue management thread.
        def weakref_cb(_, q=self._result_queue):
            q.put(None)
        if self._queue_management_thread is None:
            self._queue_management_thread = threading.Thread(
                    target=_queue_management_worker,
                    args=(weakref.ref(self, weakref_cb),
                          self._processes,
                          self._pending_work_items,
                          self._work_ids,
                          self._call_queue,
                          self._result_queue))
            self._queue_management_thread.daemon = True
            self._queue_management_thread.start()
            # Registering here lets _python_exit() wake and join this
            # thread at interpreter shutdown.
            _threads_queues[self._queue_management_thread] = self._result_queue

    def _adjust_process_count(self):
        # Spawn workers until the pool reaches its full size.
        for _ in range(len(self._processes), self._max_workers):
            p = multiprocessing.Process(
                    target=_process_worker,
                    args=(self._call_queue,
                          self._result_queue))
            p.start()
            self._processes.add(p)

    def submit(self, fn, *args, **kwargs):
        with self._shutdown_lock:
            if self._shutdown_thread:
                raise RuntimeError('cannot schedule new futures after shutdown')

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._pending_work_items[self._queue_count] = w
            self._work_ids.put(self._queue_count)
            self._queue_count += 1
            # Wake up queue management thread
            self._result_queue.put(None)

            self._start_queue_management_thread()
            self._adjust_process_count()
            return f
    submit.__doc__ = _base.Executor.submit.__doc__

    def shutdown(self, wait=True):
        with self._shutdown_lock:
            self._shutdown_thread = True
        if self._queue_management_thread:
            # Wake up queue management thread
            self._result_queue.put(None)
            if wait:
                self._queue_management_thread.join(sys.maxint)
        # To reduce the risk of openning too many files, remove references to
        # objects that use file descriptors.
        self._queue_management_thread = None
        self._call_queue = None
        self._result_queue = None
        self._processes = None
    shutdown.__doc__ = _base.Executor.shutdown.__doc__
|
|
||||||
# Ensure worker processes are told to exit and joined at interpreter exit.
atexit.register(_python_exit)
@ -1,134 +0,0 @@
|
||||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
|
||||||
# Licensed to PSF under a Contributor Agreement.
|
|
||||||
|
|
||||||
"""Implements ThreadPoolExecutor."""
|
|
||||||
|
|
||||||
import atexit
|
|
||||||
from concurrent.futures import _base
|
|
||||||
import Queue as queue
|
|
||||||
import threading
|
|
||||||
import weakref
|
|
||||||
import sys
|
|
||||||
|
|
||||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
|
||||||
|
|
||||||
# Workers are created as daemon threads. This is done to allow the interpreter
|
|
||||||
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
|
|
||||||
# pool (i.e. shutdown() was not called). However, allowing workers to die with
|
|
||||||
# the interpreter has two undesirable properties:
|
|
||||||
# - The workers would still be running during interpretor shutdown,
|
|
||||||
# meaning that they would fail in unpredictable ways.
|
|
||||||
# - The workers could be killed while evaluating a work item, which could
|
|
||||||
# be bad if the callable being evaluated has external side-effects e.g.
|
|
||||||
# writing to a file.
|
|
||||||
#
|
|
||||||
# To work around this problem, an exit handler is installed which tells the
|
|
||||||
# workers to exit when their work queues are empty and then waits until the
|
|
||||||
# threads finish.
|
|
||||||
|
|
||||||
_threads_queues = weakref.WeakKeyDictionary()
|
|
||||||
_shutdown = False
|
|
||||||
|
|
||||||
def _python_exit():
|
|
||||||
global _shutdown
|
|
||||||
_shutdown = True
|
|
||||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
|
||||||
for t, q in items:
|
|
||||||
q.put(None)
|
|
||||||
for t, q in items:
|
|
||||||
t.join(sys.maxint)
|
|
||||||
|
|
||||||
atexit.register(_python_exit)
|
|
||||||
|
|
||||||
class _WorkItem(object):
|
|
||||||
def __init__(self, future, fn, args, kwargs):
|
|
||||||
self.future = future
|
|
||||||
self.fn = fn
|
|
||||||
self.args = args
|
|
||||||
self.kwargs = kwargs
|
|
||||||
|
|
||||||
def run(self):
|
|
||||||
if not self.future.set_running_or_notify_cancel():
|
|
||||||
return
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = self.fn(*self.args, **self.kwargs)
|
|
||||||
except BaseException:
|
|
||||||
e, tb = sys.exc_info()[1:]
|
|
||||||
self.future.set_exception_info(e, tb)
|
|
||||||
else:
|
|
||||||
self.future.set_result(result)
|
|
||||||
|
|
||||||
def _worker(executor_reference, work_queue):
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
work_item = work_queue.get(block=True)
|
|
||||||
if work_item is not None:
|
|
||||||
work_item.run()
|
|
||||||
# Delete references to object. See issue16284
|
|
||||||
del work_item
|
|
||||||
continue
|
|
||||||
executor = executor_reference()
|
|
||||||
# Exit if:
|
|
||||||
# - The interpreter is shutting down OR
|
|
||||||
# - The executor that owns the worker has been collected OR
|
|
||||||
# - The executor that owns the worker has been shutdown.
|
|
||||||
if _shutdown or executor is None or executor._shutdown:
|
|
||||||
# Notice other workers
|
|
||||||
work_queue.put(None)
|
|
||||||
return
|
|
||||||
del executor
|
|
||||||
except BaseException:
|
|
||||||
_base.LOGGER.critical('Exception in worker', exc_info=True)
|
|
||||||
|
|
||||||
class ThreadPoolExecutor(_base.Executor):
|
|
||||||
def __init__(self, max_workers):
|
|
||||||
"""Initializes a new ThreadPoolExecutor instance.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
max_workers: The maximum number of threads that can be used to
|
|
||||||
execute the given calls.
|
|
||||||
"""
|
|
||||||
self._max_workers = max_workers
|
|
||||||
self._work_queue = queue.Queue()
|
|
||||||
self._threads = set()
|
|
||||||
self._shutdown = False
|
|
||||||
self._shutdown_lock = threading.Lock()
|
|
||||||
|
|
||||||
def submit(self, fn, *args, **kwargs):
|
|
||||||
with self._shutdown_lock:
|
|
||||||
if self._shutdown:
|
|
||||||
raise RuntimeError('cannot schedule new futures after shutdown')
|
|
||||||
|
|
||||||
f = _base.Future()
|
|
||||||
w = _WorkItem(f, fn, args, kwargs)
|
|
||||||
|
|
||||||
self._work_queue.put(w)
|
|
||||||
self._adjust_thread_count()
|
|
||||||
return f
|
|
||||||
submit.__doc__ = _base.Executor.submit.__doc__
|
|
||||||
|
|
||||||
def _adjust_thread_count(self):
|
|
||||||
# When the executor gets lost, the weakref callback will wake up
|
|
||||||
# the worker threads.
|
|
||||||
def weakref_cb(_, q=self._work_queue):
|
|
||||||
q.put(None)
|
|
||||||
# TODO(bquinlan): Should avoid creating new threads if there are more
|
|
||||||
# idle threads than items in the work queue.
|
|
||||||
if len(self._threads) < self._max_workers:
|
|
||||||
t = threading.Thread(target=_worker,
|
|
||||||
args=(weakref.ref(self, weakref_cb),
|
|
||||||
self._work_queue))
|
|
||||||
t.daemon = True
|
|
||||||
t.start()
|
|
||||||
self._threads.add(t)
|
|
||||||
_threads_queues[t] = self._work_queue
|
|
||||||
|
|
||||||
def shutdown(self, wait=True):
|
|
||||||
with self._shutdown_lock:
|
|
||||||
self._shutdown = True
|
|
||||||
self._work_queue.put(None)
|
|
||||||
if wait:
|
|
||||||
for t in self._threads:
|
|
||||||
t.join(sys.maxint)
|
|
||||||
shutdown.__doc__ = _base.Executor.shutdown.__doc__
|
|
|
@ -1,73 +1,6 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
# -*- coding: UTF-8 -*-
|
# -*- coding: UTF-8 -*-
|
||||||
|
|
||||||
"""Death by Captcha HTTP and socket API clients.
|
|
||||||
|
|
||||||
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
|
|
||||||
socket ones. Both offer the same functionalily, with the socket API
|
|
||||||
sporting faster responses and using way less connections.
|
|
||||||
|
|
||||||
To access the socket API, use SocketClient class; for the HTTP API, use
|
|
||||||
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
|
|
||||||
connection opened and serializes all API requests sent through it, thus
|
|
||||||
it is advised to keep a pool of them if you're script is heavily
|
|
||||||
multithreaded.
|
|
||||||
|
|
||||||
Both SocketClient and HttpClient give you the following methods:
|
|
||||||
|
|
||||||
get_user()
|
|
||||||
Returns your DBC account details as a dict with the following keys:
|
|
||||||
|
|
||||||
"user": your account numeric ID; if login fails, it will be the only
|
|
||||||
item with the value of 0;
|
|
||||||
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
|
|
||||||
solved CAPTCHA in US cents;
|
|
||||||
"balance": your DBC account balance in US cents;
|
|
||||||
"is_banned": flag indicating whether your account is suspended or not.
|
|
||||||
|
|
||||||
get_balance()
|
|
||||||
Returns your DBC account balance in US cents.
|
|
||||||
|
|
||||||
get_captcha(cid)
|
|
||||||
Returns an uploaded CAPTCHA details as a dict with the following keys:
|
|
||||||
|
|
||||||
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
|
|
||||||
be the only item with the value of 0;
|
|
||||||
"text": the CAPTCHA text, if solved, otherwise None;
|
|
||||||
"is_correct": flag indicating whether the CAPTCHA was solved correctly
|
|
||||||
(DBC can detect that in rare cases).
|
|
||||||
|
|
||||||
The only argument `cid` is the CAPTCHA numeric ID.
|
|
||||||
|
|
||||||
get_text(cid)
|
|
||||||
Returns an uploaded CAPTCHA text (None if not solved). The only argument
|
|
||||||
`cid` is the CAPTCHA numeric ID.
|
|
||||||
|
|
||||||
report(cid)
|
|
||||||
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
|
|
||||||
CAPTCHA numeric ID. Returns True on success, False otherwise.
|
|
||||||
|
|
||||||
upload(captcha)
|
|
||||||
Uploads a CAPTCHA. The only argument `captcha` can be either file-like
|
|
||||||
object (any object with `read` method defined, actually, so StringIO
|
|
||||||
will do), or CAPTCHA image file name. On successul upload you'll get
|
|
||||||
the CAPTCHA details dict (see get_captcha() method).
|
|
||||||
|
|
||||||
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
|
|
||||||
to poll for its status periodically using get_captcha() or get_text()
|
|
||||||
method until the CAPTCHA is solved and you get the text.
|
|
||||||
|
|
||||||
decode(captcha, timeout=DEFAULT_TIMEOUT)
|
|
||||||
A convenient method that uploads a CAPTCHA and polls for its status
|
|
||||||
periodically, but no longer than `timeout` (defaults to 60 seconds).
|
|
||||||
If solved, you'll get the CAPTCHA details dict (see get_captcha()
|
|
||||||
method for details). See upload() method for details on `captcha`
|
|
||||||
argument.
|
|
||||||
|
|
||||||
Visit http://www.deathbycaptcha.com/user/api for updates.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
import binascii
|
import binascii
|
||||||
import errno
|
import errno
|
||||||
|
@ -79,8 +12,7 @@ import socket
|
||||||
import sys
|
import sys
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import urllib
|
|
||||||
import urllib2
|
|
||||||
try:
|
try:
|
||||||
from json import read as json_decode, write as json_encode
|
from json import read as json_decode, write as json_encode
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -89,64 +21,71 @@ except ImportError:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from simplejson import loads as json_decode, dumps as json_encode
|
from simplejson import loads as json_decode, dumps as json_encode
|
||||||
|
|
||||||
|
try:
|
||||||
|
from urllib2 import build_opener, HTTPRedirectHandler, Request, HTTPError
|
||||||
|
from urllib import urlencode, urlopen
|
||||||
|
except ImportError:
|
||||||
|
from urllib.request import build_opener, HTTPRedirectHandler, Request, urlopen
|
||||||
|
from urllib.error import HTTPError
|
||||||
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
# API version and unique software ID
|
# API version and unique software ID
|
||||||
API_VERSION = 'DBC/Python v4.6'
|
API_VERSION = 'DBC/Python v4.0.11'
|
||||||
|
SOFTWARE_VENDOR_ID = 0
|
||||||
|
|
||||||
# Default CAPTCHA timeout and decode() polling interval
|
# Default CAPTCHA timeout and decode() polling interval
|
||||||
DEFAULT_TIMEOUT = 60
|
DEFAULT_TIMEOUT = 60
|
||||||
DEFAULT_TOKEN_TIMEOUT = 120
|
POLLS_INTERVAL = 5
|
||||||
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
|
|
||||||
DFLT_POLL_INTERVAL = 3
|
|
||||||
|
|
||||||
# Base HTTP API url
|
# Base HTTP API url
|
||||||
HTTP_BASE_URL = 'http://api.dbcapi.me/api'
|
HTTP_BASE_URL = 'http://api.deathbycaptcha.com/api'
|
||||||
|
|
||||||
# Preferred HTTP API server's response content type, do not change
|
# Preferred HTTP API server's response content type, do not change
|
||||||
HTTP_RESPONSE_TYPE = 'application/json'
|
HTTP_RESPONSE_TYPE = 'application/json'
|
||||||
|
|
||||||
# Socket API server's host & ports range
|
# Socket API server's host & ports range
|
||||||
SOCKET_HOST = 'api.dbcapi.me'
|
SOCKET_HOST = 'api.deathbycaptcha.com'
|
||||||
SOCKET_PORTS = range(8123, 8131)
|
SOCKET_PORTS = range(8123, 8131)
|
||||||
|
|
||||||
|
|
||||||
def _load_image(captcha):
|
|
||||||
if hasattr(captcha, 'read'):
|
|
||||||
img = captcha.read()
|
|
||||||
elif type(captcha) == bytearray:
|
|
||||||
img = captcha
|
|
||||||
else:
|
|
||||||
img = ''
|
|
||||||
try:
|
|
||||||
captcha_file = open(captcha, 'rb')
|
|
||||||
except Exception:
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
img = captcha_file.read()
|
|
||||||
captcha_file.close()
|
|
||||||
if not len(img):
|
|
||||||
raise ValueError('CAPTCHA image is empty')
|
|
||||||
elif imghdr.what(None, img) is None:
|
|
||||||
raise TypeError('Unknown CAPTCHA image type')
|
|
||||||
else:
|
|
||||||
return img
|
|
||||||
|
|
||||||
|
|
||||||
class AccessDeniedException(Exception):
|
class AccessDeniedException(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class Client(object):
|
class Client(object):
|
||||||
|
"""Death by Captcha API Client"""
|
||||||
"""Death by Captcha API Client."""
|
|
||||||
|
|
||||||
def __init__(self, username, password):
|
def __init__(self, username, password):
|
||||||
self.is_verbose = False
|
self.is_verbose = False
|
||||||
self.userpwd = {'username': username, 'password': password}
|
self.userpwd = {'username': username,
|
||||||
|
'password': password}
|
||||||
|
|
||||||
|
def _load_file(self, captcha):
|
||||||
|
if hasattr(captcha, 'read'):
|
||||||
|
raw_captcha = captcha.read()
|
||||||
|
elif isinstance(captcha, bytearray):
|
||||||
|
raw_captcha = captcha
|
||||||
|
elif os.path.isfile(captcha):
|
||||||
|
raw_captcha = ''
|
||||||
|
try:
|
||||||
|
f = open(captcha, 'rb')
|
||||||
|
except Exception as e:
|
||||||
|
raise e
|
||||||
|
else:
|
||||||
|
raw_captcha = f.read()
|
||||||
|
f.close()
|
||||||
|
else:
|
||||||
|
f_stream = urlopen(captcha)
|
||||||
|
raw_captcha = f_stream.read()
|
||||||
|
|
||||||
|
if not len(raw_captcha):
|
||||||
|
raise ValueError('CAPTCHA image is empty')
|
||||||
|
elif imghdr.what(None, raw_captcha) is None:
|
||||||
|
raise TypeError('Unknown CAPTCHA image type')
|
||||||
|
else:
|
||||||
|
return raw_captcha
|
||||||
|
|
||||||
def _log(self, cmd, msg=''):
|
def _log(self, cmd, msg=''):
|
||||||
if self.is_verbose:
|
if self.is_verbose:
|
||||||
print '%d %s %s' % (time.time(), cmd, msg.rstrip())
|
print('%d %s %s' % (time.time(), cmd, msg.rstrip()))
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
|
@ -156,16 +95,16 @@ class Client(object):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_user(self):
|
def get_user(self):
|
||||||
"""Fetch user details -- ID, balance, rate and banned status."""
|
"""Fetch the user's details dict -- balance, rate and banned status."""
|
||||||
raise NotImplementedError()
|
raise NotImplemented()
|
||||||
|
|
||||||
def get_balance(self):
|
def get_balance(self):
|
||||||
"""Fetch user balance (in US cents)."""
|
"""Fetch the user's balance (in US cents)."""
|
||||||
return self.get_user().get('balance')
|
return self.get_user().get('balance')
|
||||||
|
|
||||||
def get_captcha(self, cid):
|
def get_captcha(self, cid):
|
||||||
"""Fetch a CAPTCHA details -- ID, text and correctness flag."""
|
"""Fetch a CAPTCHA details dict -- its ID, text and correctness."""
|
||||||
raise NotImplementedError()
|
raise NotImplemented()
|
||||||
|
|
||||||
def get_text(self, cid):
|
def get_text(self, cid):
|
||||||
"""Fetch a CAPTCHA text."""
|
"""Fetch a CAPTCHA text."""
|
||||||
|
@ -173,7 +112,11 @@ class Client(object):
|
||||||
|
|
||||||
def report(self, cid):
|
def report(self, cid):
|
||||||
"""Report a CAPTCHA as incorrectly solved."""
|
"""Report a CAPTCHA as incorrectly solved."""
|
||||||
raise NotImplementedError()
|
raise NotImplemented()
|
||||||
|
|
||||||
|
def remove(self, cid):
|
||||||
|
"""Remove an unsolved CAPTCHA."""
|
||||||
|
raise NotImplemented()
|
||||||
|
|
||||||
def upload(self, captcha):
|
def upload(self, captcha):
|
||||||
"""Upload a CAPTCHA.
|
"""Upload a CAPTCHA.
|
||||||
|
@ -182,56 +125,32 @@ class Client(object):
|
||||||
dict on success.
|
dict on success.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError()
|
raise NotImplemented()
|
||||||
|
|
||||||
def decode(self, captcha=None, timeout=None, **kwargs):
|
def decode(self, captcha, timeout=DEFAULT_TIMEOUT):
|
||||||
"""
|
"""Try to solve a CAPTCHA.
|
||||||
Try to solve a CAPTCHA.
|
|
||||||
|
|
||||||
See Client.upload() for arguments details.
|
See Client.upload() for arguments details.
|
||||||
|
|
||||||
Uploads a CAPTCHA, polls for its status periodically with arbitrary
|
Uploads a CAPTCHA, polls for its status periodically with arbitrary
|
||||||
timeout (in seconds), returns CAPTCHA details if (correctly) solved.
|
timeout (in seconds), returns CAPTCHA details if (correctly) solved.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if not timeout:
|
|
||||||
if not captcha:
|
|
||||||
timeout = DEFAULT_TOKEN_TIMEOUT
|
|
||||||
else:
|
|
||||||
timeout = DEFAULT_TIMEOUT
|
|
||||||
|
|
||||||
deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
|
deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
|
||||||
uploaded_captcha = self.upload(captcha, **kwargs)
|
c = self.upload(captcha)
|
||||||
if uploaded_captcha:
|
if c:
|
||||||
intvl_idx = 0 # POLL_INTERVAL index
|
while deadline > time.time() and not c.get('text'):
|
||||||
while deadline > time.time() and not uploaded_captcha.get('text'):
|
time.sleep(POLLS_INTERVAL)
|
||||||
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
|
c = self.get_captcha(c['captcha'])
|
||||||
time.sleep(intvl)
|
if c.get('text') and c.get('is_correct'):
|
||||||
pulled = self.get_captcha(uploaded_captcha['captcha'])
|
return c
|
||||||
if pulled['captcha'] == uploaded_captcha['captcha']:
|
|
||||||
uploaded_captcha = pulled
|
|
||||||
if uploaded_captcha.get('text') and \
|
|
||||||
uploaded_captcha.get('is_correct'):
|
|
||||||
return uploaded_captcha
|
|
||||||
|
|
||||||
def _get_poll_interval(self, idx):
|
|
||||||
"""Returns poll interval and next index depending on index provided"""
|
|
||||||
|
|
||||||
if len(POLLS_INTERVAL) > idx:
|
|
||||||
intvl = POLLS_INTERVAL[idx]
|
|
||||||
else:
|
|
||||||
intvl = DFLT_POLL_INTERVAL
|
|
||||||
idx += 1
|
|
||||||
|
|
||||||
return intvl, idx
|
|
||||||
|
|
||||||
|
|
||||||
class HttpClient(Client):
|
class HttpClient(Client):
|
||||||
|
|
||||||
"""Death by Captcha HTTP API client."""
|
"""Death by Captcha HTTP API client."""
|
||||||
|
|
||||||
def __init__(self, *args):
|
def __init__(self, *args):
|
||||||
Client.__init__(self, *args)
|
Client.__init__(self, *args)
|
||||||
self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
|
self.opener = build_opener(HTTPRedirectHandler())
|
||||||
|
|
||||||
def _call(self, cmd, payload=None, headers=None):
|
def _call(self, cmd, payload=None, headers=None):
|
||||||
if headers is None:
|
if headers is None:
|
||||||
|
@ -239,30 +158,22 @@ class HttpClient(Client):
|
||||||
headers['Accept'] = HTTP_RESPONSE_TYPE
|
headers['Accept'] = HTTP_RESPONSE_TYPE
|
||||||
headers['User-Agent'] = API_VERSION
|
headers['User-Agent'] = API_VERSION
|
||||||
if hasattr(payload, 'items'):
|
if hasattr(payload, 'items'):
|
||||||
payload = urllib.urlencode(payload)
|
payload = urlencode(payload)
|
||||||
self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
|
self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
|
||||||
else:
|
|
||||||
self._log('SEND', '%s' % cmd)
|
|
||||||
if payload is not None:
|
if payload is not None:
|
||||||
headers['Content-Length'] = len(payload)
|
headers['Content-Length'] = len(payload)
|
||||||
try:
|
try:
|
||||||
response = self.opener.open(urllib2.Request(
|
response = self.opener.open(Request(
|
||||||
HTTP_BASE_URL + '/' + cmd.strip('/'),
|
HTTP_BASE_URL + '/' + cmd.strip('/'),
|
||||||
data=payload,
|
data=payload,
|
||||||
headers=headers
|
headers=headers
|
||||||
)).read()
|
)).read()
|
||||||
except urllib2.HTTPError, err:
|
except HTTPError as e:
|
||||||
if 403 == err.code:
|
if 403 == e.code:
|
||||||
raise AccessDeniedException('Access denied, please check'
|
raise AccessDeniedException(
|
||||||
' your credentials and/or balance')
|
'Access denied, please check your credentials and/or balance')
|
||||||
elif 400 == err.code or 413 == err.code:
|
elif 400 == e.code or 413 == e.code:
|
||||||
raise ValueError("CAPTCHA was rejected by the service, check"
|
raise ValueError("CAPTCHA was rejected by the service, check if it's a valid image")
|
||||||
" if it's a valid image")
|
|
||||||
elif 503 == err.code:
|
|
||||||
raise OverflowError("CAPTCHA was rejected due to service"
|
|
||||||
" overload, try again later")
|
|
||||||
else:
|
|
||||||
raise err
|
|
||||||
else:
|
else:
|
||||||
self._log('RECV', '%d %s' % (len(response), response))
|
self._log('RECV', '%d %s' % (len(response), response))
|
||||||
try:
|
try:
|
||||||
|
@ -281,53 +192,38 @@ class HttpClient(Client):
|
||||||
return not self._call('captcha/%d/report' % cid,
|
return not self._call('captcha/%d/report' % cid,
|
||||||
self.userpwd.copy()).get('is_correct')
|
self.userpwd.copy()).get('is_correct')
|
||||||
|
|
||||||
def upload(self, captcha=None, **kwargs):
|
def remove(self, cid):
|
||||||
|
return not self._call('captcha/%d/remove' % cid,
|
||||||
|
self.userpwd.copy()).get('captcha')
|
||||||
|
|
||||||
|
def upload(self, captcha):
|
||||||
boundary = binascii.hexlify(os.urandom(16))
|
boundary = binascii.hexlify(os.urandom(16))
|
||||||
banner = kwargs.get('banner', '')
|
data = self.userpwd.copy()
|
||||||
if banner:
|
data['swid'] = SOFTWARE_VENDOR_ID
|
||||||
kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
|
body = '\r\n'.join(('\r\n'.join(('--%s' % boundary,
|
||||||
body = '\r\n'.join(('\r\n'.join((
|
'Content-Disposition: form-data; name="%s"' % k,
|
||||||
'--%s' % boundary,
|
'Content-Type: text/plain',
|
||||||
'Content-Disposition: form-data; name="%s"' % k,
|
'Content-Length: %d' % len(str(v)),
|
||||||
'Content-Type: text/plain',
|
'',
|
||||||
'Content-Length: %d' % len(str(v)),
|
str(v))))
|
||||||
'',
|
for k, v in data.items())
|
||||||
str(v)
|
captcha = self._load_file(captcha)
|
||||||
))) for k, v in self.userpwd.items())
|
body += '\r\n'.join(('',
|
||||||
|
'--%s' % boundary,
|
||||||
body += '\r\n'.join(('\r\n'.join((
|
'Content-Disposition: form-data; name="captchafile"; filename="captcha"',
|
||||||
'--%s' % boundary,
|
'Content-Type: application/octet-stream',
|
||||||
'Content-Disposition: form-data; name="%s"' % k,
|
'Content-Length: %d' % len(captcha),
|
||||||
'Content-Type: text/plain',
|
'',
|
||||||
'Content-Length: %d' % len(str(v)),
|
captcha,
|
||||||
'',
|
'--%s--' % boundary,
|
||||||
str(v)
|
''))
|
||||||
))) for k, v in kwargs.items())
|
|
||||||
|
|
||||||
if captcha:
|
|
||||||
img = _load_image(captcha)
|
|
||||||
body += '\r\n'.join((
|
|
||||||
'',
|
|
||||||
'--%s' % boundary,
|
|
||||||
'Content-Disposition: form-data; name="captchafile"; '
|
|
||||||
'filename="captcha"',
|
|
||||||
'Content-Type: application/octet-stream',
|
|
||||||
'Content-Length: %d' % len(img),
|
|
||||||
'',
|
|
||||||
img,
|
|
||||||
'--%s--' % boundary,
|
|
||||||
''
|
|
||||||
))
|
|
||||||
|
|
||||||
response = self._call('captcha', body, {
|
response = self._call('captcha', body, {
|
||||||
'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
|
'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
|
||||||
}) or {}
|
}) or {}
|
||||||
if response.get('captcha'):
|
if response.get('captcha'):
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
||||||
class SocketClient(Client):
|
class SocketClient(Client):
|
||||||
|
|
||||||
"""Death by Captcha socket API client."""
|
"""Death by Captcha socket API client."""
|
||||||
|
|
||||||
TERMINATOR = '\r\n'
|
TERMINATOR = '\r\n'
|
||||||
|
@ -357,11 +253,12 @@ class SocketClient(Client):
|
||||||
self.socket.settimeout(0)
|
self.socket.settimeout(0)
|
||||||
try:
|
try:
|
||||||
self.socket.connect(host)
|
self.socket.connect(host)
|
||||||
except socket.error, err:
|
except socket.error as e:
|
||||||
if (err.args[0] not in
|
if errno.EINPROGRESS == e[0]:
|
||||||
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
|
pass
|
||||||
|
else:
|
||||||
self.close()
|
self.close()
|
||||||
raise err
|
raise e
|
||||||
return self.socket
|
return self.socket
|
||||||
|
|
||||||
def __del__(self):
|
def __del__(self):
|
||||||
|
@ -372,30 +269,27 @@ class SocketClient(Client):
|
||||||
fds = [sock]
|
fds = [sock]
|
||||||
buf += self.TERMINATOR
|
buf += self.TERMINATOR
|
||||||
response = ''
|
response = ''
|
||||||
intvl_idx = 0
|
|
||||||
while True:
|
while True:
|
||||||
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
|
rd, wr, ex = select.select((not buf and fds) or [],
|
||||||
rds, wrs, exs = select.select((not buf and fds) or [],
|
(buf and fds) or [],
|
||||||
(buf and fds) or [],
|
fds,
|
||||||
fds,
|
POLLS_INTERVAL)
|
||||||
intvl)
|
if ex:
|
||||||
if exs:
|
|
||||||
raise IOError('select() failed')
|
raise IOError('select() failed')
|
||||||
try:
|
try:
|
||||||
if wrs:
|
if wr:
|
||||||
while buf:
|
while buf:
|
||||||
buf = buf[wrs[0].send(buf):]
|
buf = buf[wr[0].send(buf):]
|
||||||
elif rds:
|
elif rd:
|
||||||
while True:
|
while True:
|
||||||
s = rds[0].recv(256)
|
s = rd[0].recv(256)
|
||||||
if not s:
|
if not s:
|
||||||
raise IOError('recv(): connection lost')
|
raise IOError('recv(): connection lost')
|
||||||
else:
|
else:
|
||||||
response += s
|
response += s
|
||||||
except socket.error, err:
|
except socket.error as e:
|
||||||
if (err.args[0] not in
|
if e[0] not in (errno.EAGAIN, errno.EINPROGRESS):
|
||||||
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
|
raise e
|
||||||
raise err
|
|
||||||
if response.endswith(self.TERMINATOR):
|
if response.endswith(self.TERMINATOR):
|
||||||
self._log('RECV', response)
|
self._log('RECV', response)
|
||||||
return response.rstrip(self.TERMINATOR)
|
return response.rstrip(self.TERMINATOR)
|
||||||
|
@ -409,18 +303,16 @@ class SocketClient(Client):
|
||||||
request = json_encode(data)
|
request = json_encode(data)
|
||||||
|
|
||||||
response = None
|
response = None
|
||||||
for _ in range(2):
|
for i in range(2):
|
||||||
if not self.socket and cmd != 'login':
|
|
||||||
self._call('login', self.userpwd.copy())
|
|
||||||
self.socket_lock.acquire()
|
self.socket_lock.acquire()
|
||||||
try:
|
try:
|
||||||
sock = self.connect()
|
sock = self.connect()
|
||||||
response = self._sendrecv(sock, request)
|
response = self._sendrecv(sock, request)
|
||||||
except IOError, err:
|
except IOError as e:
|
||||||
sys.stderr.write(str(err) + "\n")
|
sys.stderr.write(str(e) + "\n")
|
||||||
self.close()
|
self.close()
|
||||||
except socket.error, err:
|
except socket.error as e:
|
||||||
sys.stderr.write(str(err) + "\n")
|
sys.stderr.write(str(e) + "\n")
|
||||||
self.close()
|
self.close()
|
||||||
raise IOError('Connection refused')
|
raise IOError('Connection refused')
|
||||||
else:
|
else:
|
||||||
|
@ -428,89 +320,84 @@ class SocketClient(Client):
|
||||||
finally:
|
finally:
|
||||||
self.socket_lock.release()
|
self.socket_lock.release()
|
||||||
|
|
||||||
if response is None:
|
|
||||||
raise IOError('Connection lost or timed out during API request')
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = json_decode(response)
|
if response is None:
|
||||||
except Exception:
|
raise IOError('Connection lost timed out during API request')
|
||||||
raise RuntimeError('Invalid API response')
|
try:
|
||||||
|
response = json_decode(response)
|
||||||
if not response.get('error'):
|
except Exception:
|
||||||
return response
|
raise RuntimeError('Invalid API response')
|
||||||
|
if 'error' in response:
|
||||||
error = response['error']
|
error = response['error']
|
||||||
if error in ('not-logged-in', 'invalid-credentials'):
|
if 'not-logged-in' == error:
|
||||||
raise AccessDeniedException('Access denied, check your credentials')
|
raise AccessDeniedException('Access denied, check your credentials')
|
||||||
elif 'banned' == error:
|
elif 'banned' == error:
|
||||||
raise AccessDeniedException('Access denied, account is suspended')
|
raise AccessDeniedException('Access denied, account is suspended')
|
||||||
elif 'insufficient-funds' == error:
|
elif 'insufficient-funds' == error:
|
||||||
raise AccessDeniedException(
|
raise AccessDeniedException('CAPTCHA was rejected due to low balance')
|
||||||
'CAPTCHA was rejected due to low balance')
|
elif 'invalid-captcha' == error:
|
||||||
elif 'invalid-captcha' == error:
|
raise ValueError('CAPTCHA is not a valid image')
|
||||||
raise ValueError('CAPTCHA is not a valid image')
|
elif 'service-overload' == error:
|
||||||
elif 'service-overload' == error:
|
raise ValueError(
|
||||||
raise OverflowError(
|
'CAPTCHA was rejected due to service overload, try again later')
|
||||||
'CAPTCHA was rejected due to service overload, try again later')
|
else:
|
||||||
else:
|
raise RuntimeError('API server error occured: %s' % error)
|
||||||
|
except Exception as e:
|
||||||
self.socket_lock.acquire()
|
self.socket_lock.acquire()
|
||||||
self.close()
|
self.close()
|
||||||
self.socket_lock.release()
|
self.socket_lock.release()
|
||||||
raise RuntimeError('API server error occured: %s' % error)
|
raise e
|
||||||
|
else:
|
||||||
|
return response
|
||||||
|
|
||||||
def get_user(self):
|
def get_user(self):
|
||||||
return self._call('user') or {'user': 0}
|
return self._call('user', self.userpwd.copy()) or {'user': 0}
|
||||||
|
|
||||||
def get_captcha(self, cid):
|
def get_captcha(self, cid):
|
||||||
return self._call('captcha', {'captcha': cid}) or {'captcha': 0}
|
return self._call('captcha', {'captcha': cid}) or {'captcha': 0}
|
||||||
|
|
||||||
def upload(self, captcha=None, **kwargs):
|
def upload(self, captcha):
|
||||||
data = {}
|
data = self.userpwd.copy()
|
||||||
if captcha:
|
data['captcha'] = base64.b64encode(self._load_file(captcha))
|
||||||
data['captcha'] = base64.b64encode(_load_image(captcha))
|
|
||||||
if kwargs:
|
|
||||||
banner = kwargs.get('banner', '')
|
|
||||||
if banner:
|
|
||||||
kwargs['banner'] = base64.b64encode(_load_image(banner))
|
|
||||||
data.update(kwargs)
|
|
||||||
response = self._call('upload', data)
|
response = self._call('upload', data)
|
||||||
if response.get('captcha'):
|
if response.get('captcha'):
|
||||||
uploaded_captcha = dict(
|
return dict((k, response.get(k)) for k in ('captcha', 'text', 'is_correct'))
|
||||||
(k, response.get(k))
|
|
||||||
for k in ('captcha', 'text', 'is_correct')
|
|
||||||
)
|
|
||||||
if not uploaded_captcha['text']:
|
|
||||||
uploaded_captcha['text'] = None
|
|
||||||
return uploaded_captcha
|
|
||||||
|
|
||||||
def report(self, cid):
|
def report(self, cid):
|
||||||
return not self._call('report', {'captcha': cid}).get('is_correct')
|
data = self.userpwd.copy()
|
||||||
|
data['captcha'] = cid
|
||||||
|
return not self._call('report', data).get('is_correct')
|
||||||
|
|
||||||
|
def remove(self, cid):
|
||||||
|
data = self.userpwd.copy()
|
||||||
|
data['captcha'] = cid
|
||||||
|
return not self._call('remove', data).get('captcha')
|
||||||
|
|
||||||
if '__main__' == __name__:
|
if '__main__' == __name__:
|
||||||
|
import sys
|
||||||
|
|
||||||
# Put your DBC username & password here:
|
# Put your DBC username & password here:
|
||||||
# client = HttpClient(sys.argv[1], sys.argv[2])
|
#client = HttpClient(sys.argv[1], sys.argv[2])
|
||||||
client = SocketClient(sys.argv[1], sys.argv[2])
|
client = SocketClient(sys.argv[1], sys.argv[2])
|
||||||
client.is_verbose = True
|
client.is_verbose = True
|
||||||
|
|
||||||
print 'Your balance is %s US cents' % client.get_balance()
|
print('Your balance is %s US cents' % client.get_balance())
|
||||||
|
|
||||||
for fn in sys.argv[3:]:
|
for fn in sys.argv[3:]:
|
||||||
try:
|
try:
|
||||||
# Put your CAPTCHA image file name or file-like object, and optional
|
# Put your CAPTCHA image file name or file-like object, and optional
|
||||||
# solving timeout (in seconds) here:
|
# solving timeout (in seconds) here:
|
||||||
captcha = client.decode(fn, DEFAULT_TIMEOUT)
|
captcha = client.decode(fn, DEFAULT_TIMEOUT)
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
|
sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
|
||||||
captcha = None
|
captcha = None
|
||||||
|
|
||||||
if captcha:
|
if captcha:
|
||||||
print 'CAPTCHA %d solved: %s' % \
|
print('CAPTCHA %d solved: %s' % (captcha['captcha'], captcha['text']))
|
||||||
(captcha['captcha'], captcha['text'])
|
|
||||||
|
|
||||||
# Report as incorrectly solved if needed. Make sure the CAPTCHA was
|
# Report as incorrectly solved if needed. Make sure the CAPTCHA was
|
||||||
# in fact incorrectly solved!
|
# in fact incorrectly solved!
|
||||||
# try:
|
try:
|
||||||
# client.report(captcha['captcha'])
|
client.report(captcha['captcha'])
|
||||||
# except Exception, e:
|
except Exception as e:
|
||||||
# sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
|
sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))
|
||||||
|
|
|
@ -40,7 +40,7 @@ import operator
|
||||||
import itertools
|
import itertools
|
||||||
import collections
|
import collections
|
||||||
|
|
||||||
__version__ = '4.3.0'
|
__version__ = '4.4.0'
|
||||||
|
|
||||||
if sys.version >= '3':
|
if sys.version >= '3':
|
||||||
from inspect import getfullargspec
|
from inspect import getfullargspec
|
||||||
|
@ -65,6 +65,12 @@ except AttributeError:
|
||||||
# let's assume there are no coroutine functions in old Python
|
# let's assume there are no coroutine functions in old Python
|
||||||
def iscoroutinefunction(f):
|
def iscoroutinefunction(f):
|
||||||
return False
|
return False
|
||||||
|
try:
|
||||||
|
from inspect import isgeneratorfunction
|
||||||
|
except ImportError:
|
||||||
|
# assume no generator function in old Python versions
|
||||||
|
def isgeneratorfunction(caller):
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(')
|
DEF = re.compile(r'\s*def\s*([_\w][_\w\d]*)\s*\(')
|
||||||
|
@ -173,7 +179,8 @@ class FunctionMaker(object):
|
||||||
# Ensure each generated function has a unique filename for profilers
|
# Ensure each generated function has a unique filename for profilers
|
||||||
# (such as cProfile) that depend on the tuple of (<filename>,
|
# (such as cProfile) that depend on the tuple of (<filename>,
|
||||||
# <definition line>, <function name>) being unique.
|
# <definition line>, <function name>) being unique.
|
||||||
filename = '<decorator-gen-%d>' % (next(self._compile_count),)
|
filename = '<%s:decorator-gen-%d>' % (
|
||||||
|
__file__, next(self._compile_count))
|
||||||
try:
|
try:
|
||||||
code = compile(src, filename, 'single')
|
code = compile(src, filename, 'single')
|
||||||
exec(code, evaldict)
|
exec(code, evaldict)
|
||||||
|
@ -218,6 +225,8 @@ class FunctionMaker(object):
|
||||||
def decorate(func, caller, extras=()):
|
def decorate(func, caller, extras=()):
|
||||||
"""
|
"""
|
||||||
decorate(func, caller) decorates a function using a caller.
|
decorate(func, caller) decorates a function using a caller.
|
||||||
|
If the caller is a generator function, the resulting function
|
||||||
|
will be a generator function.
|
||||||
"""
|
"""
|
||||||
evaldict = dict(_call_=caller, _func_=func)
|
evaldict = dict(_call_=caller, _func_=func)
|
||||||
es = ''
|
es = ''
|
||||||
|
@ -225,9 +234,23 @@ def decorate(func, caller, extras=()):
|
||||||
ex = '_e%d_' % i
|
ex = '_e%d_' % i
|
||||||
evaldict[ex] = extra
|
evaldict[ex] = extra
|
||||||
es += ex + ', '
|
es += ex + ', '
|
||||||
fun = FunctionMaker.create(
|
|
||||||
func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
|
if '3.5' <= sys.version < '3.6':
|
||||||
evaldict, __wrapped__=func)
|
# with Python 3.5 isgeneratorfunction returns True for all coroutines
|
||||||
|
# however we know that it is NOT possible to have a generator
|
||||||
|
# coroutine in python 3.5: PEP525 was not there yet
|
||||||
|
generatorcaller = isgeneratorfunction(
|
||||||
|
caller) and not iscoroutinefunction(caller)
|
||||||
|
else:
|
||||||
|
generatorcaller = isgeneratorfunction(caller)
|
||||||
|
if generatorcaller:
|
||||||
|
fun = FunctionMaker.create(
|
||||||
|
func, "for res in _call_(_func_, %s%%(shortsignature)s):\n"
|
||||||
|
" yield res" % es, evaldict, __wrapped__=func)
|
||||||
|
else:
|
||||||
|
fun = FunctionMaker.create(
|
||||||
|
func, "return _call_(_func_, %s%%(shortsignature)s)" % es,
|
||||||
|
evaldict, __wrapped__=func)
|
||||||
if hasattr(func, '__qualname__'):
|
if hasattr(func, '__qualname__'):
|
||||||
fun.__qualname__ = func.__qualname__
|
fun.__qualname__ = func.__qualname__
|
||||||
return fun
|
return fun
|
||||||
|
@ -261,12 +284,12 @@ def decorator(caller, _func=None):
|
||||||
doc = caller.__call__.__doc__
|
doc = caller.__call__.__doc__
|
||||||
evaldict = dict(_call=caller, _decorate_=decorate)
|
evaldict = dict(_call=caller, _decorate_=decorate)
|
||||||
dec = FunctionMaker.create(
|
dec = FunctionMaker.create(
|
||||||
'%s(%s func)' % (name, defaultargs),
|
'%s(func, %s)' % (name, defaultargs),
|
||||||
'if func is None: return lambda func: _decorate_(func, _call, (%s))\n'
|
'if func is None: return lambda func: _decorate_(func, _call, (%s))\n'
|
||||||
'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs),
|
'return _decorate_(func, _call, (%s))' % (defaultargs, defaultargs),
|
||||||
evaldict, doc=doc, module=caller.__module__, __wrapped__=caller)
|
evaldict, doc=doc, module=caller.__module__, __wrapped__=caller)
|
||||||
if defaults:
|
if defaults:
|
||||||
dec.__defaults__ = defaults + (None,)
|
dec.__defaults__ = (None,) + defaults
|
||||||
return dec
|
return dec
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
__version__ = '0.6.5'
|
__version__ = '0.7.1'
|
||||||
|
|
||||||
from .lock import Lock # noqa
|
from .lock import Lock # noqa
|
||||||
from .lock import NeedRegenerationException # noqa
|
from .lock import NeedRegenerationException # noqa
|
||||||
|
|
|
@ -10,8 +10,9 @@ from ..util import compat
|
||||||
import time
|
import time
|
||||||
import datetime
|
import datetime
|
||||||
from numbers import Number
|
from numbers import Number
|
||||||
from functools import wraps
|
from functools import wraps, partial
|
||||||
import threading
|
import threading
|
||||||
|
from decorator import decorate
|
||||||
|
|
||||||
_backend_loader = PluginLoader("dogpile.cache")
|
_backend_loader = PluginLoader("dogpile.cache")
|
||||||
register_backend = _backend_loader.register
|
register_backend = _backend_loader.register
|
||||||
|
@ -188,7 +189,7 @@ class DefaultInvalidationStrategy(RegionInvalidationStrategy):
|
||||||
|
|
||||||
|
|
||||||
class CacheRegion(object):
|
class CacheRegion(object):
|
||||||
"""A front end to a particular cache backend.
|
r"""A front end to a particular cache backend.
|
||||||
|
|
||||||
:param name: Optional, a string name for the region.
|
:param name: Optional, a string name for the region.
|
||||||
This isn't used internally
|
This isn't used internally
|
||||||
|
@ -484,6 +485,26 @@ class CacheRegion(object):
|
||||||
else:
|
else:
|
||||||
return self._LockWrapper()
|
return self._LockWrapper()
|
||||||
|
|
||||||
|
# cached value
|
||||||
|
_actual_backend = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def actual_backend(self):
|
||||||
|
"""Return the ultimate backend underneath any proxies.
|
||||||
|
|
||||||
|
The backend might be the result of one or more ``proxy.wrap``
|
||||||
|
applications. If so, derive the actual underlying backend.
|
||||||
|
|
||||||
|
.. versionadded:: 0.6.6
|
||||||
|
|
||||||
|
"""
|
||||||
|
if self._actual_backend is None:
|
||||||
|
_backend = self.backend
|
||||||
|
while hasattr(_backend, 'proxied'):
|
||||||
|
_backend = _backend.proxied
|
||||||
|
self._actual_backend = _backend
|
||||||
|
return self._actual_backend
|
||||||
|
|
||||||
def invalidate(self, hard=True):
|
def invalidate(self, hard=True):
|
||||||
"""Invalidate this :class:`.CacheRegion`.
|
"""Invalidate this :class:`.CacheRegion`.
|
||||||
|
|
||||||
|
@ -723,7 +744,8 @@ class CacheRegion(object):
|
||||||
]
|
]
|
||||||
|
|
||||||
def get_or_create(
|
def get_or_create(
|
||||||
self, key, creator, expiration_time=None, should_cache_fn=None):
|
self, key, creator, expiration_time=None, should_cache_fn=None,
|
||||||
|
creator_args=None):
|
||||||
"""Return a cached value based on the given key.
|
"""Return a cached value based on the given key.
|
||||||
|
|
||||||
If the value does not exist or is considered to be expired
|
If the value does not exist or is considered to be expired
|
||||||
|
@ -759,6 +781,11 @@ class CacheRegion(object):
|
||||||
|
|
||||||
:param creator: function which creates a new value.
|
:param creator: function which creates a new value.
|
||||||
|
|
||||||
|
:param creator_args: optional tuple of (args, kwargs) that will be
|
||||||
|
passed to the creator function if present.
|
||||||
|
|
||||||
|
.. versionadded:: 0.7.0
|
||||||
|
|
||||||
:param expiration_time: optional expiration time which will overide
|
:param expiration_time: optional expiration time which will overide
|
||||||
the expiration time already configured on this :class:`.CacheRegion`
|
the expiration time already configured on this :class:`.CacheRegion`
|
||||||
if not None. To set no expiration, use the value -1.
|
if not None. To set no expiration, use the value -1.
|
||||||
|
@ -799,7 +826,7 @@ class CacheRegion(object):
|
||||||
value = self.backend.get(key)
|
value = self.backend.get(key)
|
||||||
if (value is NO_VALUE or value.metadata['v'] != value_version or
|
if (value is NO_VALUE or value.metadata['v'] != value_version or
|
||||||
self.region_invalidator.is_hard_invalidated(
|
self.region_invalidator.is_hard_invalidated(
|
||||||
value.metadata["ct"])):
|
value.metadata["ct"])):
|
||||||
raise NeedRegenerationException()
|
raise NeedRegenerationException()
|
||||||
ct = value.metadata["ct"]
|
ct = value.metadata["ct"]
|
||||||
if self.region_invalidator.is_soft_invalidated(ct):
|
if self.region_invalidator.is_soft_invalidated(ct):
|
||||||
|
@ -808,7 +835,10 @@ class CacheRegion(object):
|
||||||
return value.payload, ct
|
return value.payload, ct
|
||||||
|
|
||||||
def gen_value():
|
def gen_value():
|
||||||
created_value = creator()
|
if creator_args:
|
||||||
|
created_value = creator(*creator_args[0], **creator_args[1])
|
||||||
|
else:
|
||||||
|
created_value = creator()
|
||||||
value = self._value(created_value)
|
value = self._value(created_value)
|
||||||
|
|
||||||
if not should_cache_fn or \
|
if not should_cache_fn or \
|
||||||
|
@ -831,8 +861,13 @@ class CacheRegion(object):
|
||||||
|
|
||||||
if self.async_creation_runner:
|
if self.async_creation_runner:
|
||||||
def async_creator(mutex):
|
def async_creator(mutex):
|
||||||
return self.async_creation_runner(
|
if creator_args:
|
||||||
self, orig_key, creator, mutex)
|
@wraps(creator)
|
||||||
|
def go():
|
||||||
|
return creator(*creator_args[0], **creator_args[1])
|
||||||
|
else:
|
||||||
|
go = creator
|
||||||
|
return self.async_creation_runner(self, orig_key, go, mutex)
|
||||||
else:
|
else:
|
||||||
async_creator = None
|
async_creator = None
|
||||||
|
|
||||||
|
@ -896,7 +931,7 @@ class CacheRegion(object):
|
||||||
|
|
||||||
if (value is NO_VALUE or value.metadata['v'] != value_version or
|
if (value is NO_VALUE or value.metadata['v'] != value_version or
|
||||||
self.region_invalidator.is_hard_invalidated(
|
self.region_invalidator.is_hard_invalidated(
|
||||||
value.metadata['v'])):
|
value.metadata['ct'])):
|
||||||
# dogpile.core understands a 0 here as
|
# dogpile.core understands a 0 here as
|
||||||
# "the value is not available", e.g.
|
# "the value is not available", e.g.
|
||||||
# _has_value() will return False.
|
# _has_value() will return False.
|
||||||
|
@ -1228,26 +1263,31 @@ class CacheRegion(object):
|
||||||
if function_key_generator is None:
|
if function_key_generator is None:
|
||||||
function_key_generator = self.function_key_generator
|
function_key_generator = self.function_key_generator
|
||||||
|
|
||||||
def decorator(fn):
|
def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
|
||||||
|
key = key_generator(*arg, **kw)
|
||||||
|
|
||||||
|
timeout = expiration_time() if expiration_time_is_callable \
|
||||||
|
else expiration_time
|
||||||
|
return self.get_or_create(key, user_func, timeout,
|
||||||
|
should_cache_fn, (arg, kw))
|
||||||
|
|
||||||
|
def cache_decorator(user_func):
|
||||||
if to_str is compat.string_type:
|
if to_str is compat.string_type:
|
||||||
# backwards compatible
|
# backwards compatible
|
||||||
key_generator = function_key_generator(namespace, fn)
|
key_generator = function_key_generator(namespace, user_func)
|
||||||
else:
|
else:
|
||||||
key_generator = function_key_generator(
|
key_generator = function_key_generator(
|
||||||
namespace, fn,
|
namespace, user_func,
|
||||||
to_str=to_str)
|
to_str=to_str)
|
||||||
|
|
||||||
@wraps(fn)
|
def refresh(*arg, **kw):
|
||||||
def decorate(*arg, **kw):
|
"""
|
||||||
|
Like invalidate, but regenerates the value instead
|
||||||
|
"""
|
||||||
key = key_generator(*arg, **kw)
|
key = key_generator(*arg, **kw)
|
||||||
|
value = user_func(*arg, **kw)
|
||||||
@wraps(fn)
|
self.set(key, value)
|
||||||
def creator():
|
return value
|
||||||
return fn(*arg, **kw)
|
|
||||||
timeout = expiration_time() if expiration_time_is_callable \
|
|
||||||
else expiration_time
|
|
||||||
return self.get_or_create(key, creator, timeout,
|
|
||||||
should_cache_fn)
|
|
||||||
|
|
||||||
def invalidate(*arg, **kw):
|
def invalidate(*arg, **kw):
|
||||||
key = key_generator(*arg, **kw)
|
key = key_generator(*arg, **kw)
|
||||||
|
@ -1261,20 +1301,18 @@ class CacheRegion(object):
|
||||||
key = key_generator(*arg, **kw)
|
key = key_generator(*arg, **kw)
|
||||||
return self.get(key)
|
return self.get(key)
|
||||||
|
|
||||||
def refresh(*arg, **kw):
|
user_func.set = set_
|
||||||
key = key_generator(*arg, **kw)
|
user_func.invalidate = invalidate
|
||||||
value = fn(*arg, **kw)
|
user_func.get = get
|
||||||
self.set(key, value)
|
user_func.refresh = refresh
|
||||||
return value
|
user_func.original = user_func
|
||||||
|
|
||||||
decorate.set = set_
|
# Use `decorate` to preserve the signature of :param:`user_func`.
|
||||||
decorate.invalidate = invalidate
|
|
||||||
decorate.refresh = refresh
|
|
||||||
decorate.get = get
|
|
||||||
decorate.original = fn
|
|
||||||
|
|
||||||
return decorate
|
return decorate(user_func, partial(
|
||||||
return decorator
|
get_or_create_for_user_func, key_generator))
|
||||||
|
|
||||||
|
return cache_decorator
|
||||||
|
|
||||||
def cache_multi_on_arguments(
|
def cache_multi_on_arguments(
|
||||||
self, namespace=None, expiration_time=None,
|
self, namespace=None, expiration_time=None,
|
||||||
|
@ -1402,51 +1440,50 @@ class CacheRegion(object):
|
||||||
if function_multi_key_generator is None:
|
if function_multi_key_generator is None:
|
||||||
function_multi_key_generator = self.function_multi_key_generator
|
function_multi_key_generator = self.function_multi_key_generator
|
||||||
|
|
||||||
def decorator(fn):
|
def get_or_create_for_user_func(key_generator, user_func, *arg, **kw):
|
||||||
|
cache_keys = arg
|
||||||
|
keys = key_generator(*arg, **kw)
|
||||||
|
key_lookup = dict(zip(keys, cache_keys))
|
||||||
|
|
||||||
|
@wraps(user_func)
|
||||||
|
def creator(*keys_to_create):
|
||||||
|
return user_func(*[key_lookup[k] for k in keys_to_create])
|
||||||
|
|
||||||
|
timeout = expiration_time() if expiration_time_is_callable \
|
||||||
|
else expiration_time
|
||||||
|
|
||||||
|
if asdict:
|
||||||
|
def dict_create(*keys):
|
||||||
|
d_values = creator(*keys)
|
||||||
|
return [
|
||||||
|
d_values.get(key_lookup[k], NO_VALUE)
|
||||||
|
for k in keys]
|
||||||
|
|
||||||
|
def wrap_cache_fn(value):
|
||||||
|
if value is NO_VALUE:
|
||||||
|
return False
|
||||||
|
elif not should_cache_fn:
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return should_cache_fn(value)
|
||||||
|
|
||||||
|
result = self.get_or_create_multi(
|
||||||
|
keys, dict_create, timeout, wrap_cache_fn)
|
||||||
|
result = dict(
|
||||||
|
(k, v) for k, v in zip(cache_keys, result)
|
||||||
|
if v is not NO_VALUE)
|
||||||
|
else:
|
||||||
|
result = self.get_or_create_multi(
|
||||||
|
keys, creator, timeout,
|
||||||
|
should_cache_fn)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def cache_decorator(user_func):
|
||||||
key_generator = function_multi_key_generator(
|
key_generator = function_multi_key_generator(
|
||||||
namespace, fn,
|
namespace, user_func,
|
||||||
to_str=to_str)
|
to_str=to_str)
|
||||||
|
|
||||||
@wraps(fn)
|
|
||||||
def decorate(*arg, **kw):
|
|
||||||
cache_keys = arg
|
|
||||||
keys = key_generator(*arg, **kw)
|
|
||||||
key_lookup = dict(zip(keys, cache_keys))
|
|
||||||
|
|
||||||
@wraps(fn)
|
|
||||||
def creator(*keys_to_create):
|
|
||||||
return fn(*[key_lookup[k] for k in keys_to_create])
|
|
||||||
|
|
||||||
timeout = expiration_time() if expiration_time_is_callable \
|
|
||||||
else expiration_time
|
|
||||||
|
|
||||||
if asdict:
|
|
||||||
def dict_create(*keys):
|
|
||||||
d_values = creator(*keys)
|
|
||||||
return [
|
|
||||||
d_values.get(key_lookup[k], NO_VALUE)
|
|
||||||
for k in keys]
|
|
||||||
|
|
||||||
def wrap_cache_fn(value):
|
|
||||||
if value is NO_VALUE:
|
|
||||||
return False
|
|
||||||
elif not should_cache_fn:
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
return should_cache_fn(value)
|
|
||||||
|
|
||||||
result = self.get_or_create_multi(
|
|
||||||
keys, dict_create, timeout, wrap_cache_fn)
|
|
||||||
result = dict(
|
|
||||||
(k, v) for k, v in zip(cache_keys, result)
|
|
||||||
if v is not NO_VALUE)
|
|
||||||
else:
|
|
||||||
result = self.get_or_create_multi(
|
|
||||||
keys, creator, timeout,
|
|
||||||
should_cache_fn)
|
|
||||||
|
|
||||||
return result
|
|
||||||
|
|
||||||
def invalidate(*arg):
|
def invalidate(*arg):
|
||||||
keys = key_generator(*arg)
|
keys = key_generator(*arg)
|
||||||
self.delete_multi(keys)
|
self.delete_multi(keys)
|
||||||
|
@ -1466,7 +1503,7 @@ class CacheRegion(object):
|
||||||
|
|
||||||
def refresh(*arg):
|
def refresh(*arg):
|
||||||
keys = key_generator(*arg)
|
keys = key_generator(*arg)
|
||||||
values = fn(*arg)
|
values = user_func(*arg)
|
||||||
if asdict:
|
if asdict:
|
||||||
self.set_multi(
|
self.set_multi(
|
||||||
dict(zip(keys, [values[a] for a in arg]))
|
dict(zip(keys, [values[a] for a in arg]))
|
||||||
|
@ -1478,13 +1515,18 @@ class CacheRegion(object):
|
||||||
)
|
)
|
||||||
return values
|
return values
|
||||||
|
|
||||||
decorate.set = set_
|
user_func.set = set_
|
||||||
decorate.invalidate = invalidate
|
user_func.invalidate = invalidate
|
||||||
decorate.refresh = refresh
|
user_func.refresh = refresh
|
||||||
decorate.get = get
|
user_func.get = get
|
||||||
|
|
||||||
|
# Use `decorate` to preserve the signature of :param:`user_func`.
|
||||||
|
|
||||||
|
return decorate(user_func, partial(get_or_create_for_user_func, key_generator))
|
||||||
|
|
||||||
|
return cache_decorator
|
||||||
|
|
||||||
|
|
||||||
return decorate
|
|
||||||
return decorator
|
|
||||||
|
|
||||||
|
|
||||||
def make_region(*arg, **kw):
|
def make_region(*arg, **kw):
|
||||||
|
|
|
@ -1,5 +1,4 @@
|
||||||
from hashlib import sha1
|
from hashlib import sha1
|
||||||
import inspect
|
|
||||||
from ..util import compat
|
from ..util import compat
|
||||||
from ..util import langhelpers
|
from ..util import langhelpers
|
||||||
|
|
||||||
|
@ -28,7 +27,7 @@ def function_key_generator(namespace, fn, to_str=compat.string_type):
|
||||||
else:
|
else:
|
||||||
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
||||||
|
|
||||||
args = inspect.getargspec(fn)
|
args = compat.inspect_getargspec(fn)
|
||||||
has_self = args[0] and args[0][0] in ('self', 'cls')
|
has_self = args[0] and args[0][0] in ('self', 'cls')
|
||||||
|
|
||||||
def generate_key(*args, **kw):
|
def generate_key(*args, **kw):
|
||||||
|
@ -50,7 +49,7 @@ def function_multi_key_generator(namespace, fn, to_str=compat.string_type):
|
||||||
else:
|
else:
|
||||||
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
||||||
|
|
||||||
args = inspect.getargspec(fn)
|
args = compat.inspect_getargspec(fn)
|
||||||
has_self = args[0] and args[0][0] in ('self', 'cls')
|
has_self = args[0] and args[0][0] in ('self', 'cls')
|
||||||
|
|
||||||
def generate_keys(*args, **kw):
|
def generate_keys(*args, **kw):
|
||||||
|
@ -88,7 +87,7 @@ def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
|
||||||
else:
|
else:
|
||||||
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
|
||||||
|
|
||||||
argspec = inspect.getargspec(fn)
|
argspec = compat.inspect_getargspec(fn)
|
||||||
default_list = list(argspec.defaults or [])
|
default_list = list(argspec.defaults or [])
|
||||||
# Reverse the list, as we want to compare the argspec by negative index,
|
# Reverse the list, as we want to compare the argspec by negative index,
|
||||||
# meaning default_list[0] should be args[-1], which works well with
|
# meaning default_list[0] should be args[-1], which works well with
|
||||||
|
|
|
@ -69,11 +69,10 @@ class Lock(object):
|
||||||
"""Return true if the expiration time is reached, or no
|
"""Return true if the expiration time is reached, or no
|
||||||
value is available."""
|
value is available."""
|
||||||
|
|
||||||
return not self._has_value(createdtime) or \
|
return not self._has_value(createdtime) or (
|
||||||
(
|
self.expiretime is not None and
|
||||||
self.expiretime is not None and
|
time.time() - createdtime > self.expiretime
|
||||||
time.time() - createdtime > self.expiretime
|
)
|
||||||
)
|
|
||||||
|
|
||||||
def _has_value(self, createdtime):
|
def _has_value(self, createdtime):
|
||||||
"""Return true if the creation function has proceeded
|
"""Return true if the creation function has proceeded
|
||||||
|
@ -91,68 +90,100 @@ class Lock(object):
|
||||||
value = NOT_REGENERATED
|
value = NOT_REGENERATED
|
||||||
createdtime = -1
|
createdtime = -1
|
||||||
|
|
||||||
generated = self._enter_create(createdtime)
|
generated = self._enter_create(value, createdtime)
|
||||||
|
|
||||||
if generated is not NOT_REGENERATED:
|
if generated is not NOT_REGENERATED:
|
||||||
generated, createdtime = generated
|
generated, createdtime = generated
|
||||||
return generated
|
return generated
|
||||||
elif value is NOT_REGENERATED:
|
elif value is NOT_REGENERATED:
|
||||||
|
# we called upon the creator, and it said that it
|
||||||
|
# didn't regenerate. this typically means another
|
||||||
|
# thread is running the creation function, and that the
|
||||||
|
# cache should still have a value. However,
|
||||||
|
# we don't have a value at all, which is unusual since we just
|
||||||
|
# checked for it, so check again (TODO: is this a real codepath?)
|
||||||
try:
|
try:
|
||||||
value, createdtime = value_fn()
|
value, createdtime = value_fn()
|
||||||
return value
|
return value
|
||||||
except NeedRegenerationException:
|
except NeedRegenerationException:
|
||||||
raise Exception("Generation function should "
|
raise Exception(
|
||||||
"have just been called by a concurrent "
|
"Generation function should "
|
||||||
"thread.")
|
"have just been called by a concurrent "
|
||||||
|
"thread.")
|
||||||
else:
|
else:
|
||||||
return value
|
return value
|
||||||
|
|
||||||
def _enter_create(self, createdtime):
|
def _enter_create(self, value, createdtime):
|
||||||
|
|
||||||
if not self._is_expired(createdtime):
|
if not self._is_expired(createdtime):
|
||||||
return NOT_REGENERATED
|
return NOT_REGENERATED
|
||||||
|
|
||||||
async = False
|
_async = False
|
||||||
|
|
||||||
if self._has_value(createdtime):
|
if self._has_value(createdtime):
|
||||||
|
has_value = True
|
||||||
if not self.mutex.acquire(False):
|
if not self.mutex.acquire(False):
|
||||||
log.debug("creation function in progress "
|
log.debug(
|
||||||
"elsewhere, returning")
|
"creation function in progress "
|
||||||
|
"elsewhere, returning")
|
||||||
return NOT_REGENERATED
|
return NOT_REGENERATED
|
||||||
else:
|
else:
|
||||||
|
has_value = False
|
||||||
log.debug("no value, waiting for create lock")
|
log.debug("no value, waiting for create lock")
|
||||||
self.mutex.acquire()
|
self.mutex.acquire()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
log.debug("value creation lock %r acquired" % self.mutex)
|
log.debug("value creation lock %r acquired" % self.mutex)
|
||||||
|
|
||||||
# see if someone created the value already
|
if not has_value:
|
||||||
try:
|
# we entered without a value, or at least with "creationtime ==
|
||||||
value, createdtime = self.value_and_created_fn()
|
# 0". Run the "getter" function again, to see if another
|
||||||
except NeedRegenerationException:
|
# thread has already generated the value while we waited on the
|
||||||
pass
|
# mutex, or if the caller is otherwise telling us there is a
|
||||||
else:
|
# value already which allows us to use async regeneration. (the
|
||||||
if not self._is_expired(createdtime):
|
# latter is used by the multi-key routine).
|
||||||
log.debug("value already present")
|
try:
|
||||||
return value, createdtime
|
value, createdtime = self.value_and_created_fn()
|
||||||
elif self.async_creator:
|
except NeedRegenerationException:
|
||||||
log.debug("Passing creation lock to async runner")
|
# nope, nobody created the value, we're it.
|
||||||
self.async_creator(self.mutex)
|
# we must create it right now
|
||||||
async = True
|
pass
|
||||||
return value, createdtime
|
else:
|
||||||
|
has_value = True
|
||||||
|
# caller is telling us there is a value and that we can
|
||||||
|
# use async creation if it is expired.
|
||||||
|
if not self._is_expired(createdtime):
|
||||||
|
# it's not expired, return it
|
||||||
|
log.debug("Concurrent thread created the value")
|
||||||
|
return value, createdtime
|
||||||
|
|
||||||
log.debug("Calling creation function")
|
# otherwise it's expired, call creator again
|
||||||
created = self.creator()
|
|
||||||
return created
|
if has_value and self.async_creator:
|
||||||
|
# we have a value we can return, safe to use async_creator
|
||||||
|
log.debug("Passing creation lock to async runner")
|
||||||
|
|
||||||
|
# so...run it!
|
||||||
|
self.async_creator(self.mutex)
|
||||||
|
_async = True
|
||||||
|
|
||||||
|
# and return the expired value for now
|
||||||
|
return value, createdtime
|
||||||
|
|
||||||
|
# it's expired, and it's our turn to create it synchronously, *or*,
|
||||||
|
# there's no value at all, and we have to create it synchronously
|
||||||
|
log.debug(
|
||||||
|
"Calling creation function for %s value",
|
||||||
|
"not-yet-present" if not has_value else
|
||||||
|
"previously expired"
|
||||||
|
)
|
||||||
|
return self.creator()
|
||||||
finally:
|
finally:
|
||||||
if not async:
|
if not _async:
|
||||||
self.mutex.release()
|
self.mutex.release()
|
||||||
log.debug("Released creation lock")
|
log.debug("Released creation lock")
|
||||||
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
return self._enter()
|
return self._enter()
|
||||||
|
|
||||||
def __exit__(self, type, value, traceback):
|
def __exit__(self, type, value, traceback):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
|
@ -51,11 +51,33 @@ else:
|
||||||
import thread # noqa
|
import thread # noqa
|
||||||
|
|
||||||
|
|
||||||
|
if py3k:
|
||||||
|
import collections
|
||||||
|
ArgSpec = collections.namedtuple(
|
||||||
|
"ArgSpec",
|
||||||
|
["args", "varargs", "keywords", "defaults"])
|
||||||
|
|
||||||
|
from inspect import getfullargspec as inspect_getfullargspec
|
||||||
|
|
||||||
|
def inspect_getargspec(func):
|
||||||
|
return ArgSpec(
|
||||||
|
*inspect_getfullargspec(func)[0:4]
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
from inspect import getargspec as inspect_getargspec # noqa
|
||||||
|
|
||||||
if py3k or jython:
|
if py3k or jython:
|
||||||
import pickle
|
import pickle
|
||||||
else:
|
else:
|
||||||
import cPickle as pickle # noqa
|
import cPickle as pickle # noqa
|
||||||
|
|
||||||
|
if py3k:
|
||||||
|
def read_config_file(config, fileobj):
|
||||||
|
return config.read_file(fileobj)
|
||||||
|
else:
|
||||||
|
def read_config_file(config, fileobj):
|
||||||
|
return config.readfp(fileobj)
|
||||||
|
|
||||||
|
|
||||||
def timedelta_total_seconds(td):
|
def timedelta_total_seconds(td):
|
||||||
if py27:
|
if py27:
|
||||||
|
|
|
@ -50,7 +50,7 @@ class NameRegistry(object):
|
||||||
self.creator = creator
|
self.creator = creator
|
||||||
|
|
||||||
def get(self, identifier, *args, **kw):
|
def get(self, identifier, *args, **kw):
|
||||||
"""Get and possibly create the value.
|
r"""Get and possibly create the value.
|
||||||
|
|
||||||
:param identifier: Hash key for the value.
|
:param identifier: Hash key for the value.
|
||||||
If the creation function is called, this identifier
|
If the creation function is called, this identifier
|
||||||
|
@ -75,10 +75,12 @@ class NameRegistry(object):
|
||||||
if identifier in self._values:
|
if identifier in self._values:
|
||||||
return self._values[identifier]
|
return self._values[identifier]
|
||||||
else:
|
else:
|
||||||
self._values[identifier] = value = self.creator(identifier, *args, **kw)
|
self._values[identifier] = value = self.creator(
|
||||||
|
identifier, *args, **kw)
|
||||||
return value
|
return value
|
||||||
except KeyError:
|
except KeyError:
|
||||||
self._values[identifier] = value = self.creator(identifier, *args, **kw)
|
self._values[identifier] = value = self.creator(
|
||||||
|
identifier, *args, **kw)
|
||||||
return value
|
return value
|
||||||
finally:
|
finally:
|
||||||
self._mutex.release()
|
self._mutex.release()
|
||||||
|
|
|
@ -23,7 +23,7 @@ class ReadWriteMutex(object):
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
# counts how many asynchronous methods are executing
|
# counts how many asynchronous methods are executing
|
||||||
self.async = 0
|
self.async_ = 0
|
||||||
|
|
||||||
# pointer to thread that is the current sync operation
|
# pointer to thread that is the current sync operation
|
||||||
self.current_sync_operation = None
|
self.current_sync_operation = None
|
||||||
|
@ -31,7 +31,7 @@ class ReadWriteMutex(object):
|
||||||
# condition object to lock on
|
# condition object to lock on
|
||||||
self.condition = threading.Condition(threading.Lock())
|
self.condition = threading.Condition(threading.Lock())
|
||||||
|
|
||||||
def acquire_read_lock(self, wait = True):
|
def acquire_read_lock(self, wait=True):
|
||||||
"""Acquire the 'read' lock."""
|
"""Acquire the 'read' lock."""
|
||||||
self.condition.acquire()
|
self.condition.acquire()
|
||||||
try:
|
try:
|
||||||
|
@ -45,7 +45,7 @@ class ReadWriteMutex(object):
|
||||||
if self.current_sync_operation is not None:
|
if self.current_sync_operation is not None:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
self.async += 1
|
self.async_ += 1
|
||||||
log.debug("%s acquired read lock", self)
|
log.debug("%s acquired read lock", self)
|
||||||
finally:
|
finally:
|
||||||
self.condition.release()
|
self.condition.release()
|
||||||
|
@ -57,23 +57,23 @@ class ReadWriteMutex(object):
|
||||||
"""Release the 'read' lock."""
|
"""Release the 'read' lock."""
|
||||||
self.condition.acquire()
|
self.condition.acquire()
|
||||||
try:
|
try:
|
||||||
self.async -= 1
|
self.async_ -= 1
|
||||||
|
|
||||||
# check if we are the last asynchronous reader thread
|
# check if we are the last asynchronous reader thread
|
||||||
# out the door.
|
# out the door.
|
||||||
if self.async == 0:
|
if self.async_ == 0:
|
||||||
# yes. so if a sync operation is waiting, notifyAll to wake
|
# yes. so if a sync operation is waiting, notifyAll to wake
|
||||||
# it up
|
# it up
|
||||||
if self.current_sync_operation is not None:
|
if self.current_sync_operation is not None:
|
||||||
self.condition.notifyAll()
|
self.condition.notifyAll()
|
||||||
elif self.async < 0:
|
elif self.async_ < 0:
|
||||||
raise LockError("Synchronizer error - too many "
|
raise LockError("Synchronizer error - too many "
|
||||||
"release_read_locks called")
|
"release_read_locks called")
|
||||||
log.debug("%s released read lock", self)
|
log.debug("%s released read lock", self)
|
||||||
finally:
|
finally:
|
||||||
self.condition.release()
|
self.condition.release()
|
||||||
|
|
||||||
def acquire_write_lock(self, wait = True):
|
def acquire_write_lock(self, wait=True):
|
||||||
"""Acquire the 'write' lock."""
|
"""Acquire the 'write' lock."""
|
||||||
self.condition.acquire()
|
self.condition.acquire()
|
||||||
try:
|
try:
|
||||||
|
@ -96,7 +96,7 @@ class ReadWriteMutex(object):
|
||||||
self.current_sync_operation = threading.currentThread()
|
self.current_sync_operation = threading.currentThread()
|
||||||
|
|
||||||
# now wait again for asyncs to finish
|
# now wait again for asyncs to finish
|
||||||
if self.async > 0:
|
if self.async_ > 0:
|
||||||
if wait:
|
if wait:
|
||||||
# wait
|
# wait
|
||||||
self.condition.wait()
|
self.condition.wait()
|
||||||
|
|
|
@ -6,8 +6,16 @@
|
||||||
# s/class \(\w\+\):/class \1(object):/
|
# s/class \(\w\+\):/class \1(object):/
|
||||||
|
|
||||||
# Use iterator versions of map and range:
|
# Use iterator versions of map and range:
|
||||||
from itertools import imap as map
|
try:
|
||||||
range = xrange
|
from itertools import imap as map
|
||||||
|
except ImportError:
|
||||||
|
imap = map
|
||||||
|
|
||||||
|
try:
|
||||||
|
import xrange
|
||||||
|
range = xrange
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
# Except that xrange only supports machine integers, not longs, so...
|
# Except that xrange only supports machine integers, not longs, so...
|
||||||
def long_range(start, end):
|
def long_range(start, end):
|
||||||
|
|
|
@ -23,12 +23,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||||
THE SOFTWARE.
|
THE SOFTWARE.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Bazarr patch to use custom ConfigParser2:
|
try:
|
||||||
from ConfigParser2 import ConfigParser as configparser, NoOptionError, NoSectionError
|
from backports.configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
|
||||||
#try:
|
except ImportError:
|
||||||
# from configparser2 import ConfigParser as configparser, NoOptionError, NoSectionError
|
from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
|
||||||
#except ImportError:
|
|
||||||
# from ConfigParser import SafeConfigParser as configparser, NoOptionError, NoSectionError
|
|
||||||
|
|
||||||
|
|
||||||
class simpleconfigparser(configparser):
|
class simpleconfigparser(configparser):
|
||||||
|
|
65
libs/six.py
65
libs/six.py
|
@ -1,4 +1,4 @@
|
||||||
# Copyright (c) 2010-2017 Benjamin Peterson
|
# Copyright (c) 2010-2018 Benjamin Peterson
|
||||||
#
|
#
|
||||||
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
# of this software and associated documentation files (the "Software"), to deal
|
# of this software and associated documentation files (the "Software"), to deal
|
||||||
|
@ -29,7 +29,7 @@ import sys
|
||||||
import types
|
import types
|
||||||
|
|
||||||
__author__ = "Benjamin Peterson <benjamin@python.org>"
|
__author__ = "Benjamin Peterson <benjamin@python.org>"
|
||||||
__version__ = "1.11.0"
|
__version__ = "1.12.0"
|
||||||
|
|
||||||
|
|
||||||
# Useful for very coarse version differentiation.
|
# Useful for very coarse version differentiation.
|
||||||
|
@ -844,10 +844,71 @@ def add_metaclass(metaclass):
|
||||||
orig_vars.pop(slots_var)
|
orig_vars.pop(slots_var)
|
||||||
orig_vars.pop('__dict__', None)
|
orig_vars.pop('__dict__', None)
|
||||||
orig_vars.pop('__weakref__', None)
|
orig_vars.pop('__weakref__', None)
|
||||||
|
if hasattr(cls, '__qualname__'):
|
||||||
|
orig_vars['__qualname__'] = cls.__qualname__
|
||||||
return metaclass(cls.__name__, cls.__bases__, orig_vars)
|
return metaclass(cls.__name__, cls.__bases__, orig_vars)
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_binary(s, encoding='utf-8', errors='strict'):
|
||||||
|
"""Coerce **s** to six.binary_type.
|
||||||
|
|
||||||
|
For Python 2:
|
||||||
|
- `unicode` -> encoded to `str`
|
||||||
|
- `str` -> `str`
|
||||||
|
|
||||||
|
For Python 3:
|
||||||
|
- `str` -> encoded to `bytes`
|
||||||
|
- `bytes` -> `bytes`
|
||||||
|
"""
|
||||||
|
if isinstance(s, text_type):
|
||||||
|
return s.encode(encoding, errors)
|
||||||
|
elif isinstance(s, binary_type):
|
||||||
|
return s
|
||||||
|
else:
|
||||||
|
raise TypeError("not expecting type '%s'" % type(s))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_str(s, encoding='utf-8', errors='strict'):
|
||||||
|
"""Coerce *s* to `str`.
|
||||||
|
|
||||||
|
For Python 2:
|
||||||
|
- `unicode` -> encoded to `str`
|
||||||
|
- `str` -> `str`
|
||||||
|
|
||||||
|
For Python 3:
|
||||||
|
- `str` -> `str`
|
||||||
|
- `bytes` -> decoded to `str`
|
||||||
|
"""
|
||||||
|
if not isinstance(s, (text_type, binary_type)):
|
||||||
|
raise TypeError("not expecting type '%s'" % type(s))
|
||||||
|
if PY2 and isinstance(s, text_type):
|
||||||
|
s = s.encode(encoding, errors)
|
||||||
|
elif PY3 and isinstance(s, binary_type):
|
||||||
|
s = s.decode(encoding, errors)
|
||||||
|
return s
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_text(s, encoding='utf-8', errors='strict'):
|
||||||
|
"""Coerce *s* to six.text_type.
|
||||||
|
|
||||||
|
For Python 2:
|
||||||
|
- `unicode` -> `unicode`
|
||||||
|
- `str` -> `unicode`
|
||||||
|
|
||||||
|
For Python 3:
|
||||||
|
- `str` -> `str`
|
||||||
|
- `bytes` -> decoded to `str`
|
||||||
|
"""
|
||||||
|
if isinstance(s, binary_type):
|
||||||
|
return s.decode(encoding, errors)
|
||||||
|
elif isinstance(s, text_type):
|
||||||
|
return s
|
||||||
|
else:
|
||||||
|
raise TypeError("not expecting type '%s'" % type(s))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def python_2_unicode_compatible(klass):
|
def python_2_unicode_compatible(klass):
|
||||||
"""
|
"""
|
||||||
A decorator that defines __unicode__ and __str__ methods under Python 2.
|
A decorator that defines __unicode__ and __str__ methods under Python 2.
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
__title__ = 'subliminal'
|
__title__ = 'subliminal'
|
||||||
__version__ = '2.1.0.dev'
|
__version__ = '2.0.5'
|
||||||
__short_version__ = '.'.join(__version__.split('.')[:2])
|
__short_version__ = '.'.join(__version__.split('.')[:2])
|
||||||
__author__ = 'Antoine Bertin'
|
__author__ = 'Antoine Bertin'
|
||||||
__license__ = 'MIT'
|
__license__ = 'MIT'
|
||||||
|
|
|
@ -219,12 +219,13 @@ config_file = 'config.ini'
|
||||||
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
|
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
|
||||||
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
|
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
|
||||||
help='OpenSubtitles configuration.')
|
help='OpenSubtitles configuration.')
|
||||||
|
@click.option('--subscenter', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='SubsCenter configuration.')
|
||||||
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
|
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
|
||||||
show_default=True, expose_value=True, help='Path to the cache directory.')
|
show_default=True, expose_value=True, help='Path to the cache directory.')
|
||||||
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
|
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
|
||||||
@click.version_option(__version__)
|
@click.version_option(__version__)
|
||||||
@click.pass_context
|
@click.pass_context
|
||||||
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
|
def subliminal(ctx, addic7ed, legendastv, opensubtitles, subscenter, cache_dir, debug):
|
||||||
"""Subtitles, faster than your thoughts."""
|
"""Subtitles, faster than your thoughts."""
|
||||||
# create cache directory
|
# create cache directory
|
||||||
try:
|
try:
|
||||||
|
@ -252,6 +253,8 @@ def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
|
||||||
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
|
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
|
||||||
if opensubtitles:
|
if opensubtitles:
|
||||||
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
|
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
|
||||||
|
if subscenter:
|
||||||
|
ctx.obj['provider_configs']['subscenter'] = {'username': subscenter[0], 'password': subscenter[1]}
|
||||||
|
|
||||||
|
|
||||||
@subliminal.command()
|
@subliminal.command()
|
||||||
|
|
|
@ -1,38 +1,19 @@
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
import platform
|
|
||||||
is_windows_special_path = False
|
|
||||||
|
|
||||||
if platform.system() == "Windows":
|
|
||||||
try:
|
|
||||||
__file__.decode("ascii")
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
is_windows_special_path = True
|
|
||||||
|
|
||||||
if not is_windows_special_path:
|
|
||||||
from concurrent.futures import ThreadPoolExecutor
|
|
||||||
else:
|
|
||||||
ThreadPoolExecutor = object
|
|
||||||
|
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import io
|
import io
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import operator
|
import operator
|
||||||
import os
|
import os.path
|
||||||
import socket
|
import socket
|
||||||
|
|
||||||
from babelfish import Language, LanguageReverseError
|
from babelfish import Language, LanguageReverseError
|
||||||
from guessit import guessit
|
from guessit import guessit
|
||||||
from six.moves.xmlrpc_client import ProtocolError
|
from rarfile import NotRarFile, RarCannotExec, RarFile
|
||||||
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
|
|
||||||
from zipfile import BadZipfile
|
|
||||||
from ssl import SSLError
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from .exceptions import ServiceUnavailable
|
|
||||||
from .extensions import provider_manager, refiner_manager
|
from .extensions import provider_manager, refiner_manager
|
||||||
from .score import compute_score as default_compute_score
|
from .score import compute_score as default_compute_score
|
||||||
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
|
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
|
||||||
|
@ -98,18 +79,6 @@ class ProviderPool(object):
|
||||||
self.initialized_providers[name].terminate()
|
self.initialized_providers[name].terminate()
|
||||||
except (requests.Timeout, socket.timeout):
|
except (requests.Timeout, socket.timeout):
|
||||||
logger.error('Provider %r timed out, improperly terminated', name)
|
logger.error('Provider %r timed out, improperly terminated', name)
|
||||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
|
||||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
if e.response.status_code in range(500, 600):
|
|
||||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
|
|
||||||
except SSLError as e:
|
|
||||||
if e.args[0] == 'The read operation timed out':
|
|
||||||
logger.error('Provider %r unavailable, improperly terminated', name)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
|
|
||||||
except:
|
except:
|
||||||
logger.exception('Provider %r terminated unexpectedly', name)
|
logger.exception('Provider %r terminated unexpectedly', name)
|
||||||
|
|
||||||
|
@ -149,18 +118,6 @@ class ProviderPool(object):
|
||||||
return self[provider].list_subtitles(video, provider_languages)
|
return self[provider].list_subtitles(video, provider_languages)
|
||||||
except (requests.Timeout, socket.timeout):
|
except (requests.Timeout, socket.timeout):
|
||||||
logger.error('Provider %r timed out', provider)
|
logger.error('Provider %r timed out', provider)
|
||||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
|
||||||
logger.error('Provider %r unavailable', provider)
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
if e.response.status_code in range(500, 600):
|
|
||||||
logger.error('Provider %r unavailable', provider)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r http error %r', provider, e.response.status_code)
|
|
||||||
except SSLError as e:
|
|
||||||
if e.args[0] == 'The read operation timed out':
|
|
||||||
logger.error('Provider %r unavailable', provider)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r SSL error %r', provider, e.args[0])
|
|
||||||
except:
|
except:
|
||||||
logger.exception('Unexpected error in provider %r', provider)
|
logger.exception('Unexpected error in provider %r', provider)
|
||||||
|
|
||||||
|
@ -216,28 +173,6 @@ class ProviderPool(object):
|
||||||
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
|
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
self.discarded_providers.add(subtitle.provider_name)
|
||||||
return False
|
return False
|
||||||
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
|
|
||||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
|
||||||
return False
|
|
||||||
except requests.exceptions.HTTPError as e:
|
|
||||||
if e.response.status_code in range(500, 600):
|
|
||||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
|
|
||||||
e.response.status_code)
|
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
|
||||||
return False
|
|
||||||
except SSLError as e:
|
|
||||||
if e.args[0] == 'The read operation timed out':
|
|
||||||
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
|
|
||||||
else:
|
|
||||||
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
|
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
|
||||||
return False
|
|
||||||
except (BadRarFile, BadZipfile):
|
|
||||||
logger.error('Bad archive for %r', subtitle)
|
|
||||||
return False
|
|
||||||
except:
|
except:
|
||||||
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
|
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
self.discarded_providers.add(subtitle.provider_name)
|
||||||
|
@ -557,15 +492,9 @@ def scan_videos(path, age=None, archives=True):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# skip old files
|
# skip old files
|
||||||
try:
|
if age and datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(filepath)) > age:
|
||||||
file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath))
|
logger.debug('Skipping old file %r in %r', filename, dirpath)
|
||||||
except ValueError:
|
|
||||||
logger.warning('Could not get age of file %r in %r', filename, dirpath)
|
|
||||||
continue
|
continue
|
||||||
else:
|
|
||||||
if age and datetime.utcnow() - file_age > age:
|
|
||||||
logger.debug('Skipping old file %r in %r', filename, dirpath)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# scan
|
# scan
|
||||||
if filename.endswith(VIDEO_EXTENSIONS): # video
|
if filename.endswith(VIDEO_EXTENSIONS): # video
|
||||||
|
@ -612,8 +541,7 @@ def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
|
||||||
try:
|
try:
|
||||||
refiner_manager[refiner].plugin(video, **kwargs)
|
refiner_manager[refiner].plugin(video, **kwargs)
|
||||||
except:
|
except:
|
||||||
logger.error('Failed to refine video %r', video.name)
|
logger.exception('Failed to refine video')
|
||||||
logger.debug('Refiner exception:', exc_info=True)
|
|
||||||
|
|
||||||
|
|
||||||
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
|
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
|
||||||
|
|
|
@ -19,8 +19,8 @@ class AuthenticationError(ProviderError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ServiceUnavailable(ProviderError):
|
class TooManyRequests(ProviderError):
|
||||||
"""Exception raised when status is '503 Service Unavailable'."""
|
"""Exception raised by providers when too many requests are made."""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -29,9 +29,9 @@ class RegistrableExtensionManager(ExtensionManager):
|
||||||
|
|
||||||
super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)
|
super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)
|
||||||
|
|
||||||
def list_entry_points(self):
|
def _find_entry_points(self, namespace):
|
||||||
# copy of default extensions
|
# copy of default extensions
|
||||||
eps = list(super(RegistrableExtensionManager, self).list_entry_points())
|
eps = list(super(RegistrableExtensionManager, self)._find_entry_points(namespace))
|
||||||
|
|
||||||
# internal extensions
|
# internal extensions
|
||||||
for iep in self.internal_extensions:
|
for iep in self.internal_extensions:
|
||||||
|
@ -93,6 +93,7 @@ provider_manager = RegistrableExtensionManager('subliminal.providers', [
|
||||||
'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
|
'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
|
||||||
'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
|
'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
|
||||||
'shooter = subliminal.providers.shooter:ShooterProvider',
|
'shooter = subliminal.providers.shooter:ShooterProvider',
|
||||||
|
'subscenter = subliminal.providers.subscenter:SubsCenterProvider',
|
||||||
'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
|
'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
|
||||||
'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
|
'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
|
||||||
])
|
])
|
||||||
|
|
|
@ -68,9 +68,6 @@ class Provider(object):
|
||||||
#: Required hash, if any
|
#: Required hash, if any
|
||||||
required_hash = None
|
required_hash = None
|
||||||
|
|
||||||
#: Subtitle class to use
|
|
||||||
subtitle_class = None
|
|
||||||
|
|
||||||
def __enter__(self):
|
def __enter__(self):
|
||||||
self.initialize()
|
self.initialize()
|
||||||
return self
|
return self
|
||||||
|
|
|
@ -9,7 +9,7 @@ from requests import Session
|
||||||
from . import ParserBeautifulSoup, Provider
|
from . import ParserBeautifulSoup, Provider
|
||||||
from .. import __short_version__
|
from .. import __short_version__
|
||||||
from ..cache import SHOW_EXPIRATION_TIME, region
|
from ..cache import SHOW_EXPIRATION_TIME, region
|
||||||
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
|
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, TooManyRequests
|
||||||
from ..score import get_equivalent_release_groups
|
from ..score import get_equivalent_release_groups
|
||||||
from ..subtitle import Subtitle, fix_line_ending, guess_matches
|
from ..subtitle import Subtitle, fix_line_ending, guess_matches
|
||||||
from ..utils import sanitize, sanitize_release_group
|
from ..utils import sanitize, sanitize_release_group
|
||||||
|
@ -19,11 +19,8 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
|
language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
|
||||||
|
|
||||||
# Series cell matching regex
|
|
||||||
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)
|
|
||||||
|
|
||||||
#: Series header parsing regex
|
#: Series header parsing regex
|
||||||
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$')
|
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),&!?-]+?)(?: \((?P<year>\d{4})\))?$')
|
||||||
|
|
||||||
|
|
||||||
class Addic7edSubtitle(Subtitle):
|
class Addic7edSubtitle(Subtitle):
|
||||||
|
@ -32,7 +29,7 @@ class Addic7edSubtitle(Subtitle):
|
||||||
|
|
||||||
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
|
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
|
||||||
download_link):
|
download_link):
|
||||||
super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
|
super(Addic7edSubtitle, self).__init__(language, hearing_impaired, page_link)
|
||||||
self.series = series
|
self.series = series
|
||||||
self.season = season
|
self.season = season
|
||||||
self.episode = episode
|
self.episode = episode
|
||||||
|
@ -48,9 +45,8 @@ class Addic7edSubtitle(Subtitle):
|
||||||
def get_matches(self, video):
|
def get_matches(self, video):
|
||||||
matches = set()
|
matches = set()
|
||||||
|
|
||||||
# series name
|
# series
|
||||||
if video.series and sanitize(self.series) in (
|
if video.series and sanitize(self.series) == sanitize(video.series):
|
||||||
sanitize(name) for name in [video.series] + video.alternative_series):
|
|
||||||
matches.add('series')
|
matches.add('series')
|
||||||
# season
|
# season
|
||||||
if video.season and self.season == video.season:
|
if video.season and self.season == video.season:
|
||||||
|
@ -58,7 +54,7 @@ class Addic7edSubtitle(Subtitle):
|
||||||
# episode
|
# episode
|
||||||
if video.episode and self.episode == video.episode:
|
if video.episode and self.episode == video.episode:
|
||||||
matches.add('episode')
|
matches.add('episode')
|
||||||
# title of the episode
|
# title
|
||||||
if video.title and sanitize(self.title) == sanitize(video.title):
|
if video.title and sanitize(self.title) == sanitize(video.title):
|
||||||
matches.add('title')
|
matches.add('title')
|
||||||
# year
|
# year
|
||||||
|
@ -90,23 +86,21 @@ class Addic7edProvider(Provider):
|
||||||
]}
|
]}
|
||||||
video_types = (Episode,)
|
video_types = (Episode,)
|
||||||
server_url = 'http://www.addic7ed.com/'
|
server_url = 'http://www.addic7ed.com/'
|
||||||
subtitle_class = Addic7edSubtitle
|
|
||||||
|
|
||||||
def __init__(self, username=None, password=None):
|
def __init__(self, username=None, password=None):
|
||||||
if any((username, password)) and not all((username, password)):
|
if username is not None and password is None or username is None and password is not None:
|
||||||
raise ConfigurationError('Username and password must be specified')
|
raise ConfigurationError('Username and password must be specified')
|
||||||
|
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.logged_in = False
|
self.logged_in = False
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
|
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
|
||||||
|
|
||||||
# login
|
# login
|
||||||
if self.username and self.password:
|
if self.username is not None and self.password is not None:
|
||||||
logger.info('Logging in')
|
logger.info('Logging in')
|
||||||
data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
|
data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
|
||||||
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)
|
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)
|
||||||
|
@ -140,16 +134,7 @@ class Addic7edProvider(Provider):
|
||||||
logger.info('Getting show ids')
|
logger.info('Getting show ids')
|
||||||
r = self.session.get(self.server_url + 'shows.php', timeout=10)
|
r = self.session.get(self.server_url + 'shows.php', timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
||||||
# LXML parser seems to fail when parsing Addic7ed.com HTML markup.
|
|
||||||
# Last known version to work properly is 3.6.4 (next version, 3.7.0, fails)
|
|
||||||
# Assuming the site's markup is bad, and stripping it down to only contain what's needed.
|
|
||||||
show_cells = re.findall(show_cells_re, r.content)
|
|
||||||
if show_cells:
|
|
||||||
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
|
|
||||||
else:
|
|
||||||
# If RegEx fails, fall back to original r.content and use 'html.parser'
|
|
||||||
soup = ParserBeautifulSoup(r.content, ['html.parser'])
|
|
||||||
|
|
||||||
# populate the show ids
|
# populate the show ids
|
||||||
show_ids = {}
|
show_ids = {}
|
||||||
|
@ -181,6 +166,8 @@ class Addic7edProvider(Provider):
|
||||||
logger.info('Searching show ids with %r', params)
|
logger.info('Searching show ids with %r', params)
|
||||||
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
|
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
if r.status_code == 304:
|
||||||
|
raise TooManyRequests()
|
||||||
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
||||||
|
|
||||||
# get the suggestion
|
# get the suggestion
|
||||||
|
@ -231,23 +218,24 @@ class Addic7edProvider(Provider):
|
||||||
|
|
||||||
# search as last resort
|
# search as last resort
|
||||||
if not show_id:
|
if not show_id:
|
||||||
logger.warning('Series %s not found in show ids', series)
|
logger.warning('Series not found in show ids')
|
||||||
show_id = self._search_show_id(series)
|
show_id = self._search_show_id(series)
|
||||||
|
|
||||||
return show_id
|
return show_id
|
||||||
|
|
||||||
def query(self, show_id, series, season, year=None, country=None):
|
def query(self, series, season, year=None, country=None):
|
||||||
|
# get the show id
|
||||||
|
show_id = self.get_show_id(series, year, country)
|
||||||
|
if show_id is None:
|
||||||
|
logger.error('No show id found for %r (%r)', series, {'year': year, 'country': country})
|
||||||
|
return []
|
||||||
|
|
||||||
# get the page of the season of the show
|
# get the page of the season of the show
|
||||||
logger.info('Getting the page of show id %d, season %d', show_id, season)
|
logger.info('Getting the page of show id %d, season %d', show_id, season)
|
||||||
r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
|
r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
if r.status_code == 304:
|
||||||
if not r.content:
|
raise TooManyRequests()
|
||||||
# Provider returns a status of 304 Not Modified with an empty content
|
|
||||||
# raise_for_status won't raise exception for that status code
|
|
||||||
logger.debug('No data returned from provider')
|
|
||||||
return []
|
|
||||||
|
|
||||||
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
||||||
|
|
||||||
# loop over subtitle rows
|
# loop over subtitle rows
|
||||||
|
@ -274,32 +262,16 @@ class Addic7edProvider(Provider):
|
||||||
version = cells[4].text
|
version = cells[4].text
|
||||||
download_link = cells[9].a['href'][1:]
|
download_link = cells[9].a['href'][1:]
|
||||||
|
|
||||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year,
|
subtitle = Addic7edSubtitle(language, hearing_impaired, page_link, series, season, episode, title, year,
|
||||||
version, download_link)
|
version, download_link)
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
||||||
def list_subtitles(self, video, languages):
|
def list_subtitles(self, video, languages):
|
||||||
# lookup show_id
|
return [s for s in self.query(video.series, video.season, video.year)
|
||||||
titles = [video.series] + video.alternative_series
|
if s.language in languages and s.episode == video.episode]
|
||||||
show_id = None
|
|
||||||
for title in titles:
|
|
||||||
show_id = self.get_show_id(title, video.year)
|
|
||||||
if show_id is not None:
|
|
||||||
break
|
|
||||||
|
|
||||||
# query for subtitles with the show_id
|
|
||||||
if show_id is not None:
|
|
||||||
subtitles = [s for s in self.query(show_id, title, video.season, video.year)
|
|
||||||
if s.language in languages and s.episode == video.episode]
|
|
||||||
if subtitles:
|
|
||||||
return subtitles
|
|
||||||
else:
|
|
||||||
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
def download_subtitle(self, subtitle):
|
def download_subtitle(self, subtitle):
|
||||||
# download the subtitle
|
# download the subtitle
|
||||||
|
@ -308,12 +280,6 @@ class Addic7edProvider(Provider):
|
||||||
timeout=10)
|
timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
||||||
if not r.content:
|
|
||||||
# Provider returns a status of 304 Not Modified with an empty content
|
|
||||||
# raise_for_status won't raise exception for that status code
|
|
||||||
logger.debug('Unable to download subtitle. No data returned from provider')
|
|
||||||
return
|
|
||||||
|
|
||||||
# detect download limit exceeded
|
# detect download limit exceeded
|
||||||
if r.headers['Content-Type'] == 'text/html':
|
if r.headers['Content-Type'] == 'text/html':
|
||||||
raise DownloadLimitExceeded
|
raise DownloadLimitExceeded
|
||||||
|
|
|
@ -18,7 +18,7 @@ from zipfile import ZipFile, is_zipfile
|
||||||
from . import ParserBeautifulSoup, Provider
|
from . import ParserBeautifulSoup, Provider
|
||||||
from .. import __short_version__
|
from .. import __short_version__
|
||||||
from ..cache import SHOW_EXPIRATION_TIME, region
|
from ..cache import SHOW_EXPIRATION_TIME, region
|
||||||
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable
|
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
|
||||||
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
|
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
|
||||||
from ..video import Episode, Movie
|
from ..video import Episode, Movie
|
||||||
|
|
||||||
|
@ -44,11 +44,8 @@ rating_re = re.compile(r'nota (?P<rating>\d+)')
|
||||||
#: Timestamp parsing regex
|
#: Timestamp parsing regex
|
||||||
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
|
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
|
||||||
|
|
||||||
#: Title with year/country regex
|
|
||||||
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
|
|
||||||
|
|
||||||
#: Cache key for releases
|
#: Cache key for releases
|
||||||
releases_key = __name__ + ':releases|{archive_id}|{archive_name}'
|
releases_key = __name__ + ':releases|{archive_id}'
|
||||||
|
|
||||||
|
|
||||||
class LegendasTVArchive(object):
|
class LegendasTVArchive(object):
|
||||||
|
@ -63,8 +60,8 @@ class LegendasTVArchive(object):
|
||||||
:param int rating: rating (0-10).
|
:param int rating: rating (0-10).
|
||||||
:param timestamp: timestamp.
|
:param timestamp: timestamp.
|
||||||
:type timestamp: datetime.datetime
|
:type timestamp: datetime.datetime
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
|
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
|
||||||
#: Identifier
|
#: Identifier
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@ -99,11 +96,10 @@ class LegendasTVArchive(object):
|
||||||
|
|
||||||
class LegendasTVSubtitle(Subtitle):
|
class LegendasTVSubtitle(Subtitle):
|
||||||
"""LegendasTV Subtitle."""
|
"""LegendasTV Subtitle."""
|
||||||
|
|
||||||
provider_name = 'legendastv'
|
provider_name = 'legendastv'
|
||||||
|
|
||||||
def __init__(self, language, type, title, year, imdb_id, season, archive, name):
|
def __init__(self, language, type, title, year, imdb_id, season, archive, name):
|
||||||
super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link)
|
super(LegendasTVSubtitle, self).__init__(language, archive.link)
|
||||||
self.type = type
|
self.type = type
|
||||||
self.title = title
|
self.title = title
|
||||||
self.year = year
|
self.year = year
|
||||||
|
@ -122,12 +118,11 @@ class LegendasTVSubtitle(Subtitle):
|
||||||
# episode
|
# episode
|
||||||
if isinstance(video, Episode) and self.type == 'episode':
|
if isinstance(video, Episode) and self.type == 'episode':
|
||||||
# series
|
# series
|
||||||
if video.series and (sanitize(self.title) in (
|
if video.series and sanitize(self.title) == sanitize(video.series):
|
||||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
|
||||||
matches.add('series')
|
matches.add('series')
|
||||||
|
|
||||||
# year
|
# year (year is based on season air date hence the adjustment)
|
||||||
if video.original_series and self.year is None or video.year and video.year == self.year:
|
if video.original_series and self.year is None or video.year and video.year == self.year - self.season + 1:
|
||||||
matches.add('year')
|
matches.add('year')
|
||||||
|
|
||||||
# imdb_id
|
# imdb_id
|
||||||
|
@ -137,8 +132,7 @@ class LegendasTVSubtitle(Subtitle):
|
||||||
# movie
|
# movie
|
||||||
elif isinstance(video, Movie) and self.type == 'movie':
|
elif isinstance(video, Movie) and self.type == 'movie':
|
||||||
# title
|
# title
|
||||||
if video.title and (sanitize(self.title) in (
|
if video.title and sanitize(self.title) == sanitize(video.title):
|
||||||
sanitize(name) for name in [video.title] + video.alternative_titles)):
|
|
||||||
matches.add('title')
|
matches.add('title')
|
||||||
|
|
||||||
# year
|
# year
|
||||||
|
@ -149,6 +143,9 @@ class LegendasTVSubtitle(Subtitle):
|
||||||
if video.imdb_id and self.imdb_id == video.imdb_id:
|
if video.imdb_id and self.imdb_id == video.imdb_id:
|
||||||
matches.add('imdb_id')
|
matches.add('imdb_id')
|
||||||
|
|
||||||
|
# archive name
|
||||||
|
matches |= guess_matches(video, guessit(self.archive.name, {'type': self.type}))
|
||||||
|
|
||||||
# name
|
# name
|
||||||
matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
|
matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
|
||||||
|
|
||||||
|
@ -160,38 +157,29 @@ class LegendasTVProvider(Provider):
|
||||||
|
|
||||||
:param str username: username.
|
:param str username: username.
|
||||||
:param str password: password.
|
:param str password: password.
|
||||||
"""
|
|
||||||
|
|
||||||
|
"""
|
||||||
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
|
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
|
||||||
server_url = 'http://legendas.tv/'
|
server_url = 'http://legendas.tv/'
|
||||||
subtitle_class = LegendasTVSubtitle
|
|
||||||
|
|
||||||
def __init__(self, username=None, password=None):
|
def __init__(self, username=None, password=None):
|
||||||
|
if username and not password or not username and password:
|
||||||
# Provider needs UNRAR installed. If not available raise ConfigurationError
|
|
||||||
try:
|
|
||||||
rarfile.custom_check(rarfile.UNRAR_TOOL)
|
|
||||||
except rarfile.RarExecError:
|
|
||||||
raise ConfigurationError('UNRAR tool not available')
|
|
||||||
|
|
||||||
if any((username, password)) and not all((username, password)):
|
|
||||||
raise ConfigurationError('Username and password must be specified')
|
raise ConfigurationError('Username and password must be specified')
|
||||||
|
|
||||||
self.username = username
|
self.username = username
|
||||||
self.password = password
|
self.password = password
|
||||||
self.logged_in = False
|
self.logged_in = False
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
|
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
|
||||||
|
|
||||||
# login
|
# login
|
||||||
if self.username and self.password:
|
if self.username is not None and self.password is not None:
|
||||||
logger.info('Logging in')
|
logger.info('Logging in')
|
||||||
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
|
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
|
||||||
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
|
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
|
||||||
raise_for_status(r)
|
r.raise_for_status()
|
||||||
|
|
||||||
soup = ParserBeautifulSoup(r.content, ['html.parser'])
|
soup = ParserBeautifulSoup(r.content, ['html.parser'])
|
||||||
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
|
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
|
||||||
|
@ -205,174 +193,94 @@ class LegendasTVProvider(Provider):
|
||||||
if self.logged_in:
|
if self.logged_in:
|
||||||
logger.info('Logging out')
|
logger.info('Logging out')
|
||||||
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
|
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
|
||||||
raise_for_status(r)
|
r.raise_for_status()
|
||||||
logger.debug('Logged out')
|
logger.debug('Logged out')
|
||||||
self.logged_in = False
|
self.logged_in = False
|
||||||
|
|
||||||
self.session.close()
|
self.session.close()
|
||||||
|
|
||||||
@staticmethod
|
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
|
||||||
def is_valid_title(title, title_id, sanitized_title, season, year):
|
def search_titles(self, title):
|
||||||
"""Check if is a valid title."""
|
|
||||||
sanitized_result = sanitize(title['title'])
|
|
||||||
if sanitized_result != sanitized_title:
|
|
||||||
logger.debug("Mismatched title, discarding title %d (%s)",
|
|
||||||
title_id, sanitized_result)
|
|
||||||
return
|
|
||||||
|
|
||||||
# episode type
|
|
||||||
if season:
|
|
||||||
# discard mismatches on type
|
|
||||||
if title['type'] != 'episode':
|
|
||||||
logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
|
|
||||||
return
|
|
||||||
|
|
||||||
# discard mismatches on season
|
|
||||||
if 'season' not in title or title['season'] != season:
|
|
||||||
logger.debug('Mismatched season %s, discarding title %d (%s)',
|
|
||||||
title.get('season'), title_id, sanitized_result)
|
|
||||||
return
|
|
||||||
# movie type
|
|
||||||
else:
|
|
||||||
# discard mismatches on type
|
|
||||||
if title['type'] != 'movie':
|
|
||||||
logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
|
|
||||||
return
|
|
||||||
|
|
||||||
# discard mismatches on year
|
|
||||||
if year is not None and 'year' in title and title['year'] != year:
|
|
||||||
logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
|
|
||||||
return
|
|
||||||
return True
|
|
||||||
|
|
||||||
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
|
|
||||||
def search_titles(self, title, season, title_year):
|
|
||||||
"""Search for titles matching the `title`.
|
"""Search for titles matching the `title`.
|
||||||
|
|
||||||
For episodes, each season has it own title
|
|
||||||
:param str title: the title to search for.
|
:param str title: the title to search for.
|
||||||
:param int season: season of the title
|
|
||||||
:param int title_year: year of the title
|
|
||||||
:return: found titles.
|
:return: found titles.
|
||||||
:rtype: dict
|
:rtype: dict
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
# make the query
|
||||||
|
logger.info('Searching title %r', title)
|
||||||
|
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(title), timeout=10)
|
||||||
|
r.raise_for_status()
|
||||||
|
results = json.loads(r.text)
|
||||||
|
|
||||||
|
# loop over results
|
||||||
titles = {}
|
titles = {}
|
||||||
sanitized_titles = [sanitize(title)]
|
for result in results:
|
||||||
ignore_characters = {'\'', '.'}
|
source = result['_source']
|
||||||
if any(c in title for c in ignore_characters):
|
|
||||||
sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
|
|
||||||
|
|
||||||
for sanitized_title in sanitized_titles:
|
# extract id
|
||||||
# make the query
|
title_id = int(source['id_filme'])
|
||||||
if season:
|
|
||||||
logger.info('Searching episode title %r for season %r', sanitized_title, season)
|
|
||||||
else:
|
|
||||||
logger.info('Searching movie title %r', sanitized_title)
|
|
||||||
|
|
||||||
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
|
# extract type and title
|
||||||
raise_for_status(r)
|
title = {'type': type_map[source['tipo']], 'title': source['dsc_nome']}
|
||||||
results = json.loads(r.text)
|
|
||||||
|
|
||||||
# loop over results
|
# extract year
|
||||||
for result in results:
|
if source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
|
||||||
source = result['_source']
|
title['year'] = int(source['dsc_data_lancamento'])
|
||||||
|
|
||||||
# extract id
|
# extract imdb_id
|
||||||
title_id = int(source['id_filme'])
|
if source['id_imdb'] != '0':
|
||||||
|
if not source['id_imdb'].startswith('tt'):
|
||||||
|
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
|
||||||
|
else:
|
||||||
|
title['imdb_id'] = source['id_imdb']
|
||||||
|
|
||||||
# extract type
|
# extract season
|
||||||
title = {'type': type_map[source['tipo']]}
|
if title['type'] == 'episode':
|
||||||
|
if source['temporada'] and source['temporada'].isdigit():
|
||||||
# extract title, year and country
|
title['season'] = int(source['temporada'])
|
||||||
name, year, country = title_re.match(source['dsc_nome']).groups()
|
else:
|
||||||
title['title'] = name
|
match = season_re.search(source['dsc_nome_br'])
|
||||||
|
if match:
|
||||||
# extract imdb_id
|
title['season'] = int(match.group('season'))
|
||||||
if source['id_imdb'] != '0':
|
|
||||||
if not source['id_imdb'].startswith('tt'):
|
|
||||||
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
|
|
||||||
else:
|
else:
|
||||||
title['imdb_id'] = source['id_imdb']
|
logger.warning('No season detected for title %d', title_id)
|
||||||
|
|
||||||
# extract season
|
# add title
|
||||||
if title['type'] == 'episode':
|
titles[title_id] = title
|
||||||
if source['temporada'] and source['temporada'].isdigit():
|
|
||||||
title['season'] = int(source['temporada'])
|
|
||||||
else:
|
|
||||||
match = season_re.search(source['dsc_nome_br'])
|
|
||||||
if match:
|
|
||||||
title['season'] = int(match.group('season'))
|
|
||||||
else:
|
|
||||||
logger.debug('No season detected for title %d (%s)', title_id, name)
|
|
||||||
|
|
||||||
# extract year
|
logger.debug('Found %d titles', len(titles))
|
||||||
if year:
|
|
||||||
title['year'] = int(year)
|
|
||||||
elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
|
|
||||||
# year is based on season air date hence the adjustment
|
|
||||||
title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1
|
|
||||||
|
|
||||||
# add title only if is valid
|
|
||||||
# Check against title without ignored chars
|
|
||||||
if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
|
|
||||||
titles[title_id] = title
|
|
||||||
|
|
||||||
logger.debug('Found %d titles', len(titles))
|
|
||||||
|
|
||||||
return titles
|
return titles
|
||||||
|
|
||||||
@region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
|
@region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
|
||||||
def get_archives(self, title_id, language_code, title_type, season, episode):
|
def get_archives(self, title_id, language_code):
|
||||||
"""Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`.
|
"""Get the archive list from a given `title_id` and `language_code`.
|
||||||
|
|
||||||
:param int title_id: title id.
|
:param int title_id: title id.
|
||||||
:param int language_code: language code.
|
:param int language_code: language code.
|
||||||
:param str title_type: episode or movie
|
|
||||||
:param int season: season
|
|
||||||
:param int episode: episode
|
|
||||||
:return: the archives.
|
:return: the archives.
|
||||||
:rtype: list of :class:`LegendasTVArchive`
|
:rtype: list of :class:`LegendasTVArchive`
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
logger.info('Getting archives for title %d and language %d', title_id, language_code)
|
||||||
archives = []
|
archives = []
|
||||||
page = 0
|
page = 1
|
||||||
while True:
|
while True:
|
||||||
# get the archive page
|
# get the archive page
|
||||||
url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
|
url = self.server_url + 'util/carrega_legendas_busca_filme/{title}/{language}/-/{page}'.format(
|
||||||
language=language_code, page=page, title=title_id)
|
title=title_id, language=language_code, page=page)
|
||||||
r = self.session.get(url)
|
r = self.session.get(url)
|
||||||
raise_for_status(r)
|
r.raise_for_status()
|
||||||
|
|
||||||
# parse the results
|
# parse the results
|
||||||
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
|
||||||
for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
|
for archive_soup in soup.select('div.list_element > article > div'):
|
||||||
# create archive
|
# create archive
|
||||||
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
|
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2], archive_soup.a.text,
|
||||||
archive_soup.a.text,
|
'pack' in archive_soup['class'], 'destaque' in archive_soup['class'],
|
||||||
'pack' in archive_soup.parent['class'],
|
|
||||||
'destaque' in archive_soup.parent['class'],
|
|
||||||
self.server_url + archive_soup.a['href'][1:])
|
self.server_url + archive_soup.a['href'][1:])
|
||||||
# clean name of path separators and pack flags
|
|
||||||
clean_name = archive.name.replace('/', '-')
|
|
||||||
if archive.pack and clean_name.startswith('(p)'):
|
|
||||||
clean_name = clean_name[3:]
|
|
||||||
|
|
||||||
# guess from name
|
|
||||||
guess = guessit(clean_name, {'type': title_type})
|
|
||||||
|
|
||||||
# episode
|
|
||||||
if season and episode:
|
|
||||||
# discard mismatches on episode in non-pack archives
|
|
||||||
|
|
||||||
# Guessit may return int for single episode or list for multi-episode
|
|
||||||
# Check if archive name has multiple episodes releases on it
|
|
||||||
if not archive.pack and 'episode' in guess:
|
|
||||||
wanted_episode = set(episode) if isinstance(episode, list) else {episode}
|
|
||||||
archive_episode = guess['episode'] if isinstance(guess['episode'], list) else {guess['episode']}
|
|
||||||
|
|
||||||
if not wanted_episode.intersection(archive_episode):
|
|
||||||
logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# extract text containing downloads, rating and timestamp
|
# extract text containing downloads, rating and timestamp
|
||||||
data_text = archive_soup.find('p', class_='data').text
|
data_text = archive_soup.find('p', class_='data').text
|
||||||
|
@ -392,8 +300,6 @@ class LegendasTVProvider(Provider):
|
||||||
raise ProviderError('Archive timestamp is in the future')
|
raise ProviderError('Archive timestamp is in the future')
|
||||||
|
|
||||||
# add archive
|
# add archive
|
||||||
logger.info('Found archive for title %d and language %d at page %s: %s',
|
|
||||||
title_id, language_code, page, archive)
|
|
||||||
archives.append(archive)
|
archives.append(archive)
|
||||||
|
|
||||||
# stop on last page
|
# stop on last page
|
||||||
|
@ -416,7 +322,7 @@ class LegendasTVProvider(Provider):
|
||||||
"""
|
"""
|
||||||
logger.info('Downloading archive %s', archive.id)
|
logger.info('Downloading archive %s', archive.id)
|
||||||
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
|
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
|
||||||
raise_for_status(r)
|
r.raise_for_status()
|
||||||
|
|
||||||
# open the archive
|
# open the archive
|
||||||
archive_stream = io.BytesIO(r.content)
|
archive_stream = io.BytesIO(r.content)
|
||||||
|
@ -431,26 +337,60 @@ class LegendasTVProvider(Provider):
|
||||||
|
|
||||||
def query(self, language, title, season=None, episode=None, year=None):
|
def query(self, language, title, season=None, episode=None, year=None):
|
||||||
# search for titles
|
# search for titles
|
||||||
titles = self.search_titles(title, season, year)
|
titles = self.search_titles(sanitize(title))
|
||||||
|
|
||||||
|
# search for titles with the quote or dot character
|
||||||
|
ignore_characters = {'\'', '.'}
|
||||||
|
if any(c in title for c in ignore_characters):
|
||||||
|
titles.update(self.search_titles(sanitize(title, ignore_characters=ignore_characters)))
|
||||||
|
|
||||||
subtitles = []
|
subtitles = []
|
||||||
# iterate over titles
|
# iterate over titles
|
||||||
for title_id, t in titles.items():
|
for title_id, t in titles.items():
|
||||||
|
# discard mismatches on title
|
||||||
|
if sanitize(t['title']) != sanitize(title):
|
||||||
|
continue
|
||||||
|
|
||||||
logger.info('Getting archives for title %d and language %d', title_id, language.legendastv)
|
# episode
|
||||||
archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode)
|
if season and episode:
|
||||||
if not archives:
|
# discard mismatches on type
|
||||||
logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
|
if t['type'] != 'episode':
|
||||||
|
continue
|
||||||
|
|
||||||
|
# discard mismatches on season
|
||||||
|
if 'season' not in t or t['season'] != season:
|
||||||
|
continue
|
||||||
|
# movie
|
||||||
|
else:
|
||||||
|
# discard mismatches on type
|
||||||
|
if t['type'] != 'movie':
|
||||||
|
continue
|
||||||
|
|
||||||
|
# discard mismatches on year
|
||||||
|
if year is not None and 'year' in t and t['year'] != year:
|
||||||
|
continue
|
||||||
|
|
||||||
# iterate over title's archives
|
# iterate over title's archives
|
||||||
for a in archives:
|
for a in self.get_archives(title_id, language.legendastv):
|
||||||
|
# clean name of path separators and pack flags
|
||||||
|
clean_name = a.name.replace('/', '-')
|
||||||
|
if a.pack and clean_name.startswith('(p)'):
|
||||||
|
clean_name = clean_name[3:]
|
||||||
|
|
||||||
|
# guess from name
|
||||||
|
guess = guessit(clean_name, {'type': t['type']})
|
||||||
|
|
||||||
|
# episode
|
||||||
|
if season and episode:
|
||||||
|
# discard mismatches on episode in non-pack archives
|
||||||
|
if not a.pack and 'episode' in guess and guess['episode'] != episode:
|
||||||
|
continue
|
||||||
|
|
||||||
# compute an expiration time based on the archive timestamp
|
# compute an expiration time based on the archive timestamp
|
||||||
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
|
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
|
||||||
|
|
||||||
# attempt to get the releases from the cache
|
# attempt to get the releases from the cache
|
||||||
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
|
releases = region.get(releases_key.format(archive_id=a.id), expiration_time=expiration_time)
|
||||||
releases = region.get(cache_key, expiration_time=expiration_time)
|
|
||||||
|
|
||||||
# the releases are not in cache or cache is expired
|
# the releases are not in cache or cache is expired
|
||||||
if releases == NO_VALUE:
|
if releases == NO_VALUE:
|
||||||
|
@ -477,12 +417,12 @@ class LegendasTVProvider(Provider):
|
||||||
releases.append(name)
|
releases.append(name)
|
||||||
|
|
||||||
# cache the releases
|
# cache the releases
|
||||||
region.set(cache_key, releases)
|
region.set(releases_key.format(archive_id=a.id), releases)
|
||||||
|
|
||||||
# iterate over releases
|
# iterate over releases
|
||||||
for r in releases:
|
for r in releases:
|
||||||
subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
|
subtitle = LegendasTVSubtitle(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
|
||||||
t.get('season'), a, r)
|
t.get('season'), a, r)
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
|
||||||
|
@ -491,19 +431,13 @@ class LegendasTVProvider(Provider):
|
||||||
def list_subtitles(self, video, languages):
|
def list_subtitles(self, video, languages):
|
||||||
season = episode = None
|
season = episode = None
|
||||||
if isinstance(video, Episode):
|
if isinstance(video, Episode):
|
||||||
titles = [video.series] + video.alternative_series
|
title = video.series
|
||||||
season = video.season
|
season = video.season
|
||||||
episode = video.episode
|
episode = video.episode
|
||||||
else:
|
else:
|
||||||
titles = [video.title] + video.alternative_titles
|
title = video.title
|
||||||
|
|
||||||
for title in titles:
|
return [s for l in languages for s in self.query(l, title, season=season, episode=episode, year=video.year)]
|
||||||
subtitles = [s for l in languages for s in
|
|
||||||
self.query(l, title, season=season, episode=episode, year=video.year)]
|
|
||||||
if subtitles:
|
|
||||||
return subtitles
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
def download_subtitle(self, subtitle):
|
def download_subtitle(self, subtitle):
|
||||||
# download archive in case we previously hit the releases cache and didn't download it
|
# download archive in case we previously hit the releases cache and didn't download it
|
||||||
|
@ -512,11 +446,3 @@ class LegendasTVProvider(Provider):
|
||||||
|
|
||||||
# extract subtitle's content
|
# extract subtitle's content
|
||||||
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
|
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
|
||||||
|
|
||||||
|
|
||||||
def raise_for_status(r):
|
|
||||||
# When site is under maintaince and http status code 200.
|
|
||||||
if 'Em breve estaremos de volta' in r.text:
|
|
||||||
raise ServiceUnavailable
|
|
||||||
else:
|
|
||||||
r.raise_for_status()
|
|
||||||
|
|
|
@ -42,7 +42,6 @@ class NapiProjektSubtitle(Subtitle):
|
||||||
def __init__(self, language, hash):
|
def __init__(self, language, hash):
|
||||||
super(NapiProjektSubtitle, self).__init__(language)
|
super(NapiProjektSubtitle, self).__init__(language)
|
||||||
self.hash = hash
|
self.hash = hash
|
||||||
self.content = None
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def id(self):
|
def id(self):
|
||||||
|
@ -63,10 +62,6 @@ class NapiProjektProvider(Provider):
|
||||||
languages = {Language.fromalpha2(l) for l in ['pl']}
|
languages = {Language.fromalpha2(l) for l in ['pl']}
|
||||||
required_hash = 'napiprojekt'
|
required_hash = 'napiprojekt'
|
||||||
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
|
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
|
||||||
subtitle_class = NapiProjektSubtitle
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
|
@ -86,16 +81,16 @@ class NapiProjektProvider(Provider):
|
||||||
'f': hash,
|
'f': hash,
|
||||||
't': get_subhash(hash)}
|
't': get_subhash(hash)}
|
||||||
logger.info('Searching subtitle %r', params)
|
logger.info('Searching subtitle %r', params)
|
||||||
r = self.session.get(self.server_url, params=params, timeout=10)
|
response = self.session.get(self.server_url, params=params, timeout=10)
|
||||||
r.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
||||||
# handle subtitles not found and errors
|
# handle subtitles not found and errors
|
||||||
if r.content[:4] == b'NPc0':
|
if response.content[:4] == b'NPc0':
|
||||||
logger.debug('No subtitles found')
|
logger.debug('No subtitles found')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
subtitle = self.subtitle_class(language, hash)
|
subtitle = NapiProjektSubtitle(language, hash)
|
||||||
subtitle.content = r.content
|
subtitle.content = response.content
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
|
|
||||||
return subtitle
|
return subtitle
|
||||||
|
|
|
@ -11,8 +11,7 @@ from six.moves.xmlrpc_client import ServerProxy
|
||||||
|
|
||||||
from . import Provider, TimeoutSafeTransport
|
from . import Provider, TimeoutSafeTransport
|
||||||
from .. import __short_version__
|
from .. import __short_version__
|
||||||
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError,
|
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError
|
||||||
ServiceUnavailable)
|
|
||||||
from ..subtitle import Subtitle, fix_line_ending, guess_matches
|
from ..subtitle import Subtitle, fix_line_ending, guess_matches
|
||||||
from ..utils import sanitize
|
from ..utils import sanitize
|
||||||
from ..video import Episode, Movie
|
from ..video import Episode, Movie
|
||||||
|
@ -27,8 +26,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
||||||
|
|
||||||
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
|
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
|
||||||
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
|
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
|
||||||
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired,
|
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired, page_link, encoding)
|
||||||
page_link=page_link, encoding=encoding)
|
|
||||||
self.subtitle_id = subtitle_id
|
self.subtitle_id = subtitle_id
|
||||||
self.matched_by = matched_by
|
self.matched_by = matched_by
|
||||||
self.movie_kind = movie_kind
|
self.movie_kind = movie_kind
|
||||||
|
@ -60,8 +58,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
||||||
if isinstance(video, Episode) and self.movie_kind == 'episode':
|
if isinstance(video, Episode) and self.movie_kind == 'episode':
|
||||||
# tag match, assume series, year, season and episode matches
|
# tag match, assume series, year, season and episode matches
|
||||||
if self.matched_by == 'tag':
|
if self.matched_by == 'tag':
|
||||||
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
|
matches |= {'series', 'year', 'season', 'episode'}
|
||||||
matches |= {'series', 'year', 'season', 'episode'}
|
|
||||||
# series
|
# series
|
||||||
if video.series and sanitize(self.series_name) == sanitize(video.series):
|
if video.series and sanitize(self.series_name) == sanitize(video.series):
|
||||||
matches.add('series')
|
matches.add('series')
|
||||||
|
@ -90,8 +87,7 @@ class OpenSubtitlesSubtitle(Subtitle):
|
||||||
elif isinstance(video, Movie) and self.movie_kind == 'movie':
|
elif isinstance(video, Movie) and self.movie_kind == 'movie':
|
||||||
# tag match, assume title and year matches
|
# tag match, assume title and year matches
|
||||||
if self.matched_by == 'tag':
|
if self.matched_by == 'tag':
|
||||||
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
|
matches |= {'title', 'year'}
|
||||||
matches |= {'title', 'year'}
|
|
||||||
# title
|
# title
|
||||||
if video.title and sanitize(self.movie_name) == sanitize(video.title):
|
if video.title and sanitize(self.movie_name) == sanitize(video.title):
|
||||||
matches.add('title')
|
matches.add('title')
|
||||||
|
@ -126,11 +122,10 @@ class OpenSubtitlesProvider(Provider):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
|
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
|
||||||
subtitle_class = OpenSubtitlesSubtitle
|
|
||||||
|
|
||||||
def __init__(self, username=None, password=None):
|
def __init__(self, username=None, password=None):
|
||||||
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
|
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
|
||||||
if any((username, password)) and not all((username, password)):
|
if username and not password or not username and password:
|
||||||
raise ConfigurationError('Username and password must be specified')
|
raise ConfigurationError('Username and password must be specified')
|
||||||
# None values not allowed for logging in, so replace it by ''
|
# None values not allowed for logging in, so replace it by ''
|
||||||
self.username = username or ''
|
self.username = username or ''
|
||||||
|
@ -161,10 +156,7 @@ class OpenSubtitlesProvider(Provider):
|
||||||
if hash and size:
|
if hash and size:
|
||||||
criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
|
criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
|
||||||
if imdb_id:
|
if imdb_id:
|
||||||
if season and episode:
|
criteria.append({'imdbid': imdb_id[2:]})
|
||||||
criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
|
|
||||||
else:
|
|
||||||
criteria.append({'imdbid': imdb_id[2:]})
|
|
||||||
if tag:
|
if tag:
|
||||||
criteria.append({'tag': tag})
|
criteria.append({'tag': tag})
|
||||||
if query and season and episode:
|
if query and season and episode:
|
||||||
|
@ -207,9 +199,9 @@ class OpenSubtitlesProvider(Provider):
|
||||||
filename = subtitle_item['SubFileName']
|
filename = subtitle_item['SubFileName']
|
||||||
encoding = subtitle_item.get('SubEncoding') or None
|
encoding = subtitle_item.get('SubEncoding') or None
|
||||||
|
|
||||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
|
subtitle = OpenSubtitlesSubtitle(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
|
||||||
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
|
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
|
||||||
series_season, series_episode, filename, encoding)
|
series_season, series_episode, filename, encoding)
|
||||||
logger.debug('Found subtitle %r by %s', subtitle, matched_by)
|
logger.debug('Found subtitle %r by %s', subtitle, matched_by)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
|
||||||
|
@ -268,6 +260,11 @@ class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ServiceUnavailable(OpenSubtitlesError):
|
||||||
|
"""Exception raised when status is '503 Service Unavailable'."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def checked(response):
|
def checked(response):
|
||||||
"""Check a response status before returning it.
|
"""Check a response status before returning it.
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ class PodnapisiSubtitle(Subtitle):
|
||||||
|
|
||||||
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
|
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
|
||||||
year=None):
|
year=None):
|
||||||
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
|
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired, page_link)
|
||||||
self.pid = pid
|
self.pid = pid
|
||||||
self.releases = releases
|
self.releases = releases
|
||||||
self.title = title
|
self.title = title
|
||||||
|
@ -49,8 +49,7 @@ class PodnapisiSubtitle(Subtitle):
|
||||||
# episode
|
# episode
|
||||||
if isinstance(video, Episode):
|
if isinstance(video, Episode):
|
||||||
# series
|
# series
|
||||||
if video.series and (sanitize(self.title) in (
|
if video.series and sanitize(self.title) == sanitize(video.series):
|
||||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
|
||||||
matches.add('series')
|
matches.add('series')
|
||||||
# year
|
# year
|
||||||
if video.original_series and self.year is None or video.year and video.year == self.year:
|
if video.original_series and self.year is None or video.year and video.year == self.year:
|
||||||
|
@ -67,8 +66,7 @@ class PodnapisiSubtitle(Subtitle):
|
||||||
# movie
|
# movie
|
||||||
elif isinstance(video, Movie):
|
elif isinstance(video, Movie):
|
||||||
# title
|
# title
|
||||||
if video.title and (sanitize(self.title) in (
|
if video.title and sanitize(self.title) == sanitize(video.title):
|
||||||
sanitize(name) for name in [video.title] + video.alternative_titles)):
|
|
||||||
matches.add('title')
|
matches.add('title')
|
||||||
# year
|
# year
|
||||||
if video.year and self.year == video.year:
|
if video.year and self.year == video.year:
|
||||||
|
@ -84,11 +82,7 @@ class PodnapisiProvider(Provider):
|
||||||
"""Podnapisi Provider."""
|
"""Podnapisi Provider."""
|
||||||
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
|
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
|
||||||
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
|
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
|
||||||
server_url = 'https://www.podnapisi.net/subtitles/'
|
server_url = 'http://podnapisi.net/subtitles/'
|
||||||
subtitle_class = PodnapisiSubtitle
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
|
@ -114,9 +108,7 @@ class PodnapisiProvider(Provider):
|
||||||
pids = set()
|
pids = set()
|
||||||
while True:
|
while True:
|
||||||
# query the server
|
# query the server
|
||||||
r = self.session.get(self.server_url + 'search/old', params=params, timeout=10)
|
xml = etree.fromstring(self.session.get(self.server_url + 'search/old', params=params, timeout=10).content)
|
||||||
r.raise_for_status()
|
|
||||||
xml = etree.fromstring(r.content)
|
|
||||||
|
|
||||||
# exit if no results
|
# exit if no results
|
||||||
if not int(xml.find('pagination/results').text):
|
if not int(xml.find('pagination/results').text):
|
||||||
|
@ -126,14 +118,10 @@ class PodnapisiProvider(Provider):
|
||||||
# loop over subtitles
|
# loop over subtitles
|
||||||
for subtitle_xml in xml.findall('subtitle'):
|
for subtitle_xml in xml.findall('subtitle'):
|
||||||
# read xml elements
|
# read xml elements
|
||||||
pid = subtitle_xml.find('pid').text
|
|
||||||
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
|
|
||||||
if pid in pids:
|
|
||||||
continue
|
|
||||||
|
|
||||||
language = Language.fromietf(subtitle_xml.find('language').text)
|
language = Language.fromietf(subtitle_xml.find('language').text)
|
||||||
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
|
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
|
||||||
page_link = subtitle_xml.find('url').text
|
page_link = subtitle_xml.find('url').text
|
||||||
|
pid = subtitle_xml.find('pid').text
|
||||||
releases = []
|
releases = []
|
||||||
if subtitle_xml.find('release').text:
|
if subtitle_xml.find('release').text:
|
||||||
for release in subtitle_xml.find('release').text.split():
|
for release in subtitle_xml.find('release').text.split():
|
||||||
|
@ -146,11 +134,15 @@ class PodnapisiProvider(Provider):
|
||||||
year = int(subtitle_xml.find('year').text)
|
year = int(subtitle_xml.find('year').text)
|
||||||
|
|
||||||
if is_episode:
|
if is_episode:
|
||||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
|
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
|
||||||
season=season, episode=episode, year=year)
|
season=season, episode=episode, year=year)
|
||||||
else:
|
else:
|
||||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
|
subtitle = PodnapisiSubtitle(language, hearing_impaired, page_link, pid, releases, title,
|
||||||
year=year)
|
year=year)
|
||||||
|
|
||||||
|
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
|
||||||
|
if pid in pids:
|
||||||
|
continue
|
||||||
|
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
@ -167,21 +159,11 @@ class PodnapisiProvider(Provider):
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
||||||
def list_subtitles(self, video, languages):
|
def list_subtitles(self, video, languages):
|
||||||
season = episode = None
|
|
||||||
if isinstance(video, Episode):
|
if isinstance(video, Episode):
|
||||||
titles = [video.series] + video.alternative_series
|
return [s for l in languages for s in self.query(l, video.series, season=video.season,
|
||||||
season = video.season
|
episode=video.episode, year=video.year)]
|
||||||
episode = video.episode
|
elif isinstance(video, Movie):
|
||||||
else:
|
return [s for l in languages for s in self.query(l, video.title, year=video.year)]
|
||||||
titles = [video.title] + video.alternative_titles
|
|
||||||
|
|
||||||
for title in titles:
|
|
||||||
subtitles = [s for l in languages for s in
|
|
||||||
self.query(l, title, season=season, episode=episode, year=video.year)]
|
|
||||||
if subtitles:
|
|
||||||
return subtitles
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
def download_subtitle(self, subtitle):
|
def download_subtitle(self, subtitle):
|
||||||
# download as a zip
|
# download as a zip
|
||||||
|
|
|
@ -42,10 +42,6 @@ class ShooterProvider(Provider):
|
||||||
"""Shooter Provider."""
|
"""Shooter Provider."""
|
||||||
languages = {Language(l) for l in ['eng', 'zho']}
|
languages = {Language(l) for l in ['eng', 'zho']}
|
||||||
server_url = 'https://www.shooter.cn/api/subapi.php'
|
server_url = 'https://www.shooter.cn/api/subapi.php'
|
||||||
subtitle_class = ShooterSubtitle
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
|
@ -68,7 +64,7 @@ class ShooterProvider(Provider):
|
||||||
|
|
||||||
# parse the subtitles
|
# parse the subtitles
|
||||||
results = json.loads(r.text)
|
results = json.loads(r.text)
|
||||||
subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']]
|
subtitles = [ShooterSubtitle(language, hash, t['Link']) for s in results for t in s['Files']]
|
||||||
|
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
||||||
|
|
|
@ -26,7 +26,7 @@ class SubsCenterSubtitle(Subtitle):
|
||||||
provider_name = 'subscenter'
|
provider_name = 'subscenter'
|
||||||
|
|
||||||
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
|
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
|
||||||
subtitle_version, downloaded, releases):
|
downloaded, releases):
|
||||||
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
|
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
|
||||||
self.series = series
|
self.series = series
|
||||||
self.season = season
|
self.season = season
|
||||||
|
@ -34,7 +34,6 @@ class SubsCenterSubtitle(Subtitle):
|
||||||
self.title = title
|
self.title = title
|
||||||
self.subtitle_id = subtitle_id
|
self.subtitle_id = subtitle_id
|
||||||
self.subtitle_key = subtitle_key
|
self.subtitle_key = subtitle_key
|
||||||
self.subtitle_version = subtitle_version
|
|
||||||
self.downloaded = downloaded
|
self.downloaded = downloaded
|
||||||
self.releases = releases
|
self.releases = releases
|
||||||
|
|
||||||
|
@ -75,8 +74,7 @@ class SubsCenterSubtitle(Subtitle):
|
||||||
class SubsCenterProvider(Provider):
|
class SubsCenterProvider(Provider):
|
||||||
"""SubsCenter Provider."""
|
"""SubsCenter Provider."""
|
||||||
languages = {Language.fromalpha2(l) for l in ['he']}
|
languages = {Language.fromalpha2(l) for l in ['he']}
|
||||||
server_url = 'http://www.subscenter.org/he/'
|
server_url = 'http://www.subscenter.co/he/'
|
||||||
subtitle_class = SubsCenterSubtitle
|
|
||||||
|
|
||||||
def __init__(self, username=None, password=None):
|
def __init__(self, username=None, password=None):
|
||||||
if username is not None and password is None or username is None and password is not None:
|
if username is not None and password is None or username is None and password is not None:
|
||||||
|
@ -191,7 +189,6 @@ class SubsCenterProvider(Provider):
|
||||||
hearing_impaired = bool(subtitle_item['hearing_impaired'])
|
hearing_impaired = bool(subtitle_item['hearing_impaired'])
|
||||||
subtitle_id = subtitle_item['id']
|
subtitle_id = subtitle_item['id']
|
||||||
subtitle_key = subtitle_item['key']
|
subtitle_key = subtitle_item['key']
|
||||||
subtitle_version = subtitle_item['h_version']
|
|
||||||
downloaded = subtitle_item['downloaded']
|
downloaded = subtitle_item['downloaded']
|
||||||
release = subtitle_item['subtitle_version']
|
release = subtitle_item['subtitle_version']
|
||||||
|
|
||||||
|
@ -203,9 +200,8 @@ class SubsCenterProvider(Provider):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# otherwise create it
|
# otherwise create it
|
||||||
subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
|
subtitle = SubsCenterSubtitle(language, hearing_impaired, page_link, title, season, episode,
|
||||||
title, subtitle_id, subtitle_key, subtitle_version, downloaded,
|
title, subtitle_id, subtitle_key, downloaded, [release])
|
||||||
[release])
|
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
subtitles[subtitle_id] = subtitle
|
subtitles[subtitle_id] = subtitle
|
||||||
|
|
||||||
|
@ -225,19 +221,15 @@ class SubsCenterProvider(Provider):
|
||||||
def download_subtitle(self, subtitle):
|
def download_subtitle(self, subtitle):
|
||||||
# download
|
# download
|
||||||
url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
|
url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
|
||||||
params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
|
params = {'v': subtitle.releases[0], 'key': subtitle.subtitle_key}
|
||||||
r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
|
r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
||||||
# open the zip
|
# open the zip
|
||||||
try:
|
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
|
||||||
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
|
# remove some filenames from the namelist
|
||||||
# remove some filenames from the namelist
|
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
|
||||||
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
|
if len(namelist) > 1:
|
||||||
if len(namelist) > 1:
|
raise ProviderError('More than one file to unzip')
|
||||||
raise ProviderError('More than one file to unzip')
|
|
||||||
|
|
||||||
subtitle.content = fix_line_ending(zf.read(namelist[0]))
|
subtitle.content = fix_line_ending(zf.read(namelist[0]))
|
||||||
except zipfile.BadZipfile:
|
|
||||||
# if no zip file was retrieved, daily downloads limit has exceeded
|
|
||||||
raise ProviderError('Daily limit exceeded')
|
|
||||||
|
|
|
@ -40,10 +40,6 @@ class TheSubDBProvider(Provider):
|
||||||
languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
|
languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
|
||||||
required_hash = 'thesubdb'
|
required_hash = 'thesubdb'
|
||||||
server_url = 'http://api.thesubdb.com/'
|
server_url = 'http://api.thesubdb.com/'
|
||||||
subtitle_class = TheSubDBSubtitle
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
|
@ -70,7 +66,7 @@ class TheSubDBProvider(Provider):
|
||||||
for language_code in r.text.split(','):
|
for language_code in r.text.split(','):
|
||||||
language = Language.fromthesubdb(language_code)
|
language = Language.fromthesubdb(language_code)
|
||||||
|
|
||||||
subtitle = self.subtitle_class(language, hash)
|
subtitle = TheSubDBSubtitle(language, hash)
|
||||||
logger.debug('Found subtitle %r', subtitle)
|
logger.debug('Found subtitle %r', subtitle)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
|
||||||
|
|
|
@ -47,8 +47,7 @@ class TVsubtitlesSubtitle(Subtitle):
|
||||||
matches = set()
|
matches = set()
|
||||||
|
|
||||||
# series
|
# series
|
||||||
if video.series and (sanitize(self.series) in (
|
if video.series and sanitize(self.series) == sanitize(video.series):
|
||||||
sanitize(name) for name in [video.series] + video.alternative_series)):
|
|
||||||
matches.add('series')
|
matches.add('series')
|
||||||
# season
|
# season
|
||||||
if video.season and self.season == video.season:
|
if video.season and self.season == video.season:
|
||||||
|
@ -81,10 +80,6 @@ class TVsubtitlesProvider(Provider):
|
||||||
]}
|
]}
|
||||||
video_types = (Episode,)
|
video_types = (Episode,)
|
||||||
server_url = 'http://www.tvsubtitles.net/'
|
server_url = 'http://www.tvsubtitles.net/'
|
||||||
subtitle_class = TVsubtitlesSubtitle
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.session = None
|
|
||||||
|
|
||||||
def initialize(self):
|
def initialize(self):
|
||||||
self.session = Session()
|
self.session = Session()
|
||||||
|
@ -163,7 +158,13 @@ class TVsubtitlesProvider(Provider):
|
||||||
|
|
||||||
return episode_ids
|
return episode_ids
|
||||||
|
|
||||||
def query(self, show_id, series, season, episode, year=None):
|
def query(self, series, season, episode, year=None):
|
||||||
|
# search the show id
|
||||||
|
show_id = self.search_show_id(series, year)
|
||||||
|
if show_id is None:
|
||||||
|
logger.error('No show id found for %r (%r)', series, {'year': year})
|
||||||
|
return []
|
||||||
|
|
||||||
# get the episode ids
|
# get the episode ids
|
||||||
episode_ids = self.get_episode_ids(show_id, season)
|
episode_ids = self.get_episode_ids(show_id, season)
|
||||||
if episode not in episode_ids:
|
if episode not in episode_ids:
|
||||||
|
@ -183,9 +184,9 @@ class TVsubtitlesProvider(Provider):
|
||||||
subtitle_id = int(row.parent['href'][10:-5])
|
subtitle_id = int(row.parent['href'][10:-5])
|
||||||
page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
|
page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
|
||||||
rip = row.find('p', title='rip').text.strip() or None
|
rip = row.find('p', title='rip').text.strip() or None
|
||||||
release = row.find('h5').text.strip() or None
|
release = row.find('p', title='release').text.strip() or None
|
||||||
|
|
||||||
subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
|
subtitle = TVsubtitlesSubtitle(language, page_link, subtitle_id, series, season, episode, year, rip,
|
||||||
release)
|
release)
|
||||||
logger.debug('Found subtitle %s', subtitle)
|
logger.debug('Found subtitle %s', subtitle)
|
||||||
subtitles.append(subtitle)
|
subtitles.append(subtitle)
|
||||||
|
@ -193,24 +194,7 @@ class TVsubtitlesProvider(Provider):
|
||||||
return subtitles
|
return subtitles
|
||||||
|
|
||||||
def list_subtitles(self, video, languages):
|
def list_subtitles(self, video, languages):
|
||||||
# lookup show_id
|
return [s for s in self.query(video.series, video.season, video.episode, video.year) if s.language in languages]
|
||||||
titles = [video.series] + video.alternative_series
|
|
||||||
show_id = None
|
|
||||||
for title in titles:
|
|
||||||
show_id = self.search_show_id(title, video.year)
|
|
||||||
if show_id is not None:
|
|
||||||
break
|
|
||||||
|
|
||||||
# query for subtitles with the show_id
|
|
||||||
if show_id is not None:
|
|
||||||
subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
|
|
||||||
if s.language in languages and s.episode == video.episode]
|
|
||||||
if subtitles:
|
|
||||||
return subtitles
|
|
||||||
else:
|
|
||||||
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|
||||||
def download_subtitle(self, subtitle):
|
def download_subtitle(self, subtitle):
|
||||||
# download as a zip
|
# download as a zip
|
||||||
|
|
|
@ -3,7 +3,7 @@ from datetime import datetime, timedelta
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import _strptime
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from .. import __short_version__
|
from .. import __short_version__
|
||||||
|
@ -331,7 +331,6 @@ def refine(video, **kwargs):
|
||||||
# add series information
|
# add series information
|
||||||
logger.debug('Found series %r', series)
|
logger.debug('Found series %r', series)
|
||||||
video.series = matching_result['match']['series']
|
video.series = matching_result['match']['series']
|
||||||
video.alternative_series.extend(series['aliases'])
|
|
||||||
video.year = matching_result['match']['year']
|
video.year = matching_result['match']['year']
|
||||||
video.original_series = matching_result['match']['original_series']
|
video.original_series = matching_result['match']['original_series']
|
||||||
video.series_tvdb_id = series['id']
|
video.series_tvdb_id = series['id']
|
||||||
|
|
|
@ -44,7 +44,7 @@ movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
|
||||||
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
|
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
|
||||||
|
|
||||||
#: Equivalent release groups
|
#: Equivalent release groups
|
||||||
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})
|
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'})
|
||||||
|
|
||||||
|
|
||||||
def get_equivalent_release_groups(release_group):
|
def get_equivalent_release_groups(release_group):
|
||||||
|
|
|
@ -208,14 +208,8 @@ def guess_matches(video, guess, partial=False):
|
||||||
if video.season and 'season' in guess and guess['season'] == video.season:
|
if video.season and 'season' in guess and guess['season'] == video.season:
|
||||||
matches.add('season')
|
matches.add('season')
|
||||||
# episode
|
# episode
|
||||||
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
|
if video.episode and 'episode' in guess and guess['episode'] == video.episode:
|
||||||
# Most providers only support single-ep, so make sure it contains only 1 episode
|
matches.add('episode')
|
||||||
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
|
|
||||||
if video.episode and 'episode' in guess:
|
|
||||||
episode_guess = guess['episode']
|
|
||||||
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
|
|
||||||
if episode == video.episode:
|
|
||||||
matches.add('episode')
|
|
||||||
# year
|
# year
|
||||||
if video.year and 'year' in guess and guess['year'] == video.year:
|
if video.year and 'year' in guess and guess['year'] == video.year:
|
||||||
matches.add('year')
|
matches.add('year')
|
||||||
|
@ -258,4 +252,4 @@ def fix_line_ending(content):
|
||||||
:rtype: bytes
|
:rtype: bytes
|
||||||
|
|
||||||
"""
|
"""
|
||||||
return content.replace(b'\r\n', b'\n')
|
return content.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
|
||||||
|
|
|
@ -13,9 +13,9 @@ VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.
|
||||||
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
|
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
|
||||||
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
|
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
|
||||||
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
|
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
|
||||||
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
|
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm' '.ogv', '.omf',
|
||||||
'.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
|
'.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
|
||||||
'.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
|
'.vob', '.vro', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
|
||||||
|
|
||||||
|
|
||||||
class Video(object):
|
class Video(object):
|
||||||
|
@ -123,12 +123,11 @@ class Episode(Video):
|
||||||
:param int year: year of the series.
|
:param int year: year of the series.
|
||||||
:param bool original_series: whether the series is the first with this name.
|
:param bool original_series: whether the series is the first with this name.
|
||||||
:param int tvdb_id: TVDB id of the episode.
|
:param int tvdb_id: TVDB id of the episode.
|
||||||
:param list alternative_series: alternative names of the series
|
|
||||||
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
|
def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
|
||||||
series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
|
series_tvdb_id=None, series_imdb_id=None, **kwargs):
|
||||||
super(Episode, self).__init__(name, **kwargs)
|
super(Episode, self).__init__(name, **kwargs)
|
||||||
|
|
||||||
#: Series of the episode
|
#: Series of the episode
|
||||||
|
@ -158,9 +157,6 @@ class Episode(Video):
|
||||||
#: IMDb id of the series
|
#: IMDb id of the series
|
||||||
self.series_imdb_id = series_imdb_id
|
self.series_imdb_id = series_imdb_id
|
||||||
|
|
||||||
#: Alternative names of the series
|
|
||||||
self.alternative_series = alternative_series or []
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def fromguess(cls, name, guess):
|
def fromguess(cls, name, guess):
|
||||||
if guess['type'] != 'episode':
|
if guess['type'] != 'episode':
|
||||||
|
@ -169,13 +165,7 @@ class Episode(Video):
|
||||||
if 'title' not in guess or 'episode' not in guess:
|
if 'title' not in guess or 'episode' not in guess:
|
||||||
raise ValueError('Insufficient data to process the guess')
|
raise ValueError('Insufficient data to process the guess')
|
||||||
|
|
||||||
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
|
return cls(name, guess['title'], guess.get('season', 1), guess['episode'], title=guess.get('episode_title'),
|
||||||
# Most providers only support single-ep, so make sure it contains only 1 episode
|
|
||||||
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
|
|
||||||
episode_guess = guess.get('episode')
|
|
||||||
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
|
|
||||||
|
|
||||||
return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
|
|
||||||
year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
|
year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
|
||||||
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
|
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
|
||||||
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
|
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
|
||||||
|
@ -196,11 +186,10 @@ class Movie(Video):
|
||||||
|
|
||||||
:param str title: title of the movie.
|
:param str title: title of the movie.
|
||||||
:param int year: year of the movie.
|
:param int year: year of the movie.
|
||||||
:param list alternative_titles: alternative titles of the movie
|
|
||||||
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
|
def __init__(self, name, title, year=None, **kwargs):
|
||||||
super(Movie, self).__init__(name, **kwargs)
|
super(Movie, self).__init__(name, **kwargs)
|
||||||
|
|
||||||
#: Title of the movie
|
#: Title of the movie
|
||||||
|
@ -209,9 +198,6 @@ class Movie(Video):
|
||||||
#: Year of the movie
|
#: Year of the movie
|
||||||
self.year = year
|
self.year = year
|
||||||
|
|
||||||
#: Alternative titles of the movie
|
|
||||||
self.alternative_titles = alternative_titles or []
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def fromguess(cls, name, guess):
|
def fromguess(cls, name, guess):
|
||||||
if guess['type'] != 'movie':
|
if guess['type'] != 'movie':
|
||||||
|
@ -220,13 +206,9 @@ class Movie(Video):
|
||||||
if 'title' not in guess:
|
if 'title' not in guess:
|
||||||
raise ValueError('Insufficient data to process the guess')
|
raise ValueError('Insufficient data to process the guess')
|
||||||
|
|
||||||
alternative_titles = []
|
|
||||||
if 'alternative_title' in guess:
|
|
||||||
alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))
|
|
||||||
|
|
||||||
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
|
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
|
||||||
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
|
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
|
||||||
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
|
audio_codec=guess.get('audio_codec'), year=guess.get('year'))
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def fromname(cls, name):
|
def fromname(cls, name):
|
||||||
|
|
|
@ -10,7 +10,7 @@ import time
|
||||||
import operator
|
import operator
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
from httplib import ResponseNotReady
|
from http.client import ResponseNotReady
|
||||||
|
|
||||||
import rarfile
|
import rarfile
|
||||||
import requests
|
import requests
|
||||||
|
@ -21,14 +21,13 @@ from babelfish import LanguageReverseError
|
||||||
from guessit.jsonutils import GuessitEncoder
|
from guessit.jsonutils import GuessitEncoder
|
||||||
from subliminal import ProviderError, refiner_manager
|
from subliminal import ProviderError, refiner_manager
|
||||||
|
|
||||||
from extensions import provider_registry
|
from subliminal_patch.extensions import provider_registry
|
||||||
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded
|
|
||||||
from subliminal.score import compute_score as default_compute_score
|
from subliminal.score import compute_score as default_compute_score
|
||||||
from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
|
from subliminal.utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
|
||||||
from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
|
from subliminal.video import VIDEO_EXTENSIONS, Video, Episode, Movie
|
||||||
from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
|
from subliminal.core import guessit, ProviderPool, io, is_windows_special_path, \
|
||||||
ThreadPoolExecutor, check_video
|
ThreadPoolExecutor, check_video
|
||||||
from subliminal_patch.exceptions import TooManyRequests, APIThrottled
|
from subliminal_patch.exceptions import TooManyRequests, APIThrottled, ServiceUnavailable, DownloadLimitExceeded
|
||||||
|
|
||||||
from subzero.language import Language
|
from subzero.language import Language
|
||||||
from scandir import scandir, scandir_generic as _scandir_generic
|
from scandir import scandir, scandir_generic as _scandir_generic
|
||||||
|
@ -186,7 +185,7 @@ class SZProviderPool(ProviderPool):
|
||||||
except (requests.Timeout, socket.timeout):
|
except (requests.Timeout, socket.timeout):
|
||||||
logger.error('Provider %r timed out', provider)
|
logger.error('Provider %r timed out', provider)
|
||||||
|
|
||||||
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
|
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
|
||||||
self.throttle_callback(provider, e)
|
self.throttle_callback(provider, e)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -283,7 +282,7 @@ class SZProviderPool(ProviderPool):
|
||||||
logger.debug("RAR Traceback: %s", traceback.format_exc())
|
logger.debug("RAR Traceback: %s", traceback.format_exc())
|
||||||
return False
|
return False
|
||||||
|
|
||||||
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled), e:
|
except (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled) as e:
|
||||||
self.throttle_callback(subtitle.provider_name, e)
|
self.throttle_callback(subtitle.provider_name, e)
|
||||||
self.discarded_providers.add(subtitle.provider_name)
|
self.discarded_providers.add(subtitle.provider_name)
|
||||||
return False
|
return False
|
||||||
|
@ -648,7 +647,7 @@ def search_external_subtitles(path, languages=None, only_one=False):
|
||||||
abspath = unicode(os.path.abspath(
|
abspath = unicode(os.path.abspath(
|
||||||
os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder,
|
os.path.join(*[video_path if not os.path.isabs(folder_or_subfolder) else "", folder_or_subfolder,
|
||||||
video_filename])))
|
video_filename])))
|
||||||
except Exception, e:
|
except Exception as e:
|
||||||
logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e)
|
logger.error("skipping path %s because of %s", repr(folder_or_subfolder), e)
|
||||||
continue
|
continue
|
||||||
logger.debug("external subs: scanning path %s", abspath)
|
logger.debug("external subs: scanning path %s", abspath)
|
||||||
|
|
|
@ -9,3 +9,13 @@ class TooManyRequests(ProviderError):
|
||||||
|
|
||||||
class APIThrottled(ProviderError):
|
class APIThrottled(ProviderError):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ServiceUnavailable(ProviderError):
|
||||||
|
"""Exception raised when status is '503 Service Unavailable'."""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class DownloadLimitExceeded(ProviderError):
|
||||||
|
"""Exception raised by providers when download limit is exceeded."""
|
||||||
|
pass
|
||||||
|
|
|
@ -8,7 +8,7 @@ import os
|
||||||
import socket
|
import socket
|
||||||
import logging
|
import logging
|
||||||
import requests
|
import requests
|
||||||
import xmlrpclib
|
import xmlrpc.client
|
||||||
import dns.resolver
|
import dns.resolver
|
||||||
import ipaddress
|
import ipaddress
|
||||||
import re
|
import re
|
||||||
|
@ -16,7 +16,7 @@ import re
|
||||||
from requests import exceptions
|
from requests import exceptions
|
||||||
from urllib3.util import connection
|
from urllib3.util import connection
|
||||||
from retry.api import retry_call
|
from retry.api import retry_call
|
||||||
from exceptions import APIThrottled
|
from .exceptions import APIThrottled
|
||||||
from dogpile.cache.api import NO_VALUE
|
from dogpile.cache.api import NO_VALUE
|
||||||
from subliminal.cache import region
|
from subliminal.cache import region
|
||||||
from subliminal_patch.pitcher import pitchers
|
from subliminal_patch.pitcher import pitchers
|
||||||
|
@ -32,10 +32,8 @@ try:
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
from subzero.lib.io import get_viable_encoding
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(unicode(__file__, get_viable_encoding()))), "..", certifi.where()))
|
pem_file = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", certifi.where()))
|
||||||
try:
|
try:
|
||||||
default_ssl_context = ssl.create_default_context(cafile=pem_file)
|
default_ssl_context = ssl.create_default_context(cafile=pem_file)
|
||||||
except AttributeError:
|
except AttributeError:
|
||||||
|
@ -99,7 +97,7 @@ class CFSession(CloudScraper):
|
||||||
# Solve Challenge
|
# Solve Challenge
|
||||||
resp = self.sendChallengeResponse(resp, **kwargs)
|
resp = self.sendChallengeResponse(resp, **kwargs)
|
||||||
|
|
||||||
except ValueError, e:
|
except ValueError as e:
|
||||||
if e.message == "Captcha":
|
if e.message == "Captcha":
|
||||||
parsed_url = urlparse(url)
|
parsed_url = urlparse(url)
|
||||||
domain = parsed_url.netloc
|
domain = parsed_url.netloc
|
||||||
|
@ -231,7 +229,7 @@ class RetryingCFSession(RetryingSession, CFSession):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
|
class SubZeroRequestsTransport(xmlrpc.client.SafeTransport):
|
||||||
"""
|
"""
|
||||||
Drop in Transport for xmlrpclib that uses Requests instead of httplib
|
Drop in Transport for xmlrpclib that uses Requests instead of httplib
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ from subliminal.cache import region
|
||||||
from dogpile.cache.api import NO_VALUE
|
from dogpile.cache.api import NO_VALUE
|
||||||
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
|
from python_anticaptcha import AnticaptchaClient, NoCaptchaTaskProxylessTask, NoCaptchaTask, AnticaptchaException,\
|
||||||
Proxy
|
Proxy
|
||||||
from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TOKEN_TIMEOUT
|
from deathbycaptcha import SocketClient as DBCClient, DEFAULT_TIMEOUT
|
||||||
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -185,7 +185,7 @@ class DBCProxyLessPitcher(Pitcher):
|
||||||
password = None
|
password = None
|
||||||
|
|
||||||
def __init__(self, website_name, website_url, website_key,
|
def __init__(self, website_name, website_url, website_key,
|
||||||
timeout=DEFAULT_TOKEN_TIMEOUT, tries=3, *args, **kwargs):
|
timeout=DEFAULT_TIMEOUT, tries=3, *args, **kwargs):
|
||||||
super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries)
|
super(DBCProxyLessPitcher, self).__init__(website_name, website_url, website_key, tries=tries)
|
||||||
|
|
||||||
self.username, self.password = self.client_key.split(":", 1)
|
self.username, self.password = self.client_key.split(":", 1)
|
||||||
|
|
|
@ -5,7 +5,7 @@ import datetime
|
||||||
from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
|
from subliminal.refiners.tvdb import Episode, logger, search_series, series_re, sanitize, get_series, \
|
||||||
get_series_episode, region, tvdb_client
|
get_series_episode, region, tvdb_client
|
||||||
|
|
||||||
from util import fix_session_bases
|
from .util import fix_session_bases
|
||||||
|
|
||||||
TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()
|
TVDB_SEASON_EXPIRATION_TIME = datetime.timedelta(days=1).total_seconds()
|
||||||
|
|
||||||
|
|
|
@ -272,9 +272,9 @@ class Subtitle(Subtitle_):
|
||||||
def prepare_text(text, style):
|
def prepare_text(text, style):
|
||||||
body = []
|
body = []
|
||||||
for fragment, sty in parse_tags(text, style, sub.styles):
|
for fragment, sty in parse_tags(text, style, sub.styles):
|
||||||
fragment = fragment.replace(ur"\h", u" ")
|
fragment = fragment.replace(r"\h", u" ")
|
||||||
fragment = fragment.replace(ur"\n", u"\n")
|
fragment = fragment.replace(r"\n", u"\n")
|
||||||
fragment = fragment.replace(ur"\N", u"\n")
|
fragment = fragment.replace(r"\N", u"\n")
|
||||||
if format == "srt":
|
if format == "srt":
|
||||||
if sty.italic:
|
if sty.italic:
|
||||||
fragment = u"<i>%s</i>" % fragment
|
fragment = u"<i>%s</i>" % fragment
|
||||||
|
|
|
@ -1,2 +1,8 @@
|
||||||
|
|
||||||
import dict, geezip, httpfake, io, json, rar, which
|
from .dict import *
|
||||||
|
from .geezip import *
|
||||||
|
from .httpfake import *
|
||||||
|
from .io import *
|
||||||
|
from .json import *
|
||||||
|
from .rar import *
|
||||||
|
from .which import *
|
|
@ -28,7 +28,7 @@ class GeezipFile(gzip.GzipFile):
|
||||||
fileobj.write(self.compress.flush(Z_FINISH))
|
fileobj.write(self.compress.flush(Z_FINISH))
|
||||||
gzip.write32u(fileobj, self.crc)
|
gzip.write32u(fileobj, self.crc)
|
||||||
# self.size may exceed 2GB, or even 4GB
|
# self.size may exceed 2GB, or even 4GB
|
||||||
gzip.write32u(fileobj, self.size & 0xffffffffL)
|
gzip.write32u(fileobj, self.size & 0xffffffff)
|
||||||
fileobj.flush()
|
fileobj.flush()
|
||||||
finally:
|
finally:
|
||||||
myfileobj = self.myfileobj
|
myfileobj = self.myfileobj
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
from registry import registry
|
from .registry import registry
|
||||||
from mods import hearing_impaired, ocr_fixes, fps, offset, common, color
|
from .mods import hearing_impaired, ocr_fixes, fps, offset, common, color
|
||||||
from main import SubtitleModifications, SubMod
|
from .main import SubtitleModifications, SubMod
|
||||||
|
|
|
@ -1,3 +1,3 @@
|
||||||
# coding=utf-8
|
# coding=utf-8
|
||||||
|
|
||||||
from data import data
|
from .data import data
|
File diff suppressed because one or more lines are too long
|
@ -6,14 +6,14 @@ import pysubs2
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
|
from .mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
|
||||||
from registry import registry
|
from .registry import registry
|
||||||
from subzero.language import Language
|
from subzero.language import Language
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
lowercase_re = re.compile(ur'(?sux)[a-zà-ž]')
|
lowercase_re = re.compile(r'(?sux)[a-zà-ž]')
|
||||||
|
|
||||||
|
|
||||||
class SubtitleModifications(object):
|
class SubtitleModifications(object):
|
||||||
|
@ -143,7 +143,7 @@ class SubtitleModifications(object):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# clear empty args
|
# clear empty args
|
||||||
final_mod_args = dict(filter(lambda (k, v): bool(v), args.iteritems()))
|
final_mod_args = dict(filter(lambda kv: bool(kv[1]), args.iteritems()))
|
||||||
|
|
||||||
_data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
|
_data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
|
||||||
if _data == mods_merged_log[identifier]["final_identifier"]:
|
if _data == mods_merged_log[identifier]["final_identifier"]:
|
||||||
|
@ -180,7 +180,7 @@ class SubtitleModifications(object):
|
||||||
entries_used = 0
|
entries_used = 0
|
||||||
for entry in self.f:
|
for entry in self.f:
|
||||||
entry_used = False
|
entry_used = False
|
||||||
for sub in entry.text.strip().split("\N"):
|
for sub in entry.text.strip().split(r"\N"):
|
||||||
# skip HI bracket entries, those might actually be lowercase
|
# skip HI bracket entries, those might actually be lowercase
|
||||||
sub = sub.strip()
|
sub = sub.strip()
|
||||||
for processor in registry.mods["remove_HI"].processors[:4]:
|
for processor in registry.mods["remove_HI"].processors[:4]:
|
||||||
|
@ -272,7 +272,7 @@ class SubtitleModifications(object):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
skip_entry = False
|
skip_entry = False
|
||||||
for line in t.split(ur"\N"):
|
for line in t.split(r"\N"):
|
||||||
# don't bother the mods with surrounding tags
|
# don't bother the mods with surrounding tags
|
||||||
old_line = line
|
old_line = line
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
@ -377,7 +377,7 @@ class SubtitleModifications(object):
|
||||||
logger.debug(u"%d: %r -> ''", index, entry.text)
|
logger.debug(u"%d: %r -> ''", index, entry.text)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_text = ur"\N".join(lines)
|
new_text = r"\N".join(lines)
|
||||||
|
|
||||||
# cheap man's approach to avoid open tags
|
# cheap man's approach to avoid open tags
|
||||||
add_start_tags = []
|
add_start_tags = []
|
||||||
|
|
|
@ -95,7 +95,7 @@ class SubtitleTextModification(SubtitleModification):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
TAG = ur"(?:\s*{\\[iusb][0-1]}\s*)*"
|
TAG = r"(?:\s*{\\[iusb][0-1]}\s*)*"
|
||||||
EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")
|
EMPTY_TAG_PROCESSOR = ReProcessor(re.compile(r'({\\\w1})[\s.,-_!?]*({\\\w0})'), "", name="empty_tag")
|
||||||
|
|
||||||
empty_line_post_processors = [
|
empty_line_post_processors = [
|
||||||
|
|
|
@ -22,10 +22,10 @@ class CommonFixes(SubtitleTextModification):
|
||||||
|
|
||||||
processors = [
|
processors = [
|
||||||
# normalize hyphens
|
# normalize hyphens
|
||||||
NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
|
NReProcessor(re.compile(r'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"),
|
||||||
|
|
||||||
# -- = em dash
|
# -- = em dash
|
||||||
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),
|
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), r"\1—", name="CM_multidash"),
|
||||||
|
|
||||||
# line = _/-/\s
|
# line = _/-/\s
|
||||||
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
|
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
|
||||||
|
@ -37,23 +37,23 @@ class CommonFixes(SubtitleTextModification):
|
||||||
NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
|
NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
|
||||||
|
|
||||||
# fix music symbols
|
# fix music symbols
|
||||||
NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
|
NReProcessor(re.compile(r'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
|
||||||
lambda x: u"♪ " if x.group(1) else u" ♪",
|
lambda x: u"♪ " if x.group(1) else u" ♪",
|
||||||
name="CM_music_symbols"),
|
name="CM_music_symbols"),
|
||||||
|
|
||||||
# '' = "
|
# '' = "
|
||||||
NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
|
NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"),
|
||||||
|
|
||||||
# double quotes instead of single quotes inside words
|
# double quotes instead of single quotes inside words
|
||||||
NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"),
|
NReProcessor(re.compile(r'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), r"\1'\2", name="CM_double_as_single"),
|
||||||
|
|
||||||
# normalize quotes
|
# normalize quotes
|
||||||
NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
|
NReProcessor(re.compile(r'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'),
|
||||||
lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
|
lambda match: '"' + (" " if match.group(2).endswith(" ") else ""),
|
||||||
name="CM_normalize_quotes"),
|
name="CM_normalize_quotes"),
|
||||||
|
|
||||||
# normalize single quotes
|
# normalize single quotes
|
||||||
NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
|
NReProcessor(re.compile(r'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"),
|
||||||
|
|
||||||
# remove leading ...
|
# remove leading ...
|
||||||
NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
|
NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"),
|
||||||
|
@ -89,8 +89,8 @@ class CommonFixes(SubtitleTextModification):
|
||||||
# space before ending doublequote?
|
# space before ending doublequote?
|
||||||
|
|
||||||
# replace uppercase I with lowercase L in words
|
# replace uppercase I with lowercase L in words
|
||||||
NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'),
|
NReProcessor(re.compile(r'(?u)([a-zà-ž]+)(I+)'),
|
||||||
lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))),
|
lambda match: r'%s%s' % (match.group(1), "l" * len(match.group(2))),
|
||||||
name="CM_uppercase_i_in_word"),
|
name="CM_uppercase_i_in_word"),
|
||||||
|
|
||||||
# fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
|
# fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be
|
||||||
|
@ -101,11 +101,11 @@ class CommonFixes(SubtitleTextModification):
|
||||||
name="CM_spaces_in_numbers"),
|
name="CM_spaces_in_numbers"),
|
||||||
|
|
||||||
# uppercase after dot
|
# uppercase after dot
|
||||||
NReProcessor(re.compile(ur'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
|
NReProcessor(re.compile(r'(?u)((?<!(?=\s*[A-ZÀ-Ž-_0-9.]\s*))(?:[^.\s])+\.\s+)([a-zà-ž])'),
|
||||||
lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
|
lambda match: r'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
|
||||||
|
|
||||||
# remove double interpunction
|
# remove double interpunction
|
||||||
NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
|
NReProcessor(re.compile(r'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
|
||||||
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
|
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
|
||||||
name="CM_double_interpunct"),
|
name="CM_double_interpunct"),
|
||||||
|
|
||||||
|
@ -149,14 +149,14 @@ class ReverseRTL(SubtitleModification):
|
||||||
|
|
||||||
processors = [
|
processors = [
|
||||||
# new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
|
# new? (?u)(^([\s.!?]*)(.+?)(\s*)(-?\s*)$); \5\4\3\2
|
||||||
#NReProcessor(re.compile(ur"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
|
#NReProcessor(re.compile(r"(?u)((?=(?<=\b|^)|(?<=\s))([.!?-]+)([^.!?-]+)(?=\b|$|\s))"), r"\3\2",
|
||||||
# name="CM_RTL_reverse")
|
# name="CM_RTL_reverse")
|
||||||
NReProcessor(re.compile(ur"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
|
NReProcessor(re.compile(r"(?u)(^([\s.!?:,'-]*)(.+?)(\s*)(-?\s*)$)"), r"\5\4\3\2",
|
||||||
name="CM_RTL_reverse")
|
name="CM_RTL_reverse")
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
split_upper_re = re.compile(ur"(\s*[.!?♪\-]\s*)")
|
split_upper_re = re.compile(r"(\s*[.!?♪\-]\s*)")
|
||||||
|
|
||||||
|
|
||||||
class FixUppercase(SubtitleModification):
|
class FixUppercase(SubtitleModification):
|
||||||
|
|
|
@ -26,71 +26,71 @@ class HearingImpaired(SubtitleTextModification):
|
||||||
|
|
||||||
processors = [
|
processors = [
|
||||||
# full bracket entry, single or multiline; starting with brackets and ending with brackets
|
# full bracket entry, single or multiline; starting with brackets and ending with brackets
|
||||||
FullBracketEntryProcessor(re.compile(ur'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
|
FullBracketEntryProcessor(re.compile(r'(?sux)^-?%(t)s[([].+(?=[^)\]]{3,}).+[)\]]%(t)s$' % {"t": TAG}),
|
||||||
"", name="HI_brackets_full"),
|
"", name="HI_brackets_full"),
|
||||||
|
|
||||||
# uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
|
# uppercase text before colon (at least 3 uppercase chars); at start or after a sentence,
|
||||||
# possibly with a dash in front; ignore anything ending with a quote
|
# possibly with a dash in front; ignore anything ending with a quote
|
||||||
NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
|
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"\']))([\s\->~]*(?=[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+]\s*[A-ZÀ-Ž&+])'
|
||||||
ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
|
r'[A-zÀ-ž-_0-9\s\"\'&+()\[\],:]+:(?![\"\'’ʼ❜‘‛”“‟„])(?:\s+|$))(?![0-9])'), "",
|
||||||
name="HI_before_colon_caps"),
|
name="HI_before_colon_caps"),
|
||||||
|
|
||||||
# any text before colon (at least 3 chars); at start or after a sentence,
|
# any text before colon (at least 3 chars); at start or after a sentence,
|
||||||
# possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
|
# possibly with a dash in front; try not breaking actual sentences with a colon at the end by not matching if
|
||||||
# a space is inside the text; ignore anything ending with a quote
|
# a space is inside the text; ignore anything ending with a quote
|
||||||
NReProcessor(re.compile(ur'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
|
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=[.\-!?\"]))([\s\->~]*((?=[A-zÀ-ž&+]\s*[A-zÀ-ž&+]\s*[A-zÀ-ž&+])'
|
||||||
ur'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
|
r'[A-zÀ-ž-_0-9\s\"\'&+()\[\]]+:)(?![\"’ʼ❜‘‛”“‟„])\s*)(?![0-9])'),
|
||||||
lambda match:
|
lambda match:
|
||||||
match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
|
match.group(1) if (match.group(2).count(" ") > 0 or match.group(1).count("-") > 0)
|
||||||
else "" if not match.group(1).startswith(" ") else " ",
|
else "" if not match.group(1).startswith(" ") else " ",
|
||||||
name="HI_before_colon_noncaps"),
|
name="HI_before_colon_noncaps"),
|
||||||
|
|
||||||
# brackets (only remove if at least 3 chars in brackets)
|
# brackets (only remove if at least 3 chars in brackets)
|
||||||
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
|
NReProcessor(re.compile(r'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
|
||||||
{"t": TAG}), "", name="HI_brackets"),
|
{"t": TAG}), "", name="HI_brackets"),
|
||||||
|
|
||||||
#NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
|
#NReProcessor(re.compile(r'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
|
||||||
# "", name="HI_bracket_open_start"),
|
# "", name="HI_bracket_open_start"),
|
||||||
|
|
||||||
#NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
|
#NReProcessor(re.compile(r'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
|
||||||
# name="HI_bracket_open_end"),
|
# name="HI_bracket_open_end"),
|
||||||
|
|
||||||
# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
|
# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
|
||||||
# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
|
# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
|
||||||
|
|
||||||
# starting text before colon (at least 3 chars)
|
# starting text before colon (at least 3 chars)
|
||||||
#NReProcessor(re.compile(ur'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
|
#NReProcessor(re.compile(r'(?u)(\b|^)([\s-]*(?=[A-zÀ-ž-_0-9"\']{3,})[A-zÀ-ž-_0-9"\']+:\s*)'), "",
|
||||||
# name="HI_before_colon"),
|
# name="HI_before_colon"),
|
||||||
|
|
||||||
|
|
||||||
# text in brackets at start, after optional dash, before colon or at end of line
|
# text in brackets at start, after optional dash, before colon or at end of line
|
||||||
# fixme: may be too aggressive
|
# fixme: may be too aggressive
|
||||||
#NReProcessor(re.compile(ur'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
|
#NReProcessor(re.compile(r'(?um)(^-?\s?[([][A-zÀ-ž-_\s]{3,}[)\]](?:(?=$)|:\s*))'), "",
|
||||||
# name="HI_brackets_special"),
|
# name="HI_brackets_special"),
|
||||||
|
|
||||||
# all caps line (at least 4 consecutive uppercase chars)
|
# all caps line (at least 4 consecutive uppercase chars)
|
||||||
NReProcessor(re.compile(ur'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
|
NReProcessor(re.compile(r'(?u)(^(?=.*[A-ZÀ-Ž&+]{4,})[A-ZÀ-Ž-_\s&+]+$)'), "", name="HI_all_caps",
|
||||||
supported=lambda p: not p.only_uppercase),
|
supported=lambda p: not p.only_uppercase),
|
||||||
|
|
||||||
# remove MAN:
|
# remove MAN:
|
||||||
NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
|
NReProcessor(re.compile(r'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
|
||||||
|
|
||||||
# dash in front
|
# dash in front
|
||||||
# NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
|
# NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
|
||||||
|
|
||||||
# all caps at start before new sentence
|
# all caps at start before new sentence
|
||||||
NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
|
NReProcessor(re.compile(r'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
|
||||||
name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
|
name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
|
||||||
]
|
]
|
||||||
|
|
||||||
post_processors = empty_line_post_processors
|
post_processors = empty_line_post_processors
|
||||||
last_processors = [
|
last_processors = [
|
||||||
# remove music symbols
|
# remove music symbols
|
||||||
NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
|
NReProcessor(re.compile(r'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
|
||||||
"", name="HI_music_symbols_only"),
|
"", name="HI_music_symbols_only"),
|
||||||
|
|
||||||
# remove music entries
|
# remove music entries
|
||||||
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
|
NReProcessor(re.compile(r'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
|
||||||
"", name="HI_music"),
|
"", name="HI_music"),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue