Louis Vézina 2019-09-13 15:14:31 -04:00
parent 645952c61a
commit c5fa0f56e4
109 changed files with 31929 additions and 0 deletions

File diff suppressed because it is too large


@@ -0,0 +1,171 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
try:
    from collections.abc import MutableMapping
except ImportError:
    from collections import MutableMapping
try:
from collections import UserDict
except ImportError:
from UserDict import UserDict
try:
from collections import OrderedDict
except ImportError:
from ordereddict import OrderedDict
from io import open
import sys
try:
from thread import get_ident
except ImportError:
try:
from _thread import get_ident
except ImportError:
from _dummy_thread import get_ident
PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3
str = type('str')
def from_none(exc):
"""raise from_none(ValueError('a')) == raise ValueError('a') from None"""
exc.__cause__ = None
exc.__suppress_context__ = True
return exc
# from reprlib 3.2.1
def recursive_repr(fillvalue='...'):
'Decorator to make a repr function return fillvalue for a recursive call'
def decorating_function(user_function):
repr_running = set()
def wrapper(self):
key = id(self), get_ident()
if key in repr_running:
return fillvalue
repr_running.add(key)
try:
result = user_function(self)
finally:
repr_running.discard(key)
return result
# Can't use functools.wraps() here because of bootstrap issues
wrapper.__module__ = getattr(user_function, '__module__')
wrapper.__doc__ = getattr(user_function, '__doc__')
wrapper.__name__ = getattr(user_function, '__name__')
wrapper.__annotations__ = getattr(user_function, '__annotations__', {})
return wrapper
return decorating_function
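# Illustrative aside (a sketch, not part of the vendored module): the
# repr_running guard above is what keeps a self-referential repr from
# recursing forever.
#
#     class Node(object):
#         def __init__(self):
#             self.ref = self  # cycle back to itself
#
#         @recursive_repr()
#         def __repr__(self):
#             return 'Node(ref=%r)' % (self.ref,)
#
#     repr(Node())  # -> 'Node(ref=...)' rather than a RecursionError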
# from collections 3.2.1
class _ChainMap(MutableMapping):
''' A ChainMap groups multiple dicts (or other mappings) together
to create a single, updateable view.
    The underlying mappings are stored in a list. That list is public and can
    be accessed or updated using the *maps* attribute. There is no other state.
Lookups search the underlying mappings successively until a key is found.
In contrast, writes, updates, and deletions only operate on the first
mapping.
'''
def __init__(self, *maps):
'''Initialize a ChainMap by setting *maps* to the given mappings.
If no mappings are provided, a single empty dictionary is used.
'''
self.maps = list(maps) or [{}] # always at least one map
def __missing__(self, key):
raise KeyError(key)
def __getitem__(self, key):
for mapping in self.maps:
try:
return mapping[key] # can't use 'key in mapping' with defaultdict
except KeyError:
pass
return self.__missing__(key) # support subclasses that define __missing__
def get(self, key, default=None):
return self[key] if key in self else default
def __len__(self):
return len(set().union(*self.maps)) # reuses stored hash values if possible
def __iter__(self):
return iter(set().union(*self.maps))
def __contains__(self, key):
return any(key in m for m in self.maps)
@recursive_repr()
def __repr__(self):
return '{0.__class__.__name__}({1})'.format(
self, ', '.join(map(repr, self.maps)))
@classmethod
def fromkeys(cls, iterable, *args):
'Create a ChainMap with a single dict created from the iterable.'
return cls(dict.fromkeys(iterable, *args))
def copy(self):
'New ChainMap or subclass with a new copy of maps[0] and refs to maps[1:]'
return self.__class__(self.maps[0].copy(), *self.maps[1:])
__copy__ = copy
def new_child(self): # like Django's Context.push()
'New ChainMap with a new dict followed by all previous maps.'
return self.__class__({}, *self.maps)
@property
def parents(self): # like Django's Context.pop()
'New ChainMap from maps[1:].'
return self.__class__(*self.maps[1:])
def __setitem__(self, key, value):
self.maps[0][key] = value
def __delitem__(self, key):
try:
del self.maps[0][key]
except KeyError:
raise KeyError('Key not found in the first mapping: {!r}'.format(key))
def popitem(self):
        'Remove and return an item pair from maps[0]. Raise KeyError if maps[0] is empty.'
try:
return self.maps[0].popitem()
except KeyError:
raise KeyError('No keys found in the first mapping.')
def pop(self, key, *args):
'Remove *key* from maps[0] and return its value. Raise KeyError if *key* not in maps[0].'
try:
return self.maps[0].pop(key, *args)
except KeyError:
raise KeyError('Key not found in the first mapping: {!r}'.format(key))
def clear(self):
'Clear maps[0], leaving maps[1:] intact.'
self.maps[0].clear()
try:
from collections import ChainMap
except ImportError:
ChainMap = _ChainMap
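A quick sketch of the semantics this backport preserves (the stdlib ChainMap behaves identically where available): lookups fall through the chain of maps, while writes and deletes touch only the first one.

    defaults = {'color': 'red', 'user': 'guest'}
    overrides = {}
    cm = ChainMap(overrides, defaults)   # the ChainMap bound just above

    print(cm['color'])          # 'red': lookup falls through the chain
    cm['color'] = 'blue'        # writes land only in overrides (maps[0])
    print(defaults['color'])    # still 'red'
    print(cm.parents['color'])  # 'red': a view that skips maps[0]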

libs/bs4/formatter.py (new file, 99 lines)

@@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string.
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
HTML_FORMATTERS = {}
HTML = 'html'
XML = 'xml'
HTML_DEFAULTS = dict(
cdata_containing_tags=set(["script", "style"]),
)
def _default(self, language, value, kwarg):
if value is not None:
return value
if language == self.XML:
return set()
return self.HTML_DEFAULTS[kwarg]
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
):
"""
:param void_element_close_prefix: By default, represent void
elements as <tag/> rather than <tag>
"""
self.language = language
self.entity_substitution = entity_substitution
self.void_element_close_prefix = void_element_close_prefix
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution."""
if not self.entity_substitution:
return ns
from .element import NavigableString
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in self.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute."""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
return sorted(tag.attrs.items())
class HTMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = Formatter(
    Formatter.XML, entity_substitution=None
)
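As a rough usage sketch (assuming a Beautiful Soup build that ships this formatter module), the registry keys above are exactly the strings accepted by the formatter argument of the output methods:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<br><p>caf&eacute;</p>", "html.parser")
    # "html5" re-escapes HTML entities but omits the "/" in void
    # elements; "minimal" (the default) escapes only &, <, and >.
    print(soup.decode(formatter="html5"))
    print(soup.decode(formatter="minimal"))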

libs/bs42.7/AUTHORS.txt (new file, 43 lines)

@@ -0,0 +1,43 @@
Behold, mortal, the origins of Beautiful Soup...
================================================
Leonard Richardson is the primary programmer.
Aaron DeVore is awesome.
Mark Pilgrim provided the encoding detection code that forms the base
of UnicodeDammit.
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
Soup 4 working under Python 3.
Simon Willison wrote soupselect, which was used to make Beautiful Soup
support CSS selectors.
Sam Ruby helped with a lot of edge cases.
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
work in solving the nestable tags conundrum.
An incomplete list of people who have contributed patches to Beautiful
Soup:
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
Samastur, Jouni Seppänen, Alexander Schmolck, Andy Theyers, Glyn
Webster, Paul Wright, Danny Yoo
An incomplete list of people who made suggestions or found bugs or
found ways to break Beautiful Soup:
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
Sousa Rocha, Yichun Wei, Per Vognsen

libs/bs42.7/COPYING.txt (new file, 27 lines)

@@ -0,0 +1,27 @@
Beautiful Soup is made available under the MIT license:
Copyright (c) 2004-2015 Leonard Richardson
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Beautiful Soup incorporates code from the html5lib library, which is
also made available under the MIT license. Copyright (c) 2006-2013
James Graham and other contributors

libs/bs42.7/NEWS.txt (new file, 1190 lines)

File diff suppressed because it is too large

libs/bs42.7/README.txt (new file, 63 lines)

@@ -0,0 +1,63 @@
= Introduction =
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
>>> print soup.prettify()
<html>
<body>
<p>
Some
<b>
bad
<i>
HTML
</i>
</b>
</p>
</body>
</html>
>>> soup.find(text="bad")
u'bad'
>>> soup.i
<i>HTML</i>
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
>>> print soup.prettify()
<?xml version="1.0" encoding="utf-8"?>
<tag1>
Some
<tag2 />
bad
<tag3>
XML
</tag3>
</tag1>
= Full documentation =
The bs4/doc/ directory contains full documentation in Sphinx
format. Run "make html" in that directory to create HTML
documentation.
= Running the unit tests =
Beautiful Soup supports unit test discovery from the project root directory:
$ nosetests
$ python -m unittest discover -s bs4 # Python 2.7 and up
If you checked out the source tree, you should see a script in the
home directory called test-all-versions. This script will run the unit
tests under Python 2.7, then create a temporary Python 3 conversion of
the source and run the unit tests again under Python 3.
= Links =
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
http://readthedocs.org/docs/beautiful-soup-4/
Discussion group: http://groups.google.com/group/beautifulsoup/
Development: https://code.launchpad.net/beautifulsoup/
Bug tracker: https://bugs.launchpad.net/beautifulsoup/

libs/bs42.7/TODO.txt (new file, 31 lines)

@@ -0,0 +1,31 @@
Additions
---------
More of the jQuery API: nextUntil?
Optimizations
-------------
The html5lib tree builder doesn't use the standard tree-building API,
which worries me and has resulted in a number of bugs.
markup_attr_map can be optimized since it's always a map now.
Upon encountering UTF-16LE data or some other uncommon serialization
of Unicode, UnicodeDammit will convert the data to Unicode, then
encode it as UTF-8. This is wasteful because it will just get decoded
back to Unicode.
CDATA
-----
The elementtree XMLParser has a strip_cdata argument that, when set to
False, should allow Beautiful Soup to preserve CDATA sections instead
of treating them as text. Except it doesn't. (This argument is also
present for HTMLParser, and also does nothing there.)
Currently, html5lib converts CDATA sections into comments. An
as-yet-unreleased version of html5lib changes the parser's handling of
CDATA sections to allow CDATA sections in tags like <svg> and
<math>. The HTML5TreeBuilder will need to be updated to create CData
objects instead of Comment objects in this situation.

libs/bs42.7/__init__.py (new file, 529 lines)

@@ -0,0 +1,529 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate,
search, and modify the parse tree.
Beautiful Soup works with Python 2.7 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import os
import re
import traceback
import warnings
from .builder import builder_registry, ParserRejectedMarkup
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP)\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. Suggest you use "
"features='lxml' for HTML and features='lxml-xml' for "
"XML.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if from_encoding and isinstance(markup, unicode):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
original_features = features
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise FeatureNotFound(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
if not (original_features == builder.NAME or
original_features in builder.ALTERNATE_NAMES):
if builder.is_xml:
markup_type = "XML"
else:
markup_type = "HTML"
caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
self.builder = builder
self.is_xml = builder.is_xml
self.known_xml = self.is_xml
self.builder.soup = self
self.parse_only = parse_only
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup)
or (isinstance(markup, unicode) and not u'<' in markup)
):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
if (isinstance(markup, unicode)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
possible_filename = markup
is_file = False
try:
is_file = os.path.exists(possible_filename)
except Exception, e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
if isinstance(markup, unicode):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
' probably open this file and pass the filehandle into'
' Beautiful Soup.' % markup)
self._check_markup_is_url(markup)
for (self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) in (
self.builder.prepare_markup(
markup, from_encoding, exclude_encodings=exclude_encodings)):
self.reset()
try:
self._feed()
break
except ParserRejectedMarkup:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def __copy__(self):
copy = type(self)(
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
)
# Although we encoded the tree to UTF-8, that may not have
# been the encoding of the original markup. Set the copy's
# .original_encoding to reflect the original object's
# .original_encoding.
copy.original_encoding = self.original_encoding
return copy
def __getstate__(self):
# Frequently a tree builder can't be pickled.
d = dict(self.__dict__)
if 'builder' in d and not self.builder.picklable:
d['builder'] = None
return d
@staticmethod
def _check_markup_is_url(markup):
"""
Check if markup looks like it's actually a url and raise a warning
if so. Markup can be unicode or str (py2) / bytes (py3).
"""
if isinstance(markup, bytes):
space = b' '
cant_start_with = (b"http:", b"https:")
elif isinstance(markup, unicode):
space = u' '
cant_start_with = (u"http:", u"https:")
else:
return
if any(markup.startswith(prefix) for prefix in cant_start_with):
if not space in markup:
if isinstance(markup, bytes):
decoded_markup = markup.decode('utf-8', 'replace')
else:
decoded_markup = markup
warnings.warn(
'"%s" looks like a URL. Beautiful Soup is not an'
' HTTP client. You should probably use an HTTP client like'
' requests to get the document behind the URL, and feed'
' that document to Beautiful Soup.' % decoded_markup
)
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.current_data = []
self.currentTag = None
self.tagStack = []
self.preserve_whitespace_tag_stack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
return subclass(s)
def insert_before(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
self.preserve_whitespace_tag_stack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
if tag.name in self.builder.preserve_whitespace_tags:
self.preserve_whitespace_tag_stack.append(tag)
def endData(self, containerClass=NavigableString):
if self.current_data:
current_data = u''.join(self.current_data)
# If whitespace is not preserved, and this string contains
# nothing but ASCII spaces, replace it with a single space
# or newline.
if not self.preserve_whitespace_tag_stack:
strippable = True
for i in current_data:
if i not in self.ASCII_SPACES:
strippable = False
break
if strippable:
if '\n' in current_data:
current_data = '\n'
else:
current_data = ' '
# Reset the data collector.
self.current_data = []
# Should we add this string to the tree at all?
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(current_data)):
return
o = containerClass(current_data)
self.object_was_parsed(o)
def object_was_parsed(self, o, parent=None, most_recent_element=None):
"""Add an object to the parse tree."""
parent = parent or self.currentTag
previous_element = most_recent_element or self._most_recent_element
next_element = previous_sibling = next_sibling = None
if isinstance(o, Tag):
next_element = o.next_element
next_sibling = o.next_sibling
previous_sibling = o.previous_sibling
if not previous_element:
previous_element = o.previous_element
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
self._most_recent_element = o
parent.contents.append(o)
if parent.next_sibling:
# This node is being inserted into an element that has
# already been parsed. Deal with any dangling references.
index = len(parent.contents)-1
while index >= 0:
if parent.contents[index] is o:
break
index -= 1
else:
raise ValueError(
"Error building tree: supposedly %r was inserted "
"into %r after the fact, but I don't see it!" % (
o, parent
)
)
if index == 0:
previous_element = parent
previous_sibling = None
else:
previous_element = previous_sibling = parent.contents[index-1]
if index == len(parent.contents)-1:
next_element = parent.next_sibling
next_sibling = None
else:
next_element = next_sibling = parent.contents[index+1]
o.previous_element = previous_element
if previous_element:
previous_element.next_element = o
o.next_element = next_element
if next_element:
next_element.previous_element = o
o.next_sibling = next_sibling
if next_sibling:
next_sibling.previous_sibling = o
o.previous_sibling = previous_sibling
if previous_sibling:
previous_sibling.next_sibling = o
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
        stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
# The BeautifulSoup object itself can never be popped.
return
most_recently_popped = None
stack_size = len(self.tagStack)
for i in range(stack_size - 1, 0, -1):
t = self.tagStack[i]
if (name == t.name and nsprefix == t.prefix):
if inclusivePop:
most_recently_popped = self.popTag()
break
most_recently_popped = self.popTag()
return most_recently_popped
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self._most_recent_element)
if tag is None:
return tag
if self._most_recent_element:
self._most_recent_element.next_element = tag
self._most_recent_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.current_data.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
            if eventual_encoding is not None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
# Alias to make it easier to type import: 'from bs4 import _soup'
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
class FeatureNotFound(ValueError):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()
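A short, hedged sketch of the surface this module exposes: naming a parser via features pins the tree builder (and silences NO_PARSER_SPECIFIED_WARNING), and new_tag() creates tags bound to the soup for later insertion. The URL below is purely illustrative.

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Hello</p>", features="html.parser")
    link = soup.new_tag("a", href="http://example.com")
    link.string = "a link"
    soup.p.append(link)
    print(soup)  # <p>Hello<a href="http://example.com">a link</a></p>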


@@ -0,0 +1,333 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
HTMLAwareEntitySubstitution,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
NAME = "[Unknown tree builder]"
ALTERNATE_NAMES = []
features = []
is_xml = False
picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if not attrs:
return attrs
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), None)
for attr in attrs.keys():
if attr in universal or (tag_specific and attr in tag_specific):
# We have a "class"-type attribute whose string
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
# tree. On the second call the attribute value
# here is already a list. If this happens,
# leave the value alone rather than trying to
# split it again.
values = value
attrs[attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
# These are from HTML4, removed in HTML5.
'spacer', 'frame'
])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
class ParserRejectedMarkup(Exception):
pass
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
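A small sketch of how this registry is queried; it mirrors what BeautifulSoup.__init__ does with its features list:

    from bs4.builder import builder_registry

    # Returns the most recently registered builder advertising every
    # requested feature, or None if nothing installed matches.
    builder_class = builder_registry.lookup('html', 'fast')
    if builder_class is None:
        raise RuntimeError('no suitable tree builder installed')
    print(builder_class.NAME)  # e.g. 'lxml' when lxml is installed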


@@ -0,0 +1,426 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [
'HTML5TreeBuilder',
]
import warnings
import re
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import (
NamespacedAttribute,
whitespace_re,
)
import html5lib
from html5lib.constants import (
namespaces,
prefixes,
)
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
except ImportError, e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
NAME = "html5lib"
features = [NAME, PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding,
document_declared_encoding=None, exclude_encodings=None):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
# document_declared_encoding and exclude_encodings aren't used
# ATM because the html5lib TreeBuilder doesn't use
# UnicodeDammit.
if exclude_encodings:
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
yield (markup, None, None, False)
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
if not isinstance(markup, unicode):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
extra_kwargs['encoding'] = self.user_specified_encoding
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
if not isinstance(original_encoding, basestring):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
original_encoding = original_encoding.name
doc.original_encoding = original_encoding
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
namespaceHTMLElements, self.soup)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def __init__(self, namespaceHTMLElements, soup=None):
if soup:
self.soup = soup
else:
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
from bs4 import BeautifulSoup
self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return treebuilder_base.TreeBuilder.getFragment(self).element
def testSerializer(self, element):
from bs4 import BeautifulSoup
rv = []
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
def serializeElement(element, indent=0):
if isinstance(element, BeautifulSoup):
pass
if isinstance(element, Doctype):
m = doctype_re.match(element)
if m:
name = m.group(1)
if m.lastindex > 1:
publicId = m.group(2) or ""
systemId = m.group(3) or m.group(4) or ""
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
(' ' * indent, name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
else:
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
elif isinstance(element, Comment):
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
elif isinstance(element, NavigableString):
rv.append("|%s\"%s\"" % (' ' * indent, element))
else:
if element.namespace:
name = "%s %s" % (prefixes[element.namespace],
element.name)
else:
name = element.name
rv.append("|%s<%s>" % (' ' * indent, name))
if element.attrs:
attributes = []
for name, value in element.attrs.items():
if isinstance(name, NamespacedAttribute):
name = "%s %s" % (prefixes[name.namespace], name.name)
if isinstance(value, list):
value = " ".join(value)
attributes.append((name, value))
for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
indent += 2
for child in element.children:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
# If this attribute is a multi-valued attribute for this element,
# turn its value into a list.
list_attr = HTML5TreeBuilder.cdata_list_attributes
if (name in list_attr['*']
or (self.element.name in list_attr
and name in list_attr[self.element.name])):
# A node that is being cloned may have already undergone
# this procedure.
if not isinstance(value, list):
value = whitespace_re.split(value)
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(treebuilder_base.Node):
def __init__(self, element, soup, namespace):
treebuilder_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
string_child = child = None
if isinstance(node, basestring):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
string_child = child = node
elif isinstance(node, Tag):
# Some other piece of code decided to pass in a Tag
# instead of creating an Element object to contain the
# Tag.
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
node.parent = self
else:
child = node.element
node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# We are appending a string onto another string.
# TODO This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + string_child)
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
if isinstance(node, basestring):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)
# Tell Beautiful Soup to act as if it parsed this element
# immediately after the parent's last descendant. (Or
# immediately after the parent, if it has no children.)
if self.element.contents:
most_recent_element = self.element._last_descendant(False)
elif self.element.next_element is not None:
# Something from further ahead in the parse tree is
# being inserted into this earlier element. This is
# very annoying because it means an expensive search
# for the last element in the tree.
most_recent_element = self.soup._last_descendant()
else:
most_recent_element = self.element
self.soup.object_was_parsed(
child, parent=self.element,
most_recent_element=most_recent_element)
def getAttributes(self):
if isinstance(self.element, Comment):
return {}
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, new_parent):
"""Move all of this tag's children into another tag."""
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
# are removed.
final_next_element = element.next_sibling
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
if len(new_parent_element.contents) > 0:
# The new parent already contains children. We will be
# appending this tag's children to the end.
new_parents_last_child = new_parent_element.contents[-1]
new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
else:
# The new parent contains no children.
new_parents_last_child = None
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
first_child = to_append[0]
if new_parents_last_descendant:
first_child.previous_element = new_parents_last_descendant
else:
first_child.previous_element = new_parent_element
first_child.previous_sibling = new_parents_last_child
if new_parents_last_descendant:
new_parents_last_descendant.next_element = first_child
else:
new_parent_element.next_element = first_child
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
# Find the very last element being moved. It is now the
# parent's last descendant. It has no .next_sibling and
# its .next_element is whatever the previous last
# descendant had.
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
# TODO: This code has no test coverage and I'm not sure
# how to get html5lib to go through this path, but it's
# just the other side of the previous line.
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
new_parent_element.contents.append(child)
# Now that this element has no children, change its .next_element.
element.contents = []
element.next_element = final_next_element
# print "DONE WITH MOVE"
# print "FROM", self.element
# print "TO", new_parent_element
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
        if self.namespace is None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
treebuilder_base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError
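Usage note, assuming html5lib is installed: any of the features this builder advertises ("html5lib", "html5", "permissive", "html") routes markup through it, which is useful for badly broken documents:

    from bs4 import BeautifulSoup

    # html5lib repairs the markup the way a browser would.
    soup = BeautifulSoup("<p>unclosed <b>markup", "html5lib")
    print(soup.prettify())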


@@ -0,0 +1,314 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import HTMLParser
try:
from HTMLParser import HTMLParseError
except ImportError, e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
pass
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self, *args, **kwargs)
# Keep a list of empty-element tags that were encountered
# without an explicit closing tag. If we encounter a closing tag
# of this type, we'll associate it with one of those entries.
#
# This isn't a stack because we don't care about the
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []
def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
# is_startend() tells handle_starttag not to close the tag
# just because its name matches a known empty-element tag. We
# know that this is an empty-element tag and we want to call
# handle_endtag ourselves.
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
self.handle_endtag(name)
def handle_starttag(self, name, attrs, handle_empty_element=True):
# XXX namespace
attr_dict = {}
for key, value in attrs:
# Change None attribute values to the empty string
# for consistency with the other tree builders.
if value is None:
value = ''
attr_dict[key] = value
attrvalue = '""'
#print "START", name
tag = self.soup.handle_starttag(name, None, None, attr_dict)
if tag and tag.is_empty_element and handle_empty_element:
# Unlike other parsers, html.parser doesn't send separate end tag
# events for empty-element tags. (It's handled in
# handle_startendtag, but only if the original markup looked like
# <tag/>.)
#
# So we need to call handle_endtag() ourselves. Since we
# know the start event is identical to the end event, we
# don't want handle_endtag() to cross off any previous end
# events for tags of this name.
self.handle_endtag(name, check_already_closed=False)
# But we might encounter an explicit closing tag for this tag
# later on. If so, we want to ignore it.
self.already_closed_empty_element.append(name)
def handle_endtag(self, name, check_already_closed=True):
#print "END", name
if check_already_closed and name in self.already_closed_empty_element:
# This is a redundant end tag for an empty-element tag.
# We've already called handle_endtag() for it, so just
# check it off the list.
# print "ALREADY CLOSED", name
self.already_closed_empty_element.remove(name)
else:
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed in all supported versions.
# http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
elif name.startswith('X'):
real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
elif data == 'DOCTYPE':
# i.e. "<!DOCTYPE>"
data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
picklable = True
NAME = HTMLPARSER
features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
yield (markup, None, None, False)
return
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
exclude_encodings=exclude_encodings)
yield (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

@@ -0,0 +1,258 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from io import BytesIO
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import (
Comment,
Doctype,
NamespacedAttribute,
ProcessingInstruction,
XMLProcessingInstruction,
)
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
ParserRejectedMarkup,
TreeBuilder,
XML)
from bs4.dammit import EncodingDetector
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
processing_instruction_class = XMLProcessingInstruction
NAME = "lxml-xml"
ALTERNATE_NAMES = ["xml"]
# Well, it's permissive by XML parser standards.
features = [NAME, LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
# This namespace mapping is specified in the XML Namespace
# standard.
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
def default_parser(self, encoding):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
if self._default_parser is not None:
return self._default_parser
return etree.XMLParser(
target=self, strip_cdata=False, recover=True, encoding=encoding)
def parser_for(self, encoding):
# Use the default parser.
parser = self.default_parser(encoding)
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
def __init__(self, parser=None, empty_element_tags=None):
# TODO: Issue a warning if parser is present but not a
# callable, since that means there's no way to create new
# parsers for different encodings.
self._default_parser = parser
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
self.soup = None
self.nsmaps = [self.DEFAULT_NSMAPS]
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
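    # e.g. _getNsTag('{http://www.w3.org/1999/xhtml}body') returns
    # ('http://www.w3.org/1999/xhtml', 'body'), while an unqualified
    # _getNsTag('body') returns (None, 'body').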
def prepare_markup(self, markup, user_specified_encoding=None,
exclude_encodings=None,
document_declared_encoding=None):
"""
:yield: A series of 4-tuples.
(markup, encoding, declared encoding,
has undergone character replacement)
Each 4-tuple represents a strategy for parsing the document.
"""
# Instead of using UnicodeDammit to convert the bytestring to
# Unicode using different encodings, use EncodingDetector to
# iterate over the encodings, and tell lxml to try to parse
# the document as each one in turn.
is_html = not self.is_xml
if is_html:
self.processing_instruction_class = ProcessingInstruction
else:
self.processing_instruction_class = XMLProcessingInstruction
if isinstance(markup, unicode):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
if isinstance(markup, unicode):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
document_declared_encoding, False)
try_encodings = [user_specified_encoding, document_declared_encoding]
detector = EncodingDetector(
markup, try_encodings, is_html, exclude_encodings)
for encoding in detector.encodings:
yield (detector.markup, encoding, document_declared_encoding, False)
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
elif isinstance(markup, unicode):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
try:
self.parser = self.parser_for(self.soup.original_encoding)
self.parser.feed(data)
while len(data) != 0:
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e))
def close(self):
self.nsmaps = [self.DEFAULT_NSMAPS]
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
# Namespaces are in play. Find any attributes that came in
# from lxml with namespaces attached to their names, and
# turn then into NamespacedAttribute objects.
new_attrs = {}
for attr, value in attrs.items():
namespace, attr = self._getNsTag(attr)
if namespace is None:
new_attrs[attr] = value
else:
nsprefix = self._prefix_for_namespace(namespace)
attr = NamespacedAttribute(nsprefix, attr, namespace)
new_attrs[attr] = value
attrs = new_attrs
namespace, name = self._getNsTag(name)
nsprefix = self._prefix_for_namespace(namespace)
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def _prefix_for_namespace(self, namespace):
"""Find the currently active prefix for the given namespace."""
if namespace is None:
return None
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
return inverted_nsmap[namespace]
return None
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if len(self.nsmaps) > 1:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
def pi(self, target, data):
self.soup.endData()
self.soup.handle_data(target + ' ' + data)
self.soup.endData(self.processing_instruction_class)
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
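# A short usage sketch (assuming lxml is installed): the "lxml-xml" (or
# plain "xml") feature name resolves to this builder, and namespaced
# tags keep their prefix and namespace URI.
#
#     from bs4 import BeautifulSoup
#     soup = BeautifulSoup(
#         '<root xmlns:a="http://example.com/"><a:b>1</a:b></root>',
#         "lxml-xml")
#     print soup.b.prefix      # a
#     print soup.b.namespace   # http://example.com/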
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
NAME = LXML
ALTERNATE_NAMES = ["lxml-html"]
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
is_xml = False
processing_instruction_class = ProcessingInstruction
def default_parser(self, encoding):
return etree.HTMLParser
def feed(self, markup):
encoding = self.soup.original_encoding
try:
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
raise ParserRejectedMarkup(str(e))
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment

libs/bs42.7/dammit.py
@@ -0,0 +1,842 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT"
import codecs
from htmlentitydefs import codepoint2name
import re
import logging
import string
# Import a library to autodetect character encodings.
chardet_type = None
try:
# First try the fast C implementation.
# PyPI package: cchardet
import cchardet
def chardet_dammit(s):
return cchardet.detect(s)['encoding']
except ImportError:
try:
# Fall back to the pure Python implementation
# Debian package: python-chardet
# PyPI package: chardet
import chardet
def chardet_dammit(s):
return chardet.detect(s)['encoding']
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
# No chardet available.
def chardet_dammit(s):
return None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
    def quoted_attribute_value(cls, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
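    # Worked examples of the three cases described above:
    #   Bob's Bar      ->  "Bob's Bar"
    #   say "hi"       ->  'say "hi"'
    #   both ' and "   ->  "both ' and &quot;"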
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign
will become &lt;, the greater-than sign will become &gt;,
and any ampersands will become &amp;. If you want ampersands
that appear to be part of an entity definition to be left
alone, use substitute_xml_containing_entities() instead.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets and ampersands.
value = cls.AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
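    # e.g. substitute_xml('AT&T <Labs>') returns 'AT&amp;T &lt;Labs&gt;';
    # with make_quoted_attribute=True the result is additionally wrapped
    # in quotes by quoted_attribute_value().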
@classmethod
def substitute_xml_containing_entities(
cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
class EncodingDetector:
"""Suggests a number of possible encodings for a bytestring.
Order of precedence:
1. Encodings you specifically tell EncodingDetector to try first
(the override_encodings argument to the constructor).
2. An encoding declared within the bytestring itself, either in an
XML declaration (if the bytestring is to be interpreted as an XML
document), or in a <meta> tag (if the bytestring is to be
interpreted as an HTML document.)
3. An encoding detected through textual analysis by chardet,
cchardet, or a similar external library.
4. UTF-8.
5. Windows-1252.
"""
def __init__(self, markup, override_encodings=None, is_html=False,
exclude_encodings=None):
self.override_encodings = override_encodings or []
exclude_encodings = exclude_encodings or []
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
self.chardet_encoding = None
self.is_html = is_html
self.declared_encoding = None
# First order of business: strip a byte-order mark.
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
def _usable(self, encoding, tried):
if encoding is not None:
encoding = encoding.lower()
if encoding in self.exclude_encodings:
return False
if encoding not in tried:
tried.add(encoding)
return True
return False
@property
def encodings(self):
"""Yield a number of encodings that might work for this markup."""
tried = set()
for e in self.override_encodings:
if self._usable(e, tried):
yield e
# Did the document originally start with a byte-order mark
# that indicated its encoding?
if self._usable(self.sniffed_encoding, tried):
yield self.sniffed_encoding
# Look within the document for an XML or HTML encoding
# declaration.
if self.declared_encoding is None:
self.declared_encoding = self.find_declared_encoding(
self.markup, self.is_html)
if self._usable(self.declared_encoding, tried):
yield self.declared_encoding
# Use third-party character set detection to guess at the
# encoding.
if self.chardet_encoding is None:
self.chardet_encoding = chardet_dammit(self.markup)
if self._usable(self.chardet_encoding, tried):
yield self.chardet_encoding
# As a last-ditch effort, try utf-8 and windows-1252.
for e in ('utf-8', 'windows-1252'):
if self._usable(e, tried):
yield e
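    # A minimal sketch of the precedence in practice (the chardet step
    # depends on which detector library, if any, is installed):
    #
    #   detector = EncodingDetector(
    #       b'<html><meta charset="iso-8859-1"></html>', is_html=True)
    #   list(detector.encodings)
    #   # ['iso-8859-1', 'utf-8', 'windows-1252'], possibly with a
    #   # chardet guess before the two fallbacks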
@classmethod
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
if isinstance(data, unicode):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == b'\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == b'\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == b'\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
return data, encoding
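    # e.g. strip_byte_order_mark(b'\xef\xbb\xbf<html/>') returns
    # (b'<html/>', 'utf-8'); data without a recognized BOM comes back
    # unchanged, with None for the encoding.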
@classmethod
def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
"""Given a document, tries to find its declared encoding.
An XML encoding is declared at the beginning of the document.
An HTML encoding is declared in a <meta> tag, hopefully near the
beginning of the document.
"""
if search_entire_document:
xml_endpos = html_endpos = len(markup)
else:
xml_endpos = 1024
html_endpos = max(2048, int(len(markup) * 0.05))
declared_encoding = None
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
if not declared_encoding_match and is_html:
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
if declared_encoding_match is not None:
declared_encoding = declared_encoding_match.groups()[0].decode(
'ascii', 'replace')
if declared_encoding:
return declared_encoding.lower()
return None
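# e.g. EncodingDetector.find_declared_encoding(
#     b'<?xml version="1.0" encoding="ISO-8859-1"?><doc/>') returns
# 'iso-8859-1', and for HTML input
# EncodingDetector.find_declared_encoding(
#     b'<meta charset="utf-8">', is_html=True) returns 'utf-8'.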
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
self.is_html = is_html
self.log = logging.getLogger(__name__)
self.detector = EncodingDetector(
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
if isinstance(markup, unicode) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
# The encoding detector may have stripped a byte-order mark.
# Use the stripped markup from this point on.
self.markup = self.detector.markup
u = None
for encoding in self.detector.encodings:
markup = self.detector.markup
u = self._convert_from(encoding)
if u is not None:
break
if not u:
# None of the encodings worked. As an absolute last resort,
# try them again with character replacement.
for encoding in self.detector.encodings:
if encoding != "ascii":
u = self._convert_from(encoding, "replace")
if u is not None:
self.log.warning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER."
)
self.contains_replacement_characters = True
break
# If none of that worked, we could at this point force it to
# ASCII, but that would destroy so much data that I think
# giving up is better.
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
return unicode(data, encoding, errors)
@property
def declared_html_encoding(self):
if not self.is_html:
return None
return self.detector.declared_encoding
def find_codec(self, charset):
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
or (charset and self._codec(charset.replace("-", "")))
or (charset and self._codec(charset.replace("-", "_")))
or (charset and charset.lower())
or charset
)
if value:
return value.lower()
return None
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
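    # A minimal usage sketch (the exact original_encoding can vary if a
    # chardet library is installed and guesses first):
    #
    #   dammit = UnicodeDammit(b"Sacr\xc3\xa9 bleu!")
    #   dammit.unicode_markup     # u'Sacr\xe9 bleu!'
    #   dammit.original_encoding  # 'utf-8'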
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
                b'\x9f': ('Yuml', '178'),}
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
        b'\xa9' : '(c)',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
        b'\xb4' : "'",
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
        0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
        0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
        0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'iso-8859-1'):
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
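# e.g. a document that is mostly UTF-8 but has Windows-1252 smart quotes
# pasted into it:
#
#   doc = (u'\N{SNOWMAN}'.encode('utf8')
#          + u'\N{LEFT DOUBLE QUOTATION MARK}hi'.encode('windows-1252'))
#   UnicodeDammit.detwingle(doc).decode('utf8')   # u'\u2603\u201chi'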

libs/bs42.7/diagnose.py
@@ -0,0 +1,219 @@
"""Diagnostic functions, mainly for use when doing tech support."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT"
import cProfile
from StringIO import StringIO
from HTMLParser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
import os
import pstats
import random
import tempfile
import time
import traceback
import sys
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
print "Diagnostic running on Beautiful Soup %s" % __version__
print "Python version %s" % sys.version
basic_parsers = ["html.parser", "html5lib", "lxml"]
    for name in list(basic_parsers):  # iterate over a copy; we may remove items
for builder in builder_registry.builders:
if name in builder.features:
break
else:
basic_parsers.remove(name)
print (
"I noticed that %s is not installed. Installing it may help." %
name)
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
try:
from lxml import etree
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
except ImportError, e:
print (
"lxml is not installed or couldn't be imported.")
if 'html5lib' in basic_parsers:
try:
import html5lib
print "Found html5lib version %s" % html5lib.__version__
except ImportError, e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print '"%s" looks like a filename. Reading data from the file.' % data
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
return
print
for parser in basic_parsers:
print "Trying to parse your markup with %s" % parser
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "Here's what %s did with the markup:" % parser
print soup.prettify()
print "-" * 80
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
This lets you see how lxml parses a document when no Beautiful
Soup code is running.
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
print("%s, %4s, %s" % (event, element.tag, element.text))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
def _p(self, s):
print(s)
def handle_starttag(self, name, attrs):
self._p("%s START" % name)
def handle_endtag(self, name):
self._p("%s END" % name)
def handle_data(self, data):
self._p("%s DATA" % data)
def handle_charref(self, name):
self._p("%s CHARREF" % name)
def handle_entityref(self, name):
self._p("%s ENTITYREF" % name)
def handle_comment(self, data):
self._p("%s COMMENT" % data)
def handle_decl(self, data):
self._p("%s DECL" % data)
def unknown_decl(self, data):
self._p("%s UNKNOWN-DECL" % data)
def handle_pi(self, data):
self._p("%s PI" % data)
def htmlparser_trace(data):
"""Print out the HTMLParser events that occur during parsing.
This lets you see how HTMLParser parses a document when no
Beautiful Soup code is running.
"""
parser = AnnouncingParser()
parser.feed(data)
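# e.g. htmlparser_trace("<p>foo</p>") prints:
#   p START
#   foo DATA
#   p END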
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
def rword(length=5):
"Generate a random word-like string."
s = ''
for i in range(length):
if i % 2 == 0:
t = _consonants
else:
t = _vowels
s += random.choice(t)
return s
def rsentence(length=4):
"Generate a random sentence-like string."
return " ".join(rword(random.randint(4,9)) for i in range(length))
def rdoc(num_elements=1000):
"""Randomly generate an invalid HTML document."""
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
elements = []
for i in range(num_elements):
choice = random.randint(0,3)
if choice == 0:
# New tag.
tag_name = random.choice(tag_names)
elements.append("<%s>" % tag_name)
elif choice == 1:
elements.append(rsentence(random.randint(1,4)))
elif choice == 2:
# Close a tag.
tag_name = random.choice(tag_names)
elements.append("</%s>" % tag_name)
return "<html>" + "\n".join(elements) + "</html>"
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
data = rdoc(num_elements)
print "Generated a large invalid HTML document (%d bytes)." % len(data)
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
try:
a = time.time()
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
except Exception, e:
print "%s could not parse the markup." % parser
traceback.print_exc()
if success:
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
print "Raw lxml parsed the markup in %.2fs." % (b-a)
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
def profile(num_elements=100000, parser="lxml"):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
data = rdoc(num_elements)
vars = dict(bs4=bs4, data=data, parser=parser)
cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
stats = pstats.Stats(filename)
# stats.strip_dirs()
stats.sort_stats("cumulative")
stats.print_stats('_html5lib|bs4', 50)
if __name__ == '__main__':
diagnose(sys.stdin.read())

libs/bs42.7/element.py
File diff suppressed because it is too large

libs/bs42.7/formatter.py
@@ -0,0 +1,99 @@
from bs4.dammit import EntitySubstitution
class Formatter(EntitySubstitution):
"""Describes a strategy to use when outputting a parse tree to a string.
Some parts of this strategy come from the distinction between
HTML4, HTML5, and XML. Others are configurable by the user.
"""
# Registries of XML and HTML formatters.
XML_FORMATTERS = {}
HTML_FORMATTERS = {}
HTML = 'html'
XML = 'xml'
HTML_DEFAULTS = dict(
cdata_containing_tags=set(["script", "style"]),
)
def _default(self, language, value, kwarg):
if value is not None:
return value
if language == self.XML:
return set()
return self.HTML_DEFAULTS[kwarg]
def __init__(
self, language=None, entity_substitution=None,
void_element_close_prefix='/', cdata_containing_tags=None,
):
"""
:param void_element_close_prefix: By default, represent void
elements as <tag/> rather than <tag>
"""
self.language = language
self.entity_substitution = entity_substitution
self.void_element_close_prefix = void_element_close_prefix
self.cdata_containing_tags = self._default(
language, cdata_containing_tags, 'cdata_containing_tags'
)
def substitute(self, ns):
"""Process a string that needs to undergo entity substitution."""
if not self.entity_substitution:
return ns
from .element import NavigableString
if (isinstance(ns, NavigableString)
and ns.parent is not None
and ns.parent.name in self.cdata_containing_tags):
# Do nothing.
return ns
# Substitute.
return self.entity_substitution(ns)
def attribute_value(self, value):
"""Process the value of an attribute."""
return self.substitute(value)
def attributes(self, tag):
"""Reorder a tag's attributes however you want."""
return sorted(tag.attrs.items())
class HTMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
class XMLFormatter(Formatter):
REGISTRY = {}
def __init__(self, *args, **kwargs):
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
# Set up aliases for the default formatters.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_html,
void_element_close_prefix = None
)
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
entity_substitution=None
)
XMLFormatter.REGISTRY["html"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_html
)
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
entity_substitution=EntitySubstitution.substitute_xml
)
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)
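# A short usage sketch: these registered formatters are what the string
# names passed to decode()/prettify() resolve to (assuming a `soup`
# built by the enclosing bs4 package):
#
#   soup.decode(formatter="html5")     # void elements render as <br>
#   soup.decode(formatter="minimal")   # only XML-level escaping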

libs/bs42.7/testing.py
@@ -0,0 +1,770 @@
"""Helper classes for tests."""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT"
import pickle
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
def assertConnectedness(self, element):
"""Ensure that next_element and previous_element are properly
set for all descendants of the given element.
"""
earlier = None
for e in element.descendants:
if earlier:
self.assertEqual(e, earlier.next_element)
self.assertEqual(earlier, e.previous_element)
earlier = e
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def test_empty_element_tags(self):
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
are handled correctly.
"""
for name in [
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
'spacer', 'frame'
]:
soup = self.soup("")
new_tag = soup.new_tag(name)
self.assertEqual(True, new_tag.is_empty_element)
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_processing_instruction(self):
# We test both Unicode and bytestring to verify that
# process_markup correctly sets processing_instruction_class
# even when the markup is already Unicode and there is no
# need to process anything.
markup = u"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.decode())
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_double_head(self):
html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
soup = self.soup(html)
self.assertEqual("text/javascript", soup.find('script')['type'])
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
# The comment is properly integrated into the tree.
foo = soup.find(text="foo")
self.assertEqual(comment, foo.next_element)
baz = soup.find(text="baz")
self.assertEqual(comment, baz.previous_element)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags,
even if that would mean not prettifying the markup.
"""
pre_markup = "<pre> </pre>"
textarea_markup = "<textarea> woo\nwoo </textarea>"
self.assertSoupEquals(pre_markup)
self.assertSoupEquals(textarea_markup)
soup = self.soup(pre_markup)
self.assertEqual(soup.pre.prettify(), pre_markup)
soup = self.soup(textarea_markup)
self.assertEqual(soup.textarea.prettify(), textarea_markup)
soup = self.soup("<textarea></textarea>")
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
        self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_multivalued_attribute_on_html(self):
        # html5lib uses a different API to set the attributes of the
# <html> tag. This has caused problems with multivalued
# attributes.
markup = '<html class="a b"></html>'
soup = self.soup(markup)
self.assertEqual(["a", "b"], soup.html['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_multipart_strings(self):
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
self.assertEqual("p", soup.h2.string.next_element.name)
self.assertEqual("p", soup.p.name)
self.assertConnectedness(soup)
    def test_empty_element_tag_rendering(self):
"""Verify consistent handling of empty-element tags,
no matter how they come in through the markup.
"""
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
def test_head_tag_between_head_and_body(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<html><head></head>
<link></link>
<body>foo</body>
</html>
"""
soup = self.soup(content)
self.assertNotEqual(None, soup.html.body)
self.assertConnectedness(soup)
def test_multiple_copies_of_a_tag(self):
"Prevent recurrence of a bug in the html5lib treebuilder."
content = """<!DOCTYPE html>
<html>
<body>
<article id="a" >
<div><a href="1"></div>
<footer>
<a href="2"></a>
</footer>
</article>
</body>
</html>
"""
soup = self.soup(content)
self.assertConnectedness(soup.article)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_can_parse_unicode_document(self):
# A seemingly innocuous document... but it's in Unicode! And
# it contains characters that can't be represented in the
# encoding found in the declaration! The horror!
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
# Some tree builders call it iso8859-8, others call it iso-8859-8.
# That's not a difference we really care about.
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_pickle_and_unpickle_identity(self):
# Pickling a tree, then unpickling it, yields a tree identical
# to the original.
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertEqual(loaded.__class__, BeautifulSoup)
self.assertEqual(loaded.decode(), tree.decode())
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_xml_declaration(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_processing_instruction(self):
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
soup = self.soup(markup)
self.assertEqual(markup, soup.encode("utf8"))
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_formatter_processes_script_tag_for_xml_documents(self):
doc = """
<script type="text/javascript">
</script>
"""
soup = BeautifulSoup(doc, "lxml-xml")
# lxml would have stripped this while parsing, but we can add
# it later.
soup.script.string = 'console.log("< < hey > > ");'
encoded = soup.encode()
self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
def test_can_parse_unicode_document(self):
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
soup = self.soup(markup)
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.foo), markup)
def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>
<Document xmlns="http://example.com/ns0"
xmlns:ns1="http://example.com/ns1"
xmlns:ns2="http://example.com/ns2">
<ns1:tag>foo</ns1:tag>
<ns1:tag>bar</ns1:tag>
<ns2:tag key="value">baz</ns2:tag>
</Document>
"""
soup = self.soup(doc)
# There are three <tag> tags.
self.assertEqual(3, len(soup.find_all('tag')))
# But two of them are ns1:tag and one of them is ns2:tag.
self.assertEqual(2, len(soup.find_all('ns1:tag')))
self.assertEqual(1, len(soup.find_all('ns2:tag')))
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
def test_copy_tag_preserves_namespace(self):
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:document xmlns:w="http://example.com/ns0"/>"""
soup = self.soup(xml)
tag = soup.document
duplicate = copy.copy(tag)
# The two tags have the same namespace prefix.
self.assertEqual(tag.prefix, duplicate.prefix)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def test_xml_declaration_becomes_comment(self):
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
soup = self.soup(markup)
self.assertTrue(isinstance(soup.contents[0], Comment))
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
self.assertEqual("html", soup.contents[0].next_element.name)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator
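# Comment-only usage sketch (illustrative; the class name is hypothetical,
# but the test modules below apply the decorator exactly this way):
#
#   @skipIf(
#       not LXML_PRESENT,
#       "lxml seems not to be present, not testing its tree builder.")
#   class SomeSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
#       pass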

View File

@ -0,0 +1 @@
"The beautifulsoup tests."

View File

@ -0,0 +1,147 @@
"""Tests of the builder registry."""
import unittest
import warnings
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
with warnings.catch_warnings(record=True) as w:
# This will create a warning about not explicitly
# specifying a parser, but we'll ignore it.
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)

View File

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

View File

@ -0,0 +1,130 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder()
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_xml_declaration_followed_by_doctype(self):
markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
<head>
</head>
<body>
<p>foo</p>
</body>
</html>'''
soup = self.soup(markup)
# Verify that we can reach the <p> tag; this means the tree is connected.
self.assertEqual(b"<p>foo</p>", soup.p.encode())
def test_reparented_markup(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_ends_with_whitespace(self):
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
soup = self.soup(markup)
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
self.assertEqual(2, len(soup.find_all('p')))
def test_reparented_markup_containing_identical_whitespace_nodes(self):
"""Verify that we keep the two whitespace nodes in this
document distinct when reparenting the adjacent <tbody> tags.
"""
markup = '<table> <tbody><tbody><ims></tbody> </table>'
soup = self.soup(markup)
space1, space2 = soup.find_all(string=' ')
tbody1, tbody2 = soup.find_all('tbody')
assert space1.next_element is tbody1
assert tbody2.next_element is space2
def test_reparented_markup_containing_children(self):
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
soup = self.soup(markup)
noscript = soup.noscript
self.assertEqual("target", noscript.next_element)
target = soup.find(string='target')
# The 'aftermath' string was duplicated; we want the second one.
final_aftermath = soup.find_all(string='aftermath')[-1]
# The <noscript> tag was moved beneath a copy of the <a> tag,
# but the 'target' string within is still connected to the
# (second) 'aftermath' string.
self.assertEqual(final_aftermath, target.next_element)
self.assertEqual(target, final_aftermath.previous_element)
def test_processing_instruction(self):
"""Processing instructions become comments."""
markup = b"""<?PITarget PIContent?>"""
soup = self.soup(markup)
assert str(soup).startswith("<!--?PITarget PIContent?-->")
def test_cloned_multivalue_node(self):
markup = b"""<a class="my_class"><p></a>"""
soup = self.soup(markup)
a1, a2 = soup.find_all('a')
self.assertEqual(a1, a2)
assert a1 is not a2
def test_foster_parenting(self):
markup = b"""<table><td></tbody>A"""
soup = self.soup(markup)
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())

View File

@ -0,0 +1,34 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_builder_is_pickled(self):
"""Unlike most tree builders, HTMLParserTreeBuilder and will
be restored after pickling.
"""
tree = self.soup("<a><b>foo</a>")
dumped = pickle.dumps(tree, 2)
loaded = pickle.loads(dumped)
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
def test_redundant_empty_element_closing_tags(self):
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
self.assertSoupEquals('</br></br></br>', "")

View File

@ -0,0 +1,76 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
except ImportError, e:
LXML_PRESENT = False
LXML_VERSION = (0,)
if LXML_PRESENT:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
# test if an old version of lxml is installed.
@skipIf(
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
"Skipping doctype test for old version of lxml to avoid segfault.")
def test_empty_doctype(self):
soup = self.soup("<!DOCTYPE>")
doctype = soup.contents[0]
self.assertEqual("", doctype.strip())
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

View File

@ -0,0 +1,501 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
from pdb import set_trace
import logging
import unittest
import sys
import tempfile
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import (
EntitySubstitution,
UnicodeDammit,
EncodingDetector,
)
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
class TestConstructor(SoupTest):
def test_short_unicode_input(self):
data = u"<h1>éé</h1>"
soup = self.soup(data)
self.assertEqual(u"éé", soup.h1.string)
def test_embedded_null(self):
data = u"<h1>foo\0bar</h1>"
soup = self.soup(data)
self.assertEqual(u"foo\0bar", soup.h1.string)
def test_exclude_encodings(self):
utf8_data = u"Räksmörgås".encode("utf-8")
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual("windows-1252", soup.original_encoding)
class TestWarnings(SoupTest):
def _assert_no_parser_specified(self, s, is_there=True):
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
self.assertTrue(v)
def test_warning_if_no_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_warning_if_parser_specified_too_vague(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html")
msg = str(w[0].message)
self._assert_no_parser_specified(msg)
def test_no_warning_if_explicit_parser_specified(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", "html.parser")
self.assertEqual([], w)
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
class TestFilenameOrURLWarnings(SoupTest):
def test_disk_file_warning(self):
filehandle = tempfile.NamedTemporaryFile()
filename = filehandle.name
try:
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
msg = str(w[0].message)
self.assertTrue("looks like a filename" in msg)
finally:
filehandle.close()
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
with warnings.catch_warnings(record=True) as w:
soup = self.soup(filename)
self.assertEqual(0, len(w))
def test_url_warning_with_bytes_url(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/")
# Be aware this isn't the only warning that can be raised during
# execution.
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_url(self):
with warnings.catch_warnings(record=True) as warning_list:
# Note: this URL must differ from the bytes one, otherwise
# Python's warnings system swallows the second warning.
soup = self.soup(u"http://www.crummyunicode.com/")
self.assertTrue(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_bytes_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(b"http://www.crummybytes.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
def test_url_warning_with_unicode_and_space(self):
with warnings.catch_warnings(record=True) as warning_list:
soup = self.soup(u"http://www.crummyuncode.com/ is great")
self.assertFalse(any("looks like a URL" in str(w.message)
for w in warning_list))
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&amp;Aacute;T&amp;T")
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set to 'utf-8', a superset of ASCII.
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
# Disable chardet, which will realize that the ASCII is ASCII.
bs4.dammit.chardet_dammit = noop
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
@skipIf(
PYTHON_3_PRE_3_2,
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
def test_attribute_name_containing_unicode_characters(self):
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of UnicodeDammit."""
def test_unicode_input(self):
markup = u"I'm already Unicode! \N{SNOWMAN}"
dammit = UnicodeDammit(markup)
self.assertEqual(dammit.unicode_markup, markup)
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self):
# This is UTF-8.
utf8_data = u"Räksmörgås".encode("utf-8")
# But if we exclude UTF-8 from consideration, the guess is
# Windows-1252.
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
# And if we exclude that, there is no valid guess at all.
dammit = UnicodeDammit(
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
self.assertEqual(dammit.original_encoding, None)
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
detected = EncodingDetector(
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
encodings = list(detected.encodings)
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (i.e. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet_dammit
logging.disable(logging.WARNING)
try:
def noop(str):
return None
bs4.dammit.chardet_dammit = noop
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
finally:
logging.disable(logging.NOTSET)
bs4.dammit.chardet_dammit = chardet
def test_byte_order_mark_removed(self):
# A document written in UTF-16LE will have its byte order marker stripped.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through detwingle(), it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamespacedAttribute(SoupTest):
def test_name_may_be_none(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,3 @@
from pkgutil import extend_path
__path__ = extend_path(__path__, __name__)

View File

@ -0,0 +1,23 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Execute computations asynchronously using threads or processes."""
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
from concurrent.futures._base import (FIRST_COMPLETED,
FIRST_EXCEPTION,
ALL_COMPLETED,
CancelledError,
TimeoutError,
Future,
Executor,
wait,
as_completed)
from concurrent.futures.thread import ThreadPoolExecutor
try:
from concurrent.futures.process import ProcessPoolExecutor
except ImportError:
# some platforms don't have multiprocessing
pass

View File

@ -0,0 +1,607 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
import collections
import logging
import threading
import itertools
import time
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
FIRST_COMPLETED = 'FIRST_COMPLETED'
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
ALL_COMPLETED = 'ALL_COMPLETED'
_AS_COMPLETED = '_AS_COMPLETED'
# Possible future states (for internal use by the futures package).
PENDING = 'PENDING'
RUNNING = 'RUNNING'
# The future was cancelled by the user...
CANCELLED = 'CANCELLED'
# ...and _Waiter.add_cancelled() was called by a worker.
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
FINISHED = 'FINISHED'
_FUTURE_STATES = [
PENDING,
RUNNING,
CANCELLED,
CANCELLED_AND_NOTIFIED,
FINISHED
]
_STATE_TO_DESCRIPTION_MAP = {
PENDING: "pending",
RUNNING: "running",
CANCELLED: "cancelled",
CANCELLED_AND_NOTIFIED: "cancelled",
FINISHED: "finished"
}
# Logger for internal use by the futures package.
LOGGER = logging.getLogger("concurrent.futures")
class Error(Exception):
"""Base class for all future-related exceptions."""
pass
class CancelledError(Error):
"""The Future was cancelled."""
pass
class TimeoutError(Error):
"""The operation exceeded the given deadline."""
pass
class _Waiter(object):
"""Provides the event that wait() and as_completed() block on."""
def __init__(self):
self.event = threading.Event()
self.finished_futures = []
def add_result(self, future):
self.finished_futures.append(future)
def add_exception(self, future):
self.finished_futures.append(future)
def add_cancelled(self, future):
self.finished_futures.append(future)
class _AsCompletedWaiter(_Waiter):
"""Used by as_completed()."""
def __init__(self):
super(_AsCompletedWaiter, self).__init__()
self.lock = threading.Lock()
def add_result(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
with self.lock:
super(_AsCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _FirstCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_COMPLETED)."""
def add_result(self, future):
super(_FirstCompletedWaiter, self).add_result(future)
self.event.set()
def add_exception(self, future):
super(_FirstCompletedWaiter, self).add_exception(future)
self.event.set()
def add_cancelled(self, future):
super(_FirstCompletedWaiter, self).add_cancelled(future)
self.event.set()
class _AllCompletedWaiter(_Waiter):
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
def __init__(self, num_pending_calls, stop_on_exception):
self.num_pending_calls = num_pending_calls
self.stop_on_exception = stop_on_exception
self.lock = threading.Lock()
super(_AllCompletedWaiter, self).__init__()
def _decrement_pending_calls(self):
with self.lock:
self.num_pending_calls -= 1
if not self.num_pending_calls:
self.event.set()
def add_result(self, future):
super(_AllCompletedWaiter, self).add_result(future)
self._decrement_pending_calls()
def add_exception(self, future):
super(_AllCompletedWaiter, self).add_exception(future)
if self.stop_on_exception:
self.event.set()
else:
self._decrement_pending_calls()
def add_cancelled(self, future):
super(_AllCompletedWaiter, self).add_cancelled(future)
self._decrement_pending_calls()
class _AcquireFutures(object):
"""A context manager that does an ordered acquire of Future conditions."""
def __init__(self, futures):
self.futures = sorted(futures, key=id)
def __enter__(self):
for future in self.futures:
future._condition.acquire()
def __exit__(self, *args):
for future in self.futures:
future._condition.release()
def _create_and_install_waiters(fs, return_when):
if return_when == _AS_COMPLETED:
waiter = _AsCompletedWaiter()
elif return_when == FIRST_COMPLETED:
waiter = _FirstCompletedWaiter()
else:
pending_count = sum(
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
if return_when == FIRST_EXCEPTION:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
elif return_when == ALL_COMPLETED:
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
else:
raise ValueError("Invalid return condition: %r" % return_when)
for f in fs:
f._waiters.append(waiter)
return waiter
def as_completed(fs, timeout=None):
"""An iterator over the given futures that yields each as it completes.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
iterate over.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator that yields the given Futures as they complete (finished or
cancelled). If any given Futures are duplicated, they will be returned
once.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
"""
if timeout is not None:
end_time = timeout + time.time()
fs = set(fs)
with _AcquireFutures(fs):
finished = set(
f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
pending = fs - finished
waiter = _create_and_install_waiters(fs, _AS_COMPLETED)
try:
for future in finished:
yield future
while pending:
if timeout is None:
wait_timeout = None
else:
wait_timeout = end_time - time.time()
if wait_timeout < 0:
raise TimeoutError(
'%d (of %d) futures unfinished' % (
len(pending), len(fs)))
waiter.event.wait(wait_timeout)
with waiter.lock:
finished = waiter.finished_futures
waiter.finished_futures = []
waiter.event.clear()
for future in finished:
yield future
pending.remove(future)
finally:
for f in fs:
with f._condition:
f._waiters.remove(waiter)
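# A comment-only usage sketch for as_completed(); illustrative, not part
# of the library. It assumes the ThreadPoolExecutor exported by this package.
#
#   from concurrent.futures import ThreadPoolExecutor, as_completed
#   executor = ThreadPoolExecutor(max_workers=4)
#   futures = [executor.submit(pow, 2, n) for n in range(8)]
#   for future in as_completed(futures, timeout=60):
#       print(future.result())  # futures arrive in completion order
#   executor.shutdown()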
DoneAndNotDoneFutures = collections.namedtuple(
'DoneAndNotDoneFutures', 'done not_done')
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
"""Wait for the futures in the given sequence to complete.
Args:
fs: The sequence of Futures (possibly created by different Executors) to
wait upon.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
return_when: Indicates when this function should return. The options
are:
FIRST_COMPLETED - Return when any future finishes or is
cancelled.
FIRST_EXCEPTION - Return when any future finishes by raising an
exception. If no future raises an exception
then it is equivalent to ALL_COMPLETED.
ALL_COMPLETED - Return when all futures finish or are cancelled.
Returns:
A named 2-tuple of sets. The first set, named 'done', contains the
futures that completed (finished or were cancelled) before the wait
completed. The second set, named 'not_done', contains uncompleted
futures.
"""
with _AcquireFutures(fs):
done = set(f for f in fs
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
not_done = set(fs) - done
if (return_when == FIRST_COMPLETED) and done:
return DoneAndNotDoneFutures(done, not_done)
elif (return_when == FIRST_EXCEPTION) and done:
if any(f for f in done
if not f.cancelled() and f.exception() is not None):
return DoneAndNotDoneFutures(done, not_done)
if len(done) == len(fs):
return DoneAndNotDoneFutures(done, not_done)
waiter = _create_and_install_waiters(fs, return_when)
waiter.event.wait(timeout)
for f in fs:
with f._condition:
f._waiters.remove(waiter)
done.update(waiter.finished_futures)
return DoneAndNotDoneFutures(done, set(fs) - done)
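# A comment-only sketch of wait(); illustrative, not part of the library.
#
#   from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED
#   executor = ThreadPoolExecutor(max_workers=2)
#   futures = [executor.submit(pow, 2, n) for n in (8, 16, 32)]
#   done, not_done = wait(futures, timeout=5, return_when=FIRST_COMPLETED)
#   # 'done' holds at least one completed future (barring a timeout);
#   # 'not_done' holds the rest.
#   executor.shutdown()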
class Future(object):
"""Represents the result of an asynchronous computation."""
def __init__(self):
"""Initializes the future. Should not be called by clients."""
self._condition = threading.Condition()
self._state = PENDING
self._result = None
self._exception = None
self._traceback = None
self._waiters = []
self._done_callbacks = []
def _invoke_callbacks(self):
for callback in self._done_callbacks:
try:
callback(self)
except Exception:
LOGGER.exception('exception calling callback for %r', self)
def __repr__(self):
with self._condition:
if self._state == FINISHED:
if self._exception:
return '<Future at %s state=%s raised %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._exception.__class__.__name__)
else:
return '<Future at %s state=%s returned %s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state],
self._result.__class__.__name__)
return '<Future at %s state=%s>' % (
hex(id(self)),
_STATE_TO_DESCRIPTION_MAP[self._state])
def cancel(self):
"""Cancel the future if possible.
Returns True if the future was cancelled, False otherwise. A future
cannot be cancelled if it is running or has already completed.
"""
with self._condition:
if self._state in [RUNNING, FINISHED]:
return False
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
return True
self._state = CANCELLED
self._condition.notify_all()
self._invoke_callbacks()
return True
def cancelled(self):
"""Return True if the future has cancelled."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]
def running(self):
"""Return True if the future is currently executing."""
with self._condition:
return self._state == RUNNING
def done(self):
"""Return True of the future was cancelled or finished executing."""
with self._condition:
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]
def __get_result(self):
if self._exception:
raise type(self._exception), self._exception, self._traceback
else:
return self._result
def add_done_callback(self, fn):
"""Attaches a callable that will be called when the future finishes.
Args:
fn: A callable that will be called with this future as its only
argument when the future completes or is cancelled. The callable
will always be called by a thread in the same process in which
it was added. If the future has already completed or been
cancelled then the callable will be called immediately. These
callables are called in the order that they were added.
"""
with self._condition:
if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
self._done_callbacks.append(fn)
return
fn(self)
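# Comment-only sketch of add_done_callback(); illustrative, not library code.
#
#   def on_done(future):
#       print('finished: %r' % future)
#   future.add_done_callback(on_done)
#
# If the future is already done, on_done is invoked immediately in the
# calling thread; otherwise it runs in whichever thread completes the future.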
def result(self, timeout=None):
"""Return the result of the call that the future represents.
Args:
timeout: The number of seconds to wait for the result if the future
isn't done. If None, then there is no limit on the wait time.
Returns:
The result of the call that the future represents.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
Exception: If the call raised then that exception will be raised.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self.__get_result()
else:
raise TimeoutError()
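# Comment-only sketch of result() semantics, assuming an 'executor' created
# as in the sketches above; illustrative, not library code.
#
#   future = executor.submit(divmod, 7, 3)
#   future.result(timeout=10)                # -> (2, 1), or TimeoutError
#   executor.submit(divmod, 7, 0).result()   # re-raises ZeroDivisionError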
def exception_info(self, timeout=None):
"""Return a tuple of (exception, traceback) raised by the call that the
future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
A tuple (exception, traceback) raised by the call that the
future represents, or (None, None) if the call completed
without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
with self._condition:
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
self._condition.wait(timeout)
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
raise CancelledError()
elif self._state == FINISHED:
return self._exception, self._traceback
else:
raise TimeoutError()
def exception(self, timeout=None):
"""Return the exception raised by the call that the future represents.
Args:
timeout: The number of seconds to wait for the exception if the
future isn't done. If None, then there is no limit on the wait
time.
Returns:
The exception raised by the call that the future represents or None
if the call completed without raising.
Raises:
CancelledError: If the future was cancelled.
TimeoutError: If the future didn't finish executing before the given
timeout.
"""
return self.exception_info(timeout)[0]
# The following methods should only be used by Executors and in tests.
def set_running_or_notify_cancel(self):
"""Mark the future as running or process any cancel notifications.
Should only be used by Executor implementations and unit tests.
If the future has been cancelled (cancel() was called and returned
True) then any threads waiting on the future completing (through calls
to as_completed() or wait()) are notified and False is returned.
If the future was not cancelled then it is put in the running state
(future calls to running() will return True) and True is returned.
This method should be called by Executor implementations before
executing the work associated with this future. If this method returns
False then the work should not be executed.
Returns:
False if the Future was cancelled, True otherwise.
Raises:
RuntimeError: if this method was already called or if set_result()
or set_exception() was called.
"""
with self._condition:
if self._state == CANCELLED:
self._state = CANCELLED_AND_NOTIFIED
for waiter in self._waiters:
waiter.add_cancelled(self)
# self._condition.notify_all() is not necessary because
# self.cancel() triggers a notification.
return False
elif self._state == PENDING:
self._state = RUNNING
return True
else:
LOGGER.critical('Future %s in unexpected state: %s',
id(self),
self._state)
raise RuntimeError('Future in unexpected state')
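# A hedged sketch of the executor-side protocol this method supports.
# 'run_work_item', 'work_item', and its attributes are hypothetical names,
# not part of this module; a real executor keeps an equivalent structure.
#
#   def run_work_item(work_item):
#       if not work_item.future.set_running_or_notify_cancel():
#           return  # cancelled before it started; skip the call
#       try:
#           result = work_item.fn(*work_item.args, **work_item.kwargs)
#       except BaseException as e:
#           work_item.future.set_exception(e)
#       else:
#           work_item.future.set_result(result)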
def set_result(self, result):
"""Sets the return value of work associated with the future.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._result = result
self._state = FINISHED
for waiter in self._waiters:
waiter.add_result(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception_info(self, exception, traceback):
"""Sets the result of the future as being the given exception
and traceback.
Should only be used by Executor implementations and unit tests.
"""
with self._condition:
self._exception = exception
self._traceback = traceback
self._state = FINISHED
for waiter in self._waiters:
waiter.add_exception(self)
self._condition.notify_all()
self._invoke_callbacks()
def set_exception(self, exception):
"""Sets the result of the future as being the given exception.
Should only be used by Executor implementations and unit tests.
"""
self.set_exception_info(exception, None)
class Executor(object):
"""This is an abstract base class for concrete asynchronous executors."""
def submit(self, fn, *args, **kwargs):
"""Submits a callable to be executed with the given arguments.
Schedules the callable to be executed as fn(*args, **kwargs) and returns
a Future instance representing the execution of the callable.
Returns:
A Future representing the given call.
"""
raise NotImplementedError()
def map(self, fn, *iterables, **kwargs):
"""Returns a iterator equivalent to map(fn, iter).
Args:
fn: A callable that will take as many arguments as there are
passed iterables.
timeout: The maximum number of seconds to wait. If None, then there
is no limit on the wait time.
Returns:
An iterator equivalent to map(fn, *iterables), but the calls may
be evaluated out-of-order.
Raises:
TimeoutError: If the entire result iterator could not be generated
before the given timeout.
Exception: If fn(*args) raises for any values.
"""
timeout = kwargs.get('timeout')
if timeout is not None:
end_time = timeout + time.time()
fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]
# Yield must be hidden in closure so that the futures are submitted
# before the first iterator value is required.
def result_iterator():
try:
for future in fs:
if timeout is None:
yield future.result()
else:
yield future.result(end_time - time.time())
finally:
for future in fs:
future.cancel()
return result_iterator()
def shutdown(self, wait=True):
"""Clean-up the resources associated with the Executor.
It is safe to call this method several times; however, no other
methods can be called after this one.
Args:
wait: If True then shutdown will not return until all running
futures have finished executing and the resources used by the
executor have been reclaimed.
"""
pass
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.shutdown(wait=True)
return False

@@ -0,0 +1,359 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ProcessPoolExecutor.
The follow diagram and text describe the data-flow through the system:
|======================= In-process =====================|== Out-of-process ==|
+----------+ +----------+ +--------+ +-----------+ +---------+
| | => | Work Ids | => | | => | Call Q | => | |
| | +----------+ | | +-----------+ | |
| | | ... | | | | ... | | |
| | | 6 | | | | 5, call() | | |
| | | 7 | | | | ... | | |
| Process | | ... | | Local | +-----------+ | Process |
| Pool | +----------+ | Worker | | #1..n |
| Executor | | Thread | | |
| | +----------- + | | +-----------+ | |
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
| | +------------+ | | +-----------+ | |
| | | 6: call() | | | | ... | | |
| | | future | | | | 4, result | | |
| | | ... | | | | 3, except | | |
+----------+ +------------+ +--------+ +-----------+ +---------+
Executor.submit() called:
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
- adds the id of the _WorkItem to the "Work Ids" queue
Local worker thread:
- reads work ids from the "Work Ids" queue and looks up the corresponding
_WorkItem from the "Work Items" dict: if the work item has been cancelled then
it is simply removed from the dict, otherwise it is repackaged as a
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
- reads _ResultItems from "Result Q", updates the future stored in the
"Work Items" dict and deletes the dict entry
Process #1..n:
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
_ResultItems in "Result Q"
"""
import atexit
from concurrent.futures import _base
import Queue as queue
import multiprocessing
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads and processes. This is done to allow the
# interpreter to exit when there are still idle processes in a
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
# allowing workers to die with the interpreter has two undesirable properties:
# - The workers would still be running during interpreter shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads/processes finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
# Controls how many more calls than processes will be queued in the call queue.
# A smaller number will mean that processes spend more time idle waiting for
# work while a larger number will make Future.cancel() succeed less frequently
# (Futures in the call queue cannot be cancelled).
EXTRA_QUEUED_CALLS = 1
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
class _ResultItem(object):
def __init__(self, work_id, exception=None, result=None):
self.work_id = work_id
self.exception = exception
self.result = result
class _CallItem(object):
def __init__(self, work_id, fn, args, kwargs):
self.work_id = work_id
self.fn = fn
self.args = args
self.kwargs = kwargs
def _process_worker(call_queue, result_queue):
"""Evaluates calls from call_queue and places the results in result_queue.
This worker is run in a separate process.
Args:
call_queue: A multiprocessing.Queue of _CallItems that will be read and
evaluated by the worker.
result_queue: A multiprocessing.Queue of _ResultItems that will be
written to by the worker.
"""
while True:
call_item = call_queue.get(block=True)
if call_item is None:
# Wake up queue management thread
result_queue.put(None)
return
try:
r = call_item.fn(*call_item.args, **call_item.kwargs)
except BaseException:
e = sys.exc_info()[1]
result_queue.put(_ResultItem(call_item.work_id,
exception=e))
else:
result_queue.put(_ResultItem(call_item.work_id,
result=r))
def _add_call_item_to_queue(pending_work_items,
work_ids,
call_queue):
"""Fills call_queue with _WorkItems from pending_work_items.
This function never blocks.
Args:
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
are consumed and the corresponding _WorkItems from
pending_work_items are transformed into _CallItems and put in
call_queue.
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems.
"""
while True:
if call_queue.full():
return
try:
work_id = work_ids.get(block=False)
except queue.Empty:
return
else:
work_item = pending_work_items[work_id]
if work_item.future.set_running_or_notify_cancel():
call_queue.put(_CallItem(work_id,
work_item.fn,
work_item.args,
work_item.kwargs),
block=True)
else:
del pending_work_items[work_id]
continue
def _queue_management_worker(executor_reference,
processes,
pending_work_items,
work_ids_queue,
call_queue,
result_queue):
"""Manages the communication between this process and the worker processes.
This function is run in a local thread.
Args:
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
this thread. Used to determine if the ProcessPoolExecutor has been
garbage collected and that this function can exit.
processes: A list of the multiprocessing.Process instances used as
workers.
pending_work_items: A dict mapping work ids to _WorkItems e.g.
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
call_queue: A multiprocessing.Queue that will be filled with _CallItems
derived from _WorkItems for processing by the process workers.
result_queue: A multiprocessing.Queue of _ResultItems generated by the
process workers.
"""
nb_shutdown_processes = [0]
def shutdown_one_process():
"""Tell a worker to terminate, which will in turn wake us again"""
call_queue.put(None)
nb_shutdown_processes[0] += 1
while True:
_add_call_item_to_queue(pending_work_items,
work_ids_queue,
call_queue)
result_item = result_queue.get(block=True)
if result_item is not None:
work_item = pending_work_items[result_item.work_id]
del pending_work_items[result_item.work_id]
if result_item.exception:
work_item.future.set_exception(result_item.exception)
else:
work_item.future.set_result(result_item.result)
# Delete references to object. See issue16284
del work_item
# Check whether we should start shutting down.
executor = executor_reference()
# No more work items can be added if:
# - The interpreter is shutting down OR
# - The executor that owns this worker has been collected OR
# - The executor that owns this worker has been shutdown.
if _shutdown or executor is None or executor._shutdown_thread:
# Since no new work items can be added, it is safe to shutdown
# this thread if there are no pending work items.
if not pending_work_items:
while nb_shutdown_processes[0] < len(processes):
shutdown_one_process()
# If .join() is not called on the created processes then
# some multiprocessing.Queue methods may deadlock on Mac OS
# X.
for p in processes:
p.join()
call_queue.close()
return
del executor
_system_limits_checked = False
_system_limited = None
def _check_system_limits():
global _system_limits_checked, _system_limited
if _system_limits_checked:
if _system_limited:
raise NotImplementedError(_system_limited)
_system_limits_checked = True
try:
import os
nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
except (AttributeError, ValueError):
# sysconf not available or setting not available
return
if nsems_max == -1:
# indeterminate limit, assume that limit is determined
# by available memory only
return
if nsems_max >= 256:
# minimum number of semaphores available
# according to POSIX
return
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
raise NotImplementedError(_system_limited)
class ProcessPoolExecutor(_base.Executor):
def __init__(self, max_workers=None):
"""Initializes a new ProcessPoolExecutor instance.
Args:
max_workers: The maximum number of processes that can be used to
execute the given calls. If None or not given then as many
worker processes will be created as the machine has processors.
"""
_check_system_limits()
if max_workers is None:
self._max_workers = multiprocessing.cpu_count()
else:
self._max_workers = max_workers
# Make the call queue slightly larger than the number of processes to
# prevent the worker processes from idling. But don't make it too big
# because futures in the call queue cannot be cancelled.
self._call_queue = multiprocessing.Queue(self._max_workers +
EXTRA_QUEUED_CALLS)
self._result_queue = multiprocessing.Queue()
self._work_ids = queue.Queue()
self._queue_management_thread = None
self._processes = set()
# Shutdown is a two-step process.
self._shutdown_thread = False
self._shutdown_lock = threading.Lock()
self._queue_count = 0
self._pending_work_items = {}
def _start_queue_management_thread(self):
# When the executor gets lost, the weakref callback will wake up
# the queue management thread.
def weakref_cb(_, q=self._result_queue):
q.put(None)
if self._queue_management_thread is None:
self._queue_management_thread = threading.Thread(
target=_queue_management_worker,
args=(weakref.ref(self, weakref_cb),
self._processes,
self._pending_work_items,
self._work_ids,
self._call_queue,
self._result_queue))
self._queue_management_thread.daemon = True
self._queue_management_thread.start()
_threads_queues[self._queue_management_thread] = self._result_queue
def _adjust_process_count(self):
for _ in range(len(self._processes), self._max_workers):
p = multiprocessing.Process(
target=_process_worker,
args=(self._call_queue,
self._result_queue))
p.start()
self._processes.add(p)
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown_thread:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._pending_work_items[self._queue_count] = w
self._work_ids.put(self._queue_count)
self._queue_count += 1
# Wake up queue management thread
self._result_queue.put(None)
self._start_queue_management_thread()
self._adjust_process_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown_thread = True
if self._queue_management_thread:
# Wake up queue management thread
self._result_queue.put(None)
if wait:
self._queue_management_thread.join(sys.maxint)
# To reduce the risk of opening too many files, remove references to
# objects that use file descriptors.
self._queue_management_thread = None
self._call_queue = None
self._result_queue = None
self._processes = None
shutdown.__doc__ = _base.Executor.shutdown.__doc__
atexit.register(_python_exit)

@@ -0,0 +1,134 @@
# Copyright 2009 Brian Quinlan. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Implements ThreadPoolExecutor."""
import atexit
from concurrent.futures import _base
import Queue as queue
import threading
import weakref
import sys
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
# Workers are created as daemon threads. This is done to allow the interpreter
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
# pool (i.e. shutdown() was not called). However, allowing workers to die with
# the interpreter has two undesirable properties:
# - The workers would still be running during interpreter shutdown,
# meaning that they would fail in unpredictable ways.
# - The workers could be killed while evaluating a work item, which could
# be bad if the callable being evaluated has external side-effects e.g.
# writing to a file.
#
# To work around this problem, an exit handler is installed which tells the
# workers to exit when their work queues are empty and then waits until the
# threads finish.
_threads_queues = weakref.WeakKeyDictionary()
_shutdown = False
def _python_exit():
global _shutdown
_shutdown = True
items = list(_threads_queues.items()) if _threads_queues else ()
for t, q in items:
q.put(None)
for t, q in items:
t.join(sys.maxint)
atexit.register(_python_exit)
class _WorkItem(object):
def __init__(self, future, fn, args, kwargs):
self.future = future
self.fn = fn
self.args = args
self.kwargs = kwargs
def run(self):
if not self.future.set_running_or_notify_cancel():
return
try:
result = self.fn(*self.args, **self.kwargs)
except BaseException:
e, tb = sys.exc_info()[1:]
self.future.set_exception_info(e, tb)
else:
self.future.set_result(result)
def _worker(executor_reference, work_queue):
try:
while True:
work_item = work_queue.get(block=True)
if work_item is not None:
work_item.run()
# Delete references to object. See issue16284
del work_item
continue
executor = executor_reference()
# Exit if:
# - The interpreter is shutting down OR
# - The executor that owns the worker has been collected OR
# - The executor that owns the worker has been shutdown.
if _shutdown or executor is None or executor._shutdown:
# Notify other workers
work_queue.put(None)
return
del executor
except BaseException:
_base.LOGGER.critical('Exception in worker', exc_info=True)
class ThreadPoolExecutor(_base.Executor):
def __init__(self, max_workers):
"""Initializes a new ThreadPoolExecutor instance.
Args:
max_workers: The maximum number of threads that can be used to
execute the given calls.
"""
self._max_workers = max_workers
self._work_queue = queue.Queue()
self._threads = set()
self._shutdown = False
self._shutdown_lock = threading.Lock()
def submit(self, fn, *args, **kwargs):
with self._shutdown_lock:
if self._shutdown:
raise RuntimeError('cannot schedule new futures after shutdown')
f = _base.Future()
w = _WorkItem(f, fn, args, kwargs)
self._work_queue.put(w)
self._adjust_thread_count()
return f
submit.__doc__ = _base.Executor.submit.__doc__
def _adjust_thread_count(self):
# When the executor gets lost, the weakref callback will wake up
# the worker threads.
def weakref_cb(_, q=self._work_queue):
q.put(None)
# TODO(bquinlan): Should avoid creating new threads if there are more
# idle threads than items in the work queue.
if len(self._threads) < self._max_workers:
t = threading.Thread(target=_worker,
args=(weakref.ref(self, weakref_cb),
self._work_queue))
t.daemon = True
t.start()
self._threads.add(t)
_threads_queues[t] = self._work_queue
def shutdown(self, wait=True):
with self._shutdown_lock:
self._shutdown = True
self._work_queue.put(None)
if wait:
for t in self._threads:
t.join(sys.maxint)
shutdown.__doc__ = _base.Executor.shutdown.__doc__

libs/deathbycaptcha2.7.py
@@ -0,0 +1,516 @@
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
"""Death by Captcha HTTP and socket API clients.
There are two types of Death by Captcha (DBC hereinafter) API: HTTP and
socket ones. Both offer the same functionality, with the socket API
sporting faster responses and using way less connections.
To access the socket API, use SocketClient class; for the HTTP API, use
HttpClient class. Both are thread-safe. SocketClient keeps a persistent
connection opened and serializes all API requests sent through it, thus
it is advised to keep a pool of them if you're script is heavily
multithreaded.
Both SocketClient and HttpClient give you the following methods:
get_user()
Returns your DBC account details as a dict with the following keys:
"user": your account numeric ID; if login fails, it will be the only
item with the value of 0;
"rate": your CAPTCHA rate, i.e. how much you will be charged for one
solved CAPTCHA in US cents;
"balance": your DBC account balance in US cents;
"is_banned": flag indicating whether your account is suspended or not.
get_balance()
Returns your DBC account balance in US cents.
get_captcha(cid)
Returns an uploaded CAPTCHA details as a dict with the following keys:
"captcha": the CAPTCHA numeric ID; if no such CAPTCHAs found, it will
be the only item with the value of 0;
"text": the CAPTCHA text, if solved, otherwise None;
"is_correct": flag indicating whether the CAPTCHA was solved correctly
(DBC can detect that in rare cases).
The only argument `cid` is the CAPTCHA numeric ID.
get_text(cid)
Returns an uploaded CAPTCHA text (None if not solved). The only argument
`cid` is the CAPTCHA numeric ID.
report(cid)
Reports an incorrectly solved CAPTCHA. The only argument `cid` is the
CAPTCHA numeric ID. Returns True on success, False otherwise.
upload(captcha)
Uploads a CAPTCHA. The only argument `captcha` can be either file-like
object (any object with `read` method defined, actually, so StringIO
will do), or CAPTCHA image file name. On successful upload you'll get
the CAPTCHA details dict (see get_captcha() method).
NOTE: AT THIS POINT THE UPLOADED CAPTCHA IS NOT SOLVED YET! You have
to poll for its status periodically using get_captcha() or get_text()
method until the CAPTCHA is solved and you get the text.
decode(captcha, timeout=DEFAULT_TIMEOUT)
A convenient method that uploads a CAPTCHA and polls for its status
periodically, but no longer than `timeout` (defaults to 60 seconds).
If solved, you'll get the CAPTCHA details dict (see get_captcha()
method for details). See upload() method for details on `captcha`
argument.
Visit http://www.deathbycaptcha.com/user/api for updates.
"""
import base64
import binascii
import errno
import imghdr
import random
import os
import select
import socket
import sys
import threading
import time
import urllib
import urllib2
try:
from json import read as json_decode, write as json_encode
except ImportError:
try:
from json import loads as json_decode, dumps as json_encode
except ImportError:
from simplejson import loads as json_decode, dumps as json_encode
# API version and unique software ID
API_VERSION = 'DBC/Python v4.6'
# Default CAPTCHA timeout and decode() polling interval
DEFAULT_TIMEOUT = 60
DEFAULT_TOKEN_TIMEOUT = 120
POLLS_INTERVAL = [1, 1, 2, 3, 2, 2, 3, 2, 2]
DFLT_POLL_INTERVAL = 3
# Base HTTP API url
HTTP_BASE_URL = 'http://api.dbcapi.me/api'
# Preferred HTTP API server's response content type, do not change
HTTP_RESPONSE_TYPE = 'application/json'
# Socket API server's host & ports range
SOCKET_HOST = 'api.dbcapi.me'
SOCKET_PORTS = range(8123, 8131)
def _load_image(captcha):
if hasattr(captcha, 'read'):
img = captcha.read()
elif type(captcha) == bytearray:
img = captcha
else:
with open(captcha, 'rb') as captcha_file:
img = captcha_file.read()
if not len(img):
raise ValueError('CAPTCHA image is empty')
elif imghdr.what(None, img) is None:
raise TypeError('Unknown CAPTCHA image type')
else:
return img
class AccessDeniedException(Exception):
pass
class Client(object):
"""Death by Captcha API Client."""
def __init__(self, username, password):
self.is_verbose = False
self.userpwd = {'username': username, 'password': password}
def _log(self, cmd, msg=''):
if self.is_verbose:
print '%d %s %s' % (time.time(), cmd, msg.rstrip())
return self
def close(self):
pass
def connect(self):
pass
def get_user(self):
"""Fetch user details -- ID, balance, rate and banned status."""
raise NotImplementedError()
def get_balance(self):
"""Fetch user balance (in US cents)."""
return self.get_user().get('balance')
def get_captcha(self, cid):
"""Fetch a CAPTCHA details -- ID, text and correctness flag."""
raise NotImplementedError()
def get_text(self, cid):
"""Fetch a CAPTCHA text."""
return self.get_captcha(cid).get('text') or None
def report(self, cid):
"""Report a CAPTCHA as incorrectly solved."""
raise NotImplementedError()
def upload(self, captcha):
"""Upload a CAPTCHA.
Accepts file names and file-like objects. Returns CAPTCHA details
dict on success.
"""
raise NotImplementedError()
def decode(self, captcha=None, timeout=None, **kwargs):
"""
Try to solve a CAPTCHA.
See Client.upload() for arguments details.
Uploads a CAPTCHA, polls for its status periodically with arbitrary
timeout (in seconds), returns CAPTCHA details if (correctly) solved.
"""
if not timeout:
if not captcha:
timeout = DEFAULT_TOKEN_TIMEOUT
else:
timeout = DEFAULT_TIMEOUT
deadline = time.time() + (max(0, timeout) or DEFAULT_TIMEOUT)
uploaded_captcha = self.upload(captcha, **kwargs)
if uploaded_captcha:
intvl_idx = 0 # POLLS_INTERVAL index
while deadline > time.time() and not uploaded_captcha.get('text'):
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
time.sleep(intvl)
pulled = self.get_captcha(uploaded_captcha['captcha'])
if pulled['captcha'] == uploaded_captcha['captcha']:
uploaded_captcha = pulled
if uploaded_captcha.get('text') and \
uploaded_captcha.get('is_correct'):
return uploaded_captcha
def _get_poll_interval(self, idx):
"""Returns poll interval and next index depending on index provided"""
if len(POLLS_INTERVAL) > idx:
intvl = POLLS_INTERVAL[idx]
else:
intvl = DFLT_POLL_INTERVAL
idx += 1
return intvl, idx
class HttpClient(Client):
"""Death by Captcha HTTP API client."""
def __init__(self, *args):
Client.__init__(self, *args)
self.opener = urllib2.build_opener(urllib2.HTTPRedirectHandler())
def _call(self, cmd, payload=None, headers=None):
if headers is None:
headers = {}
headers['Accept'] = HTTP_RESPONSE_TYPE
headers['User-Agent'] = API_VERSION
if hasattr(payload, 'items'):
payload = urllib.urlencode(payload)
self._log('SEND', '%s %d %s' % (cmd, len(payload), payload))
else:
self._log('SEND', '%s' % cmd)
if payload is not None:
headers['Content-Length'] = len(payload)
try:
response = self.opener.open(urllib2.Request(
HTTP_BASE_URL + '/' + cmd.strip('/'),
data=payload,
headers=headers
)).read()
except urllib2.HTTPError, err:
if 403 == err.code:
raise AccessDeniedException('Access denied, please check'
' your credentials and/or balance')
elif 400 == err.code or 413 == err.code:
raise ValueError("CAPTCHA was rejected by the service, check"
" if it's a valid image")
elif 503 == err.code:
raise OverflowError("CAPTCHA was rejected due to service"
" overload, try again later")
else:
raise err
else:
self._log('RECV', '%d %s' % (len(response), response))
try:
return json_decode(response)
except Exception:
raise RuntimeError('Invalid API response')
return {}
def get_user(self):
return self._call('user', self.userpwd.copy()) or {'user': 0}
def get_captcha(self, cid):
return self._call('captcha/%d' % cid) or {'captcha': 0}
def report(self, cid):
return not self._call('captcha/%d/report' % cid,
self.userpwd.copy()).get('is_correct')
def upload(self, captcha=None, **kwargs):
boundary = binascii.hexlify(os.urandom(16))
banner = kwargs.get('banner', '')
if banner:
kwargs['banner'] = 'base64:' + base64.b64encode(_load_image(banner))
body = '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in self.userpwd.items())
body += '\r\n'.join(('\r\n'.join((
'--%s' % boundary,
'Content-Disposition: form-data; name="%s"' % k,
'Content-Type: text/plain',
'Content-Length: %d' % len(str(v)),
'',
str(v)
))) for k, v in kwargs.items())
if captcha:
img = _load_image(captcha)
body += '\r\n'.join((
'',
'--%s' % boundary,
'Content-Disposition: form-data; name="captchafile"; '
'filename="captcha"',
'Content-Type: application/octet-stream',
'Content-Length: %d' % len(img),
'',
img,
'--%s--' % boundary,
''
))
response = self._call('captcha', body, {
'Content-Type': 'multipart/form-data; boundary="%s"' % boundary
}) or {}
if response.get('captcha'):
return response
class SocketClient(Client):
"""Death by Captcha socket API client."""
TERMINATOR = '\r\n'
def __init__(self, *args):
Client.__init__(self, *args)
self.socket_lock = threading.Lock()
self.socket = None
def close(self):
if self.socket:
self._log('CLOSE')
try:
self.socket.shutdown(socket.SHUT_RDWR)
except socket.error:
pass
finally:
self.socket.close()
self.socket = None
def connect(self):
if not self.socket:
self._log('CONN')
host = (socket.gethostbyname(SOCKET_HOST),
random.choice(SOCKET_PORTS))
self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self.socket.settimeout(0)
try:
self.socket.connect(host)
except socket.error, err:
if (err.args[0] not in
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
self.close()
raise err
return self.socket
def __del__(self):
self.close()
def _sendrecv(self, sock, buf):
self._log('SEND', buf)
fds = [sock]
buf += self.TERMINATOR
response = ''
intvl_idx = 0
while True:
intvl, intvl_idx = self._get_poll_interval(intvl_idx)
rds, wrs, exs = select.select((not buf and fds) or [],
(buf and fds) or [],
fds,
intvl)
if exs:
raise IOError('select() failed')
try:
if wrs:
while buf:
buf = buf[wrs[0].send(buf):]
elif rds:
while True:
s = rds[0].recv(256)
if not s:
raise IOError('recv(): connection lost')
else:
response += s
except socket.error, err:
if (err.args[0] not in
(errno.EAGAIN, errno.EWOULDBLOCK, errno.EINPROGRESS)):
raise err
if response.endswith(self.TERMINATOR):
self._log('RECV', response)
return response.rstrip(self.TERMINATOR)
raise IOError('send/recv timed out')
def _call(self, cmd, data=None):
if data is None:
data = {}
data['cmd'] = cmd
data['version'] = API_VERSION
request = json_encode(data)
response = None
for _ in range(2):
if not self.socket and cmd != 'login':
self._call('login', self.userpwd.copy())
self.socket_lock.acquire()
try:
sock = self.connect()
response = self._sendrecv(sock, request)
except IOError, err:
sys.stderr.write(str(err) + "\n")
self.close()
except socket.error, err:
sys.stderr.write(str(err) + "\n")
self.close()
raise IOError('Connection refused')
else:
break
finally:
self.socket_lock.release()
if response is None:
raise IOError('Connection lost or timed out during API request')
try:
response = json_decode(response)
except Exception:
raise RuntimeError('Invalid API response')
if not response.get('error'):
return response
error = response['error']
if error in ('not-logged-in', 'invalid-credentials'):
raise AccessDeniedException('Access denied, check your credentials')
elif 'banned' == error:
raise AccessDeniedException('Access denied, account is suspended')
elif 'insufficient-funds' == error:
raise AccessDeniedException(
'CAPTCHA was rejected due to low balance')
elif 'invalid-captcha' == error:
raise ValueError('CAPTCHA is not a valid image')
elif 'service-overload' == error:
raise OverflowError(
'CAPTCHA was rejected due to service overload, try again later')
else:
self.socket_lock.acquire()
self.close()
self.socket_lock.release()
raise RuntimeError('API server error occurred: %s' % error)
def get_user(self):
return self._call('user') or {'user': 0}
def get_captcha(self, cid):
return self._call('captcha', {'captcha': cid}) or {'captcha': 0}
def upload(self, captcha=None, **kwargs):
data = {}
if captcha:
data['captcha'] = base64.b64encode(_load_image(captcha))
if kwargs:
banner = kwargs.get('banner', '')
if banner:
kwargs['banner'] = base64.b64encode(_load_image(banner))
data.update(kwargs)
response = self._call('upload', data)
if response.get('captcha'):
uploaded_captcha = dict(
(k, response.get(k))
for k in ('captcha', 'text', 'is_correct')
)
if not uploaded_captcha['text']:
uploaded_captcha['text'] = None
return uploaded_captcha
def report(self, cid):
return not self._call('report', {'captcha': cid}).get('is_correct')
if '__main__' == __name__:
# Put your DBC username & password here:
# client = HttpClient(sys.argv[1], sys.argv[2])
client = SocketClient(sys.argv[1], sys.argv[2])
client.is_verbose = True
print 'Your balance is %s US cents' % client.get_balance()
for fn in sys.argv[3:]:
try:
# Put your CAPTCHA image file name or file-like object, and optional
# solving timeout (in seconds) here:
captcha = client.decode(fn, DEFAULT_TIMEOUT)
except Exception, e:
sys.stderr.write('Failed uploading CAPTCHA: %s\n' % (e, ))
captcha = None
if captcha:
print 'CAPTCHA %d solved: %s' % \
(captcha['captcha'], captcha['text'])
# Report as incorrectly solved if needed. Make sure the CAPTCHA was
# in fact incorrectly solved!
# try:
# client.report(captcha['captcha'])
# except Exception, e:
# sys.stderr.write('Failed reporting CAPTCHA: %s\n' % (e, ))

@@ -0,0 +1,4 @@
__version__ = '0.6.5'
from .lock import Lock # noqa
from .lock import NeedRegenerationException # noqa

libs/dogpile2.7/cache/__init__.py
@@ -0,0 +1,4 @@
from .region import CacheRegion, register_backend, make_region # noqa
# backwards compat
from .. import __version__ # noqa

libs/dogpile2.7/cache/api.py
@@ -0,0 +1,215 @@
import operator
from ..util.compat import py3k
class NoValue(object):
"""Describe a missing cache value.
The :attr:`.NO_VALUE` module global
should be used.
"""
@property
def payload(self):
return self
def __repr__(self):
"""Ensure __repr__ is a consistent value in case NoValue is used to
fill another cache key.
"""
return '<dogpile.cache.api.NoValue object>'
if py3k:
def __bool__(self): # pragma NO COVERAGE
return False
else:
def __nonzero__(self): # pragma NO COVERAGE
return False
NO_VALUE = NoValue()
"""Value returned from ``get()`` that describes
a key not present."""
class CachedValue(tuple):
"""Represent a value stored in the cache.
:class:`.CachedValue` is a two-tuple of
``(payload, metadata)``, where ``metadata``
is dogpile.cache's tracking information (
currently the creation time). The metadata
and tuple structure is pickleable, if
the backend requires serialization.
"""
payload = property(operator.itemgetter(0))
"""Named accessor for the payload."""
metadata = property(operator.itemgetter(1))
"""Named accessor for the dogpile.cache metadata dictionary."""
def __new__(cls, payload, metadata):
return tuple.__new__(cls, (payload, metadata))
def __reduce__(self):
return CachedValue, (self.payload, self.metadata)
class CacheBackend(object):
"""Base class for backend implementations."""
key_mangler = None
"""Key mangling function.
May be None, or otherwise declared
as an ordinary instance method.
"""
def __init__(self, arguments): # pragma NO COVERAGE
"""Construct a new :class:`.CacheBackend`.
Subclasses should override this to
handle the given arguments.
:param arguments: The ``arguments`` parameter
passed to :func:`.make_registry`.
"""
raise NotImplementedError()
@classmethod
def from_config_dict(cls, config_dict, prefix):
prefix_len = len(prefix)
return cls(
dict(
(key[prefix_len:], config_dict[key])
for key in config_dict
if key.startswith(prefix)
)
)
def has_lock_timeout(self):
return False
def get_mutex(self, key):
"""Return an optional mutexing object for the given key.
This object need only provide an ``acquire()``
and ``release()`` method.
May return ``None``, in which case the dogpile
lock will use a regular ``threading.Lock``
object to mutex concurrent threads for
value creation. The default implementation
returns ``None``.
Different backends may want to provide various
kinds of "mutex" objects, such as those which
link to lock files, distributed mutexes,
memcached semaphores, etc. Whatever
kind of system is best suited for the scope
and behavior of the caching backend.
A mutex that takes the key into account will
allow multiple regenerate operations across
keys to proceed simultaneously, while a mutex
that does not will serialize regenerate operations
to just one at a time across all keys in the region.
The latter approach, or a variant that involves
a modulus of the given key's hash value,
can be used as a means of throttling the total
number of value recreation operations that may
proceed at one time.
"""
return None
def get(self, key): # pragma NO COVERAGE
"""Retrieve a value from the cache.
The returned value should be an instance of
:class:`.CachedValue`, or ``NO_VALUE`` if
not present.
"""
raise NotImplementedError()
def get_multi(self, keys): # pragma NO COVERAGE
"""Retrieve multiple values from the cache.
The returned value should be a list, corresponding
to the list of keys given.
.. versionadded:: 0.5.0
"""
raise NotImplementedError()
def set(self, key, value): # pragma NO COVERAGE
"""Set a value in the cache.
The key will be whatever was passed
to the registry, processed by the
"key mangling" function, if any.
The value will always be an instance
of :class:`.CachedValue`.
"""
raise NotImplementedError()
def set_multi(self, mapping): # pragma NO COVERAGE
"""Set multiple values in the cache.
``mapping`` is a dict in which
the key will be whatever was passed
to the registry, processed by the
"key mangling" function, if any.
The value will always be an instance
of :class:`.CachedValue`.
When implementing a new :class:`.CacheBackend` or customizing via
:class:`.ProxyBackend`, be aware that when this method is invoked by
:meth:`.Region.get_or_create_multi`, the ``mapping`` values are the
same ones returned to the upstream caller. If the subclass alters the
values in any way, it must not do so 'in-place' on the ``mapping`` dict
-- that will have the undesirable effect of modifying the returned
values as well.
.. versionadded:: 0.5.0
"""
raise NotImplementedError()
def delete(self, key): # pragma NO COVERAGE
"""Delete a value from the cache.
The key will be whatever was passed
to the registry, processed by the
"key mangling" function, if any.
The behavior here should be idempotent,
that is, can be called any number of times
regardless of whether or not the
key exists.
"""
raise NotImplementedError()
def delete_multi(self, keys): # pragma NO COVERAGE
"""Delete multiple values from the cache.
The key will be whatever was passed
to the registry, processed by the
"key mangling" function, if any.
The behavior here should be idempotent,
that is, can be called any number of times
regardless of whether or not the
key exists.
.. versionadded:: 0.5.0
"""
raise NotImplementedError()

@@ -0,0 +1,22 @@
from dogpile.cache.region import register_backend
register_backend(
"dogpile.cache.null", "dogpile.cache.backends.null", "NullBackend")
register_backend(
"dogpile.cache.dbm", "dogpile.cache.backends.file", "DBMBackend")
register_backend(
"dogpile.cache.pylibmc", "dogpile.cache.backends.memcached",
"PylibmcBackend")
register_backend(
"dogpile.cache.bmemcached", "dogpile.cache.backends.memcached",
"BMemcachedBackend")
register_backend(
"dogpile.cache.memcached", "dogpile.cache.backends.memcached",
"MemcachedBackend")
register_backend(
"dogpile.cache.memory", "dogpile.cache.backends.memory", "MemoryBackend")
register_backend(
"dogpile.cache.memory_pickle", "dogpile.cache.backends.memory",
"MemoryPickleBackend")
register_backend(
"dogpile.cache.redis", "dogpile.cache.backends.redis", "RedisBackend")

libs/dogpile2.7/cache/backends/file.py
@@ -0,0 +1,447 @@
"""
File Backends
------------------
Provides backends that deal with local filesystem access.
"""
from __future__ import with_statement
from ..api import CacheBackend, NO_VALUE
from contextlib import contextmanager
from ...util import compat
from ... import util
import os
__all__ = 'DBMBackend', 'FileLock', 'AbstractFileLock'
class DBMBackend(CacheBackend):
"""A file-backend using a dbm file to store keys.
Basic usage::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.dbm',
expiration_time = 3600,
arguments = {
"filename":"/path/to/cachefile.dbm"
}
)
DBM access is provided using the Python ``anydbm`` module,
which selects a platform-specific dbm module to use.
This may be made to be more configurable in a future
release.
Note that different dbm modules have different behaviors.
Some dbm implementations handle their own locking, while
others don't. The :class:`.DBMBackend` uses a read/write
lockfile by default, which is compatible even with those
DBM implementations for which this is unnecessary,
though the behavior can be disabled.
The DBM backend by default makes use of two lockfiles.
One is in order to protect the DBM file itself from
concurrent writes, the other is to coordinate
value creation (i.e. the dogpile lock). By default,
these lockfiles use the ``flock()`` system call
for locking; this is **only available on Unix
platforms**. An alternative lock implementation, such as one
which is based on threads or uses a third-party system
such as `portalocker <https://pypi.python.org/pypi/portalocker>`_,
can be dropped in using the ``lock_factory`` argument
in conjunction with the :class:`.AbstractFileLock` base class.
Currently, the dogpile lock is against the entire
DBM file, not per key. This means there can
only be one "creator" job running at a time
per dbm file.
A future improvement might be to have the dogpile lock
using a filename that's based on a modulus of the key.
Locking on a filename that uniquely corresponds to the
key is problematic, since it's not generally safe to
delete lockfiles as the application runs, implying an
unlimited number of key-based files would need to be
created and never deleted.
Parameters to the ``arguments`` dictionary are
below.
:param filename: path of the filename in which to
create the DBM file. Note that some dbm backends
will change this name to have additional suffixes.
:param rw_lockfile: the name of the file to use for
read/write locking. If omitted, a default name
is used by appending the suffix ".rw.lock" to the
DBM filename. If False, then no lock is used.
:param dogpile_lockfile: the name of the file to use
for value creation, i.e. the dogpile lock. If
omitted, a default name is used by appending the
suffix ".dogpile.lock" to the DBM filename. If
False, then dogpile.cache uses the default dogpile
lock, a plain thread-based mutex.
:param lock_factory: a function or class which provides
for a read/write lock. Defaults to :class:`.FileLock`.
Custom implementations need to implement context-manager
based ``read()`` and ``write()`` functions - the
:class:`.AbstractFileLock` class is provided as a base class
which provides these methods based on individual read/write lock
functions. E.g. to replace the lock with the dogpile.core
:class:`.ReadWriteMutex`::
from dogpile.core.readwrite_lock import ReadWriteMutex
from dogpile.cache.backends.file import AbstractFileLock
class MutexLock(AbstractFileLock):
def __init__(self, filename):
self.mutex = ReadWriteMutex()
def acquire_read_lock(self, wait):
ret = self.mutex.acquire_read_lock(wait)
return wait or ret
def acquire_write_lock(self, wait):
ret = self.mutex.acquire_write_lock(wait)
return wait or ret
def release_read_lock(self):
return self.mutex.release_read_lock()
def release_write_lock(self):
return self.mutex.release_write_lock()
from dogpile.cache import make_region
region = make_region().configure(
"dogpile.cache.dbm",
expiration_time=300,
arguments={
"filename": "file.dbm",
"lock_factory": MutexLock
}
)
While the included :class:`.FileLock` uses ``os.flock()``, a
windows-compatible implementation can be built using a library
such as `portalocker <https://pypi.python.org/pypi/portalocker>`_.
.. versionadded:: 0.5.2
"""
def __init__(self, arguments):
self.filename = os.path.abspath(
os.path.normpath(arguments['filename'])
)
dir_, filename = os.path.split(self.filename)
self.lock_factory = arguments.get("lock_factory", FileLock)
self._rw_lock = self._init_lock(
arguments.get('rw_lockfile'),
".rw.lock", dir_, filename)
self._dogpile_lock = self._init_lock(
arguments.get('dogpile_lockfile'),
".dogpile.lock",
dir_, filename,
util.KeyReentrantMutex.factory)
# TODO: make this configurable
if compat.py3k:
import dbm
else:
import anydbm as dbm
self.dbmmodule = dbm
self._init_dbm_file()
def _init_lock(self, argument, suffix, basedir, basefile, wrapper=None):
if argument is None:
lock = self.lock_factory(os.path.join(basedir, basefile + suffix))
elif argument is not False:
lock = self.lock_factory(
os.path.abspath(
os.path.normpath(argument)
))
else:
return None
if wrapper:
lock = wrapper(lock)
return lock
def _init_dbm_file(self):
exists = os.access(self.filename, os.F_OK)
if not exists:
for ext in ('db', 'dat', 'pag', 'dir'):
if os.access(self.filename + os.extsep + ext, os.F_OK):
exists = True
break
if not exists:
fh = self.dbmmodule.open(self.filename, 'c')
fh.close()
def get_mutex(self, key):
# using one dogpile for the whole file. Other ways
# to do this might be using a set of files keyed to a
# hash/modulus of the key. the issue is it's never
# really safe to delete a lockfile as this can
# break other processes trying to get at the file
# at the same time - so handling unlimited keys
# can't imply unlimited filenames
if self._dogpile_lock:
return self._dogpile_lock(key)
else:
return None
@contextmanager
def _use_rw_lock(self, write):
if self._rw_lock is None:
yield
elif write:
with self._rw_lock.write():
yield
else:
with self._rw_lock.read():
yield
@contextmanager
def _dbm_file(self, write):
with self._use_rw_lock(write):
dbm = self.dbmmodule.open(
self.filename,
"w" if write else "r")
yield dbm
dbm.close()
def get(self, key):
with self._dbm_file(False) as dbm:
if hasattr(dbm, 'get'):
value = dbm.get(key, NO_VALUE)
else:
# gdbm objects lack a .get method
try:
value = dbm[key]
except KeyError:
value = NO_VALUE
if value is not NO_VALUE:
value = compat.pickle.loads(value)
return value
def get_multi(self, keys):
return [self.get(key) for key in keys]
def set(self, key, value):
with self._dbm_file(True) as dbm:
dbm[key] = compat.pickle.dumps(value,
compat.pickle.HIGHEST_PROTOCOL)
def set_multi(self, mapping):
with self._dbm_file(True) as dbm:
for key, value in mapping.items():
dbm[key] = compat.pickle.dumps(value,
compat.pickle.HIGHEST_PROTOCOL)
def delete(self, key):
with self._dbm_file(True) as dbm:
try:
del dbm[key]
except KeyError:
pass
def delete_multi(self, keys):
with self._dbm_file(True) as dbm:
for key in keys:
try:
del dbm[key]
except KeyError:
pass
class AbstractFileLock(object):
"""Coordinate read/write access to a file.
This is typically a file-based lock, but it doesn't necessarily have to be.
The default implementation here is :class:`.FileLock`.
Implementations should provide the following methods::
* __init__()
* acquire_read_lock()
* acquire_write_lock()
* release_read_lock()
* release_write_lock()
The ``__init__()`` method accepts a single argument "filename", which
may be used as the "lock file", for those implementations that use a lock
file.
Note that multithreaded environments must provide a thread-safe
version of this lock. The recommended approach for file-
descriptor-based locks is to use a Python ``threading.local()`` so
that a unique file descriptor is held per thread. See the source
code of :class:`.FileLock` for an implementation example.
"""
def __init__(self, filename):
"""Constructor, is given the filename of a potential lockfile.
The usage of this filename is optional and no file is
created by default.
Raises ``NotImplementedError`` by default, must be
implemented by subclasses.
"""
raise NotImplementedError()
def acquire(self, wait=True):
"""Acquire the "write" lock.
This is a direct call to :meth:`.AbstractFileLock.acquire_write_lock`.
"""
return self.acquire_write_lock(wait)
def release(self):
"""Release the "write" lock.
This is a direct call to :meth:`.AbstractFileLock.release_write_lock`.
"""
self.release_write_lock()
@contextmanager
def read(self):
"""Provide a context manager for the "read" lock.
This method makes use of :meth:`.AbstractFileLock.acquire_read_lock`
and :meth:`.AbstractFileLock.release_read_lock`
"""
self.acquire_read_lock(True)
try:
yield
finally:
self.release_read_lock()
@contextmanager
def write(self):
"""Provide a context manager for the "write" lock.
This method makes use of :meth:`.AbstractFileLock.acquire_write_lock`
and :meth:`.AbstractFileLock.release_write_lock`
"""
self.acquire_write_lock(True)
try:
yield
finally:
self.release_write_lock()
@property
def is_open(self):
"""optional method."""
raise NotImplementedError()
def acquire_read_lock(self, wait):
"""Acquire a 'reader' lock.
Raises ``NotImplementedError`` by default, must be
implemented by subclasses.
"""
raise NotImplementedError()
def acquire_write_lock(self, wait):
"""Acquire a 'write' lock.
Raises ``NotImplementedError`` by default, must be
implemented by subclasses.
"""
raise NotImplementedError()
def release_read_lock(self):
"""Release a 'reader' lock.
Raises ``NotImplementedError`` by default, must be
implemented by subclasses.
"""
raise NotImplementedError()
def release_write_lock(self):
"""Release a 'writer' lock.
Raises ``NotImplementedError`` by default, must be
implemented by subclasses.
"""
raise NotImplementedError()
class FileLock(AbstractFileLock):
"""Use lockfiles to coordinate read/write access to a file.
Only works on Unix systems, using
`fcntl.flock() <http://docs.python.org/library/fcntl.html>`_.
"""
def __init__(self, filename):
self._filedescriptor = compat.threading.local()
self.filename = filename
@util.memoized_property
def _module(self):
import fcntl
return fcntl
@property
def is_open(self):
return hasattr(self._filedescriptor, 'fileno')
def acquire_read_lock(self, wait):
return self._acquire(wait, os.O_RDONLY, self._module.LOCK_SH)
def acquire_write_lock(self, wait):
return self._acquire(wait, os.O_WRONLY, self._module.LOCK_EX)
def release_read_lock(self):
self._release()
def release_write_lock(self):
self._release()
def _acquire(self, wait, wrflag, lockflag):
wrflag |= os.O_CREAT
fileno = os.open(self.filename, wrflag)
try:
if not wait:
lockflag |= self._module.LOCK_NB
self._module.flock(fileno, lockflag)
except IOError:
os.close(fileno)
if not wait:
# this is typically
# "[Errno 35] Resource temporarily unavailable",
# because of LOCK_NB
return False
else:
raise
else:
self._filedescriptor.fileno = fileno
return True
def _release(self):
try:
fileno = self._filedescriptor.fileno
except AttributeError:
return
else:
self._module.flock(fileno, self._module.LOCK_UN)
os.close(fileno)
del self._filedescriptor.fileno

@@ -0,0 +1,364 @@
"""
Memcached Backends
------------------
Provides backends for talking to `memcached <http://memcached.org>`_.
"""
from ..api import CacheBackend, NO_VALUE
from ...util import compat
from ... import util
import random
import time
__all__ = 'GenericMemcachedBackend', 'MemcachedBackend',\
'PylibmcBackend', 'BMemcachedBackend', 'MemcachedLock'
class MemcachedLock(object):
"""Simple distributed lock using memcached.
This is an adaptation of the lock featured at
http://amix.dk/blog/post/19386
"""
def __init__(self, client_fn, key, timeout=0):
self.client_fn = client_fn
self.key = "_lock" + key
self.timeout = timeout
def acquire(self, wait=True):
client = self.client_fn()
i = 0
while True:
if client.add(self.key, 1, self.timeout):
return True
elif not wait:
return False
else:
sleep_time = (((i + 1) * random.random()) + 2 ** i) / 2.5
time.sleep(sleep_time)
if i < 15:
i += 1
def release(self):
client = self.client_fn()
client.delete(self.key)
class GenericMemcachedBackend(CacheBackend):
"""Base class for memcached backends.
This base class accepts a number of parameters
common to all backends.
:param url: the string URL to connect to. Can be a single
string or a list of strings. This is the only argument
that's required.
:param distributed_lock: boolean, when True, will use a
memcached-lock as the dogpile lock (see :class:`.MemcachedLock`).
Use this when multiple
processes will be talking to the same memcached instance.
When left at False, dogpile will coordinate on a regular
threading mutex.
:param lock_timeout: integer, number of seconds after acquiring a lock that
memcached should expire it. This argument is only valid when
``distributed_lock`` is ``True``.
.. versionadded:: 0.5.7
:param memcached_expire_time: integer, when present will
be passed as the ``time`` parameter to ``pylibmc.Client.set``.
This is used to set the memcached expiry time for a value.
.. note::
This parameter is **different** from Dogpile's own
``expiration_time``, which is the number of seconds after
which Dogpile will consider the value to be expired.
When Dogpile considers a value to be expired,
it **continues to use the value** until generation
of a new value is complete, when using
:meth:`.CacheRegion.get_or_create`.
Therefore, if you are setting ``memcached_expire_time``, you'll
want to make sure it is greater than ``expiration_time``
by at least enough seconds for new values to be generated,
else the value won't be available during a regeneration,
forcing all threads to wait for a regeneration each time
a value expires.
The :class:`.GenericMemcachedBackend` uses a ``threading.local()``
object to store individual client objects per thread,
as most modern memcached clients do not appear to be inherently
threadsafe.
In particular, ``threading.local()`` has the advantage over pylibmc's
built-in thread pool in that it automatically discards objects
associated with a particular thread when that thread ends.
"""
set_arguments = {}
"""Additional arguments which will be passed
to the :meth:`set` method."""
def __init__(self, arguments):
self._imports()
# using a plain threading.local here. threading.local
# automatically deletes the __dict__ when a thread ends,
# so the idea is that this is superior to pylibmc's
# own ThreadMappedPool which doesn't handle this
# automatically.
self.url = util.to_list(arguments['url'])
self.distributed_lock = arguments.get('distributed_lock', False)
self.lock_timeout = arguments.get('lock_timeout', 0)
self.memcached_expire_time = arguments.get(
'memcached_expire_time', 0)
def has_lock_timeout(self):
return self.lock_timeout != 0
def _imports(self):
"""client library imports go here."""
raise NotImplementedError()
def _create_client(self):
"""Creation of a Client instance goes here."""
raise NotImplementedError()
@util.memoized_property
def _clients(self):
backend = self
class ClientPool(compat.threading.local):
def __init__(self):
self.memcached = backend._create_client()
return ClientPool()
@property
def client(self):
"""Return the memcached client.
This uses a threading.local by
default as it appears most modern
memcached libs aren't inherently
threadsafe.
"""
return self._clients.memcached
def get_mutex(self, key):
if self.distributed_lock:
return MemcachedLock(lambda: self.client, key,
timeout=self.lock_timeout)
else:
return None
def get(self, key):
value = self.client.get(key)
if value is None:
return NO_VALUE
else:
return value
def get_multi(self, keys):
values = self.client.get_multi(keys)
return [
NO_VALUE if key not in values
else values[key] for key in keys
]
def set(self, key, value):
self.client.set(
key,
value,
**self.set_arguments
)
def set_multi(self, mapping):
self.client.set_multi(
mapping,
**self.set_arguments
)
def delete(self, key):
self.client.delete(key)
def delete_multi(self, keys):
self.client.delete_multi(keys)
class MemcacheArgs(object):
"""Mixin which provides support for the 'time' argument to set(),
'min_compress_len' to other methods.
"""
def __init__(self, arguments):
self.min_compress_len = arguments.get('min_compress_len', 0)
self.set_arguments = {}
if "memcached_expire_time" in arguments:
self.set_arguments["time"] = arguments["memcached_expire_time"]
if "min_compress_len" in arguments:
self.set_arguments["min_compress_len"] = \
arguments["min_compress_len"]
super(MemcacheArgs, self).__init__(arguments)
pylibmc = None
class PylibmcBackend(MemcacheArgs, GenericMemcachedBackend):
"""A backend for the
`pylibmc <http://sendapatch.se/projects/pylibmc/index.html>`_
memcached client.
A configuration illustrating several of the optional
arguments described in the pylibmc documentation::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.pylibmc',
expiration_time = 3600,
arguments = {
'url':["127.0.0.1"],
'binary':True,
'behaviors':{"tcp_nodelay": True,"ketama":True}
}
)
Arguments accepted here include those of
:class:`.GenericMemcachedBackend`, as well as
those below.
:param binary: sets the ``binary`` flag understood by
``pylibmc.Client``.
:param behaviors: a dictionary which will be passed to
``pylibmc.Client`` as the ``behaviors`` parameter.
:param min_compress_len: Integer, will be passed as the
``min_compress_len`` parameter to the ``pylibmc.Client.set``
method.
"""
def __init__(self, arguments):
self.binary = arguments.get('binary', False)
self.behaviors = arguments.get('behaviors', {})
super(PylibmcBackend, self).__init__(arguments)
def _imports(self):
global pylibmc
import pylibmc # noqa
def _create_client(self):
return pylibmc.Client(
self.url,
binary=self.binary,
behaviors=self.behaviors
)
memcache = None
class MemcachedBackend(MemcacheArgs, GenericMemcachedBackend):
"""A backend using the standard
`Python-memcached <http://www.tummy.com/Community/software/\
python-memcached/>`_
library.
Example::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.memcached',
expiration_time = 3600,
arguments = {
'url':"127.0.0.1:11211"
}
)
"""
def _imports(self):
global memcache
import memcache # noqa
def _create_client(self):
return memcache.Client(self.url)
bmemcached = None
class BMemcachedBackend(GenericMemcachedBackend):
"""A backend for the
`python-binary-memcached <https://github.com/jaysonsantos/\
python-binary-memcached>`_
memcached client.
This is a pure Python memcached client which
includes the ability to authenticate with a memcached
server using SASL.
A typical configuration using username/password::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.bmemcached',
expiration_time = 3600,
arguments = {
'url':["127.0.0.1"],
'username':'scott',
'password':'tiger'
}
)
Arguments which can be passed to the ``arguments``
dictionary include:
:param username: optional username, will be used for
SASL authentication.
:param password: optional password, will be used for
SASL authentication.
"""
def __init__(self, arguments):
self.username = arguments.get('username', None)
self.password = arguments.get('password', None)
super(BMemcachedBackend, self).__init__(arguments)
def _imports(self):
global bmemcached
import bmemcached
class RepairBMemcachedAPI(bmemcached.Client):
"""Repairs BMemcached's non-standard method
signatures, which was fixed in BMemcached
ef206ed4473fec3b639e.
"""
def add(self, key, value, timeout=0):
try:
return super(RepairBMemcachedAPI, self).add(
key, value, timeout)
except ValueError:
return False
self.Client = RepairBMemcachedAPI
def _create_client(self):
return self.Client(
self.url,
username=self.username,
password=self.password
)
def delete_multi(self, keys):
"""python-binary-memcached api does not implements delete_multi"""
for key in keys:
self.delete(key)
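
A minimal sketch of exercising one of these backends through a region, assuming a memcached server is listening on 127.0.0.1:11211 and the vendored package is importable as ``dogpile``::

    from dogpile.cache import make_region

    region = make_region().configure(
        'dogpile.cache.memcached',
        expiration_time=3600,
        arguments={'url': "127.0.0.1:11211"}
    )

    @region.cache_on_arguments()
    def load_user(user_id):
        # expensive lookup; memoized in memcached for an hour
        return {'id': user_id}

    print(load_user(1))  # computed once, then served from cache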

124
libs/dogpile2.7/cache/backends/memory.py vendored Normal file
View File

@ -0,0 +1,124 @@
"""
Memory Backends
---------------
Provides simple dictionary-based backends.
The two backends are :class:`.MemoryBackend` and :class:`.MemoryPickleBackend`;
the latter applies a serialization step to cached values while the former
places the value as given into the dictionary.
"""
from ..api import CacheBackend, NO_VALUE
from ...util.compat import pickle
class MemoryBackend(CacheBackend):
"""A backend that uses a plain dictionary.
There is no size management, and values which
are placed into the dictionary will remain
until explicitly removed. Note that
Dogpile's expiration of items is based on
timestamps and does not remove them from
the cache.
E.g.::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.memory'
)
To use a Python dictionary of your choosing,
it can be passed in with the ``cache_dict``
argument::
my_dictionary = {}
region = make_region().configure(
'dogpile.cache.memory',
arguments={
"cache_dict":my_dictionary
}
)
"""
pickle_values = False
def __init__(self, arguments):
self._cache = arguments.pop("cache_dict", {})
def get(self, key):
value = self._cache.get(key, NO_VALUE)
if value is not NO_VALUE and self.pickle_values:
value = pickle.loads(value)
return value
def get_multi(self, keys):
ret = [
self._cache.get(key, NO_VALUE)
for key in keys]
if self.pickle_values:
ret = [
pickle.loads(value)
if value is not NO_VALUE else value
for value in ret
]
return ret
def set(self, key, value):
if self.pickle_values:
value = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
self._cache[key] = value
def set_multi(self, mapping):
pickle_values = self.pickle_values
for key, value in mapping.items():
if pickle_values:
value = pickle.dumps(value, pickle.HIGHEST_PROTOCOL)
self._cache[key] = value
def delete(self, key):
self._cache.pop(key, None)
def delete_multi(self, keys):
for key in keys:
self._cache.pop(key, None)
class MemoryPickleBackend(MemoryBackend):
"""A backend that uses a plain dictionary, but serializes objects on
:meth:`.MemoryBackend.set` and deserializes :meth:`.MemoryBackend.get`.
E.g.::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.memory_pickle'
)
    Because pickle is used to serialize cached values, the object placed
    in the cache is a copy of the original given object, so any subsequent
    changes to the given object aren't reflected in the cached value; the
    backend thereby behaves the same way as other backends which make use
    of serialization.
    The serialization is performed via pickle and incurs the same
    performance hit as it does in other backends; in this way the
    performance of :class:`.MemoryPickleBackend` is somewhere in between
    that of the pure :class:`.MemoryBackend` and the remote-server-oriented
    backends such as Memcached or Redis.
Pickle behavior here is the same as that of the Redis backend, using
either ``cPickle`` or ``pickle`` and specifying ``HIGHEST_PROTOCOL``
upon serialize.
.. versionadded:: 0.5.3
"""
pickle_values = True
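
A short sketch contrasting the two backends: ``memory`` stores the given object directly, so later mutations remain visible through the cache, while ``memory_pickle`` stores a pickled copy (assuming the package imports as ``dogpile``)::

    from dogpile.cache import make_region

    plain = make_region().configure('dogpile.cache.memory')
    pickled = make_region().configure('dogpile.cache.memory_pickle')

    data = {'count': 1}
    plain.set('k', data)
    pickled.set('k', data)
    data['count'] = 2  # mutate the original after caching

    print(plain.get('k'))    # {'count': 2} - the same object was stored
    print(pickled.get('k'))  # {'count': 1} - a pickled copy was stored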

62
libs/dogpile2.7/cache/backends/null.py vendored Normal file
View File

@ -0,0 +1,62 @@
"""
Null Backend
-------------
The Null backend does not do any caching at all. It can be
used to test behavior without caching, or as a means of disabling
caching for a region that is otherwise used normally.
.. versionadded:: 0.5.4
"""
from ..api import CacheBackend, NO_VALUE
__all__ = ['NullBackend']
class NullLock(object):
def acquire(self, wait=True):
return True
def release(self):
pass
class NullBackend(CacheBackend):
"""A "null" backend that effectively disables all cache operations.
Basic usage::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.null'
)
"""
def __init__(self, arguments):
pass
def get_mutex(self, key):
return NullLock()
def get(self, key):
return NO_VALUE
def get_multi(self, keys):
return [NO_VALUE for k in keys]
def set(self, key, value):
pass
def set_multi(self, mapping):
pass
def delete(self, key):
pass
def delete_multi(self, keys):
pass
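
The effect, in a sketch: every ``set()`` is a no-op and every ``get()`` misses, which is what makes this backend useful for disabling caching in tests::

    from dogpile.cache import make_region
    from dogpile.cache.api import NO_VALUE

    region = make_region().configure('dogpile.cache.null')
    region.set('key', 'value')
    print(region.get('key') is NO_VALUE)  # True - nothing was stored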

183
libs/dogpile2.7/cache/backends/redis.py vendored Normal file
View File

@ -0,0 +1,183 @@
"""
Redis Backends
------------------
Provides backends for talking to `Redis <http://redis.io>`_.
"""
from __future__ import absolute_import
from ..api import CacheBackend, NO_VALUE
from ...util.compat import pickle, u
redis = None
__all__ = 'RedisBackend',
class RedisBackend(CacheBackend):
"""A `Redis <http://redis.io/>`_ backend, using the
`redis-py <http://pypi.python.org/pypi/redis/>`_ backend.
Example configuration::
from dogpile.cache import make_region
region = make_region().configure(
'dogpile.cache.redis',
arguments = {
'host': 'localhost',
'port': 6379,
'db': 0,
'redis_expiration_time': 60*60*2, # 2 hours
'distributed_lock': True
}
)
Arguments accepted in the arguments dictionary:
:param url: string. If provided, will override separate host/port/db
params. The format is that accepted by ``StrictRedis.from_url()``.
.. versionadded:: 0.4.1
:param host: string, default is ``localhost``.
:param password: string, default is no password.
.. versionadded:: 0.4.1
:param port: integer, default is ``6379``.
:param db: integer, default is ``0``.
:param redis_expiration_time: integer, number of seconds after setting
a value that Redis should expire it. This should be larger than dogpile's
cache expiration. By default no expiration is set.
:param distributed_lock: boolean, when True, will use a
redis-lock as the dogpile lock.
Use this when multiple
processes will be talking to the same redis instance.
When left at False, dogpile will coordinate on a regular
threading mutex.
:param lock_timeout: integer, number of seconds after acquiring a lock that
Redis should expire it. This argument is only valid when
``distributed_lock`` is ``True``.
.. versionadded:: 0.5.0
:param socket_timeout: float, seconds for socket timeout.
Default is None (no timeout).
.. versionadded:: 0.5.4
    :param lock_sleep: int or float, number of seconds to sleep when failing to
acquire a lock. This argument is only valid when
``distributed_lock`` is ``True``.
.. versionadded:: 0.5.0
:param connection_pool: ``redis.ConnectionPool`` object. If provided,
this object supersedes other connection arguments passed to the
``redis.StrictRedis`` instance, including url and/or host as well as
socket_timeout, and will be passed to ``redis.StrictRedis`` as the
source of connectivity.
.. versionadded:: 0.5.4
"""
def __init__(self, arguments):
arguments = arguments.copy()
self._imports()
self.url = arguments.pop('url', None)
self.host = arguments.pop('host', 'localhost')
self.password = arguments.pop('password', None)
self.port = arguments.pop('port', 6379)
self.db = arguments.pop('db', 0)
self.distributed_lock = arguments.get('distributed_lock', False)
self.socket_timeout = arguments.pop('socket_timeout', None)
self.lock_timeout = arguments.get('lock_timeout', None)
self.lock_sleep = arguments.get('lock_sleep', 0.1)
self.redis_expiration_time = arguments.pop('redis_expiration_time', 0)
self.connection_pool = arguments.get('connection_pool', None)
self.client = self._create_client()
def _imports(self):
# defer imports until backend is used
global redis
import redis # noqa
def _create_client(self):
if self.connection_pool is not None:
# the connection pool already has all other connection
# options present within, so here we disregard socket_timeout
# and others.
return redis.StrictRedis(connection_pool=self.connection_pool)
args = {}
if self.socket_timeout:
args['socket_timeout'] = self.socket_timeout
if self.url is not None:
args.update(url=self.url)
return redis.StrictRedis.from_url(**args)
else:
args.update(
host=self.host, password=self.password,
port=self.port, db=self.db
)
return redis.StrictRedis(**args)
def get_mutex(self, key):
if self.distributed_lock:
return self.client.lock(u('_lock{0}').format(key),
self.lock_timeout, self.lock_sleep)
else:
return None
def get(self, key):
value = self.client.get(key)
if value is None:
return NO_VALUE
return pickle.loads(value)
def get_multi(self, keys):
if not keys:
return []
values = self.client.mget(keys)
return [
pickle.loads(v) if v is not None else NO_VALUE
for v in values]
def set(self, key, value):
if self.redis_expiration_time:
self.client.setex(key, self.redis_expiration_time,
pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
else:
self.client.set(key, pickle.dumps(value, pickle.HIGHEST_PROTOCOL))
def set_multi(self, mapping):
mapping = dict(
(k, pickle.dumps(v, pickle.HIGHEST_PROTOCOL))
for k, v in mapping.items()
)
if not self.redis_expiration_time:
self.client.mset(mapping)
else:
pipe = self.client.pipeline()
for key, value in mapping.items():
pipe.setex(key, self.redis_expiration_time, value)
pipe.execute()
def delete(self, key):
self.client.delete(key)
def delete_multi(self, keys):
self.client.delete(*keys)
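
A minimal sketch tying the options above together, assuming a Redis server is reachable on localhost::

    from dogpile.cache import make_region

    region = make_region().configure(
        'dogpile.cache.redis',
        expiration_time=60,
        arguments={
            'host': 'localhost',
            'port': 6379,
            'db': 0,
            'redis_expiration_time': 120,   # longer than dogpile's expiration
            'distributed_lock': True
        }
    )

    def create_value():
        return 'expensive result'

    print(region.get_or_create('my_key', create_value))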

25
libs/dogpile2.7/cache/exception.py vendored Normal file
View File

@ -0,0 +1,25 @@
"""Exception classes for dogpile.cache."""
class DogpileCacheException(Exception):
"""Base Exception for dogpile.cache exceptions to inherit from."""
class RegionAlreadyConfigured(DogpileCacheException):
"""CacheRegion instance is already configured."""
class RegionNotConfigured(DogpileCacheException):
"""CacheRegion instance has not been configured."""
class ValidationError(DogpileCacheException):
"""Error validating a value or option."""
class PluginNotFound(DogpileCacheException):
"""The specified plugin could not be found.
.. versionadded:: 0.6.4
"""

View File

@ -0,0 +1,90 @@
"""
Mako Integration
----------------
dogpile.cache includes a `Mako <http://www.makotemplates.org>`_ plugin
that replaces `Beaker <http://beaker.groovie.org>`_
as the cache backend.
Setup a Mako template lookup using the "dogpile.cache" cache implementation
and a region dictionary::
from dogpile.cache import make_region
from mako.lookup import TemplateLookup
my_regions = {
"local":make_region().configure(
"dogpile.cache.dbm",
expiration_time=360,
arguments={"filename":"file.dbm"}
),
"memcached":make_region().configure(
"dogpile.cache.pylibmc",
expiration_time=3600,
arguments={"url":["127.0.0.1"]}
)
}
mako_lookup = TemplateLookup(
directories=["/myapp/templates"],
cache_impl="dogpile.cache",
cache_args={
'regions':my_regions
}
)
To use the above configuration in a template, use the ``cached=True``
argument on any Mako tag which accepts it, in conjunction with the
name of the desired region as the ``cache_region`` argument::
<%def name="mysection()" cached="True" cache_region="memcached">
some content that's cached
</%def>
"""
from mako.cache import CacheImpl
class MakoPlugin(CacheImpl):
"""A Mako ``CacheImpl`` which talks to dogpile.cache."""
def __init__(self, cache):
super(MakoPlugin, self).__init__(cache)
try:
self.regions = self.cache.template.cache_args['regions']
except KeyError:
raise KeyError(
"'cache_regions' argument is required on the "
"Mako Lookup or Template object for usage "
"with the dogpile.cache plugin.")
def _get_region(self, **kw):
try:
region = kw['region']
except KeyError:
raise KeyError(
"'cache_region' argument must be specified with 'cache=True'"
"within templates for usage with the dogpile.cache plugin.")
try:
return self.regions[region]
except KeyError:
raise KeyError("No such region '%s'" % region)
def get_and_replace(self, key, creation_function, **kw):
expiration_time = kw.pop("timeout", None)
return self._get_region(**kw).get_or_create(
key, creation_function,
expiration_time=expiration_time)
def get_or_create(self, key, creation_function, **kw):
return self.get_and_replace(key, creation_function, **kw)
def put(self, key, value, **kw):
self._get_region(**kw).put(key, value)
def get(self, key, **kw):
expiration_time = kw.pop("timeout", None)
return self._get_region(**kw).get(key, expiration_time=expiration_time)
def invalidate(self, key, **kw):
self._get_region(**kw).delete(key)

95
libs/dogpile2.7/cache/proxy.py vendored Normal file
View File

@ -0,0 +1,95 @@
"""
Proxy Backends
------------------
Provides a utility and a decorator class that allow for modifying the behavior
of different backends without altering the class itself or having to extend the
base backend.
.. versionadded:: 0.5.0 Added support for the :class:`.ProxyBackend` class.
"""
from .api import CacheBackend
class ProxyBackend(CacheBackend):
"""A decorator class for altering the functionality of backends.
Basic usage::
from dogpile.cache import make_region
from dogpile.cache.proxy import ProxyBackend
class MyFirstProxy(ProxyBackend):
def get(self, key):
# ... custom code goes here ...
return self.proxied.get(key)
def set(self, key, value):
# ... custom code goes here ...
                self.proxied.set(key, value)
class MySecondProxy(ProxyBackend):
def get(self, key):
# ... custom code goes here ...
return self.proxied.get(key)
region = make_region().configure(
'dogpile.cache.dbm',
expiration_time = 3600,
arguments = {
"filename":"/path/to/cachefile.dbm"
},
wrap = [ MyFirstProxy, MySecondProxy ]
)
Classes that extend :class:`.ProxyBackend` can be stacked
together. The ``.proxied`` property will always
point to either the concrete backend instance or
the next proxy in the chain that a method can be
delegated towards.
.. versionadded:: 0.5.0
"""
def __init__(self, *args, **kwargs):
self.proxied = None
def wrap(self, backend):
        ''' Take a backend as an argument and set up the ``self.proxied``
        property. Return an object that can be used as a backend by a
        :class:`.CacheRegion` object.
'''
assert(
isinstance(backend, CacheBackend) or
isinstance(backend, ProxyBackend))
self.proxied = backend
return self
#
    # Delegate any functions that are not already overridden to
    # the proxied backend
#
def get(self, key):
return self.proxied.get(key)
def set(self, key, value):
self.proxied.set(key, value)
def delete(self, key):
self.proxied.delete(key)
def get_multi(self, keys):
return self.proxied.get_multi(keys)
def set_multi(self, mapping):
self.proxied.set_multi(mapping)
def delete_multi(self, keys):
self.proxied.delete_multi(keys)
def get_mutex(self, key):
return self.proxied.get_mutex(key)
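
As an illustration of the stacking described above, a hypothetical ``CountingProxy`` (the name and counters are invented for this sketch) that tallies hits and misses while delegating to ``self.proxied``::

    from dogpile.cache import make_region
    from dogpile.cache.api import NO_VALUE
    from dogpile.cache.proxy import ProxyBackend

    class CountingProxy(ProxyBackend):
        hits = misses = 0

        def get(self, key):
            value = self.proxied.get(key)
            if value is NO_VALUE:
                CountingProxy.misses += 1
            else:
                CountingProxy.hits += 1
            return value

    region = make_region().configure(
        'dogpile.cache.memory',
        wrap=[CountingProxy]
    )
    region.get('missing')
    region.set('present', 1)
    region.get('present')
    print("%d hit(s), %d miss(es)" % (CountingProxy.hits, CountingProxy.misses))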

1498
libs/dogpile2.7/cache/region.py vendored Normal file

File diff suppressed because it is too large Load Diff

146
libs/dogpile2.7/cache/util.py vendored Normal file
View File

@ -0,0 +1,146 @@
from hashlib import sha1
import inspect
from ..util import compat
from ..util import langhelpers
def function_key_generator(namespace, fn, to_str=compat.string_type):
"""Return a function that generates a string
key, based on a given function as well as
arguments to the returned function itself.
This is used by :meth:`.CacheRegion.cache_on_arguments`
to generate a cache key from a decorated function.
An alternate function may be used by specifying
the :paramref:`.CacheRegion.function_key_generator` argument
for :class:`.CacheRegion`.
.. seealso::
:func:`.kwarg_function_key_generator` - similar function that also
takes keyword arguments into account
"""
if namespace is None:
namespace = '%s:%s' % (fn.__module__, fn.__name__)
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_key(*args, **kw):
if kw:
raise ValueError(
"dogpile.cache's default key creation "
"function does not accept keyword arguments.")
if has_self:
args = args[1:]
return namespace + "|" + " ".join(map(to_str, args))
return generate_key
def function_multi_key_generator(namespace, fn, to_str=compat.string_type):
if namespace is None:
namespace = '%s:%s' % (fn.__module__, fn.__name__)
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
args = inspect.getargspec(fn)
has_self = args[0] and args[0][0] in ('self', 'cls')
def generate_keys(*args, **kw):
if kw:
raise ValueError(
"dogpile.cache's default key creation "
"function does not accept keyword arguments.")
if has_self:
args = args[1:]
return [namespace + "|" + key for key in map(to_str, args)]
return generate_keys
def kwarg_function_key_generator(namespace, fn, to_str=compat.string_type):
"""Return a function that generates a string
key, based on a given function as well as
arguments to the returned function itself.
For kwargs passed in, we will build a dict of
all argname (key) argvalue (values) including
default args from the argspec and then
alphabetize the list before generating the
key.
.. versionadded:: 0.6.2
.. seealso::
:func:`.function_key_generator` - default key generation function
"""
if namespace is None:
namespace = '%s:%s' % (fn.__module__, fn.__name__)
else:
namespace = '%s:%s|%s' % (fn.__module__, fn.__name__, namespace)
argspec = inspect.getargspec(fn)
default_list = list(argspec.defaults or [])
# Reverse the list, as we want to compare the argspec by negative index,
# meaning default_list[0] should be args[-1], which works well with
# enumerate()
default_list.reverse()
# use idx*-1 to create the correct right-lookup index.
args_with_defaults = dict((argspec.args[(idx*-1)], default)
for idx, default in enumerate(default_list, 1))
if argspec.args and argspec.args[0] in ('self', 'cls'):
arg_index_start = 1
else:
arg_index_start = 0
def generate_key(*args, **kwargs):
as_kwargs = dict(
[(argspec.args[idx], arg)
for idx, arg in enumerate(args[arg_index_start:],
arg_index_start)])
as_kwargs.update(kwargs)
for arg, val in args_with_defaults.items():
if arg not in as_kwargs:
as_kwargs[arg] = val
argument_values = [as_kwargs[key]
for key in sorted(as_kwargs.keys())]
return namespace + '|' + " ".join(map(to_str, argument_values))
return generate_key
def sha1_mangle_key(key):
"""a SHA1 key mangler."""
return sha1(key).hexdigest()
def length_conditional_mangler(length, mangler):
"""a key mangler that mangles if the length of the key is
past a certain threshold.
"""
def mangle(key):
if len(key) >= length:
return mangler(key)
else:
return key
return mangle
# in the 0.6 release these functions were moved to the dogpile.util namespace.
# They are linked here to maintain compatibility with older versions.
coerce_string_conf = langhelpers.coerce_string_conf
KeyReentrantMutex = langhelpers.KeyReentrantMutex
memoized_property = langhelpers.memoized_property
PluginLoader = langhelpers.PluginLoader
to_list = langhelpers.to_list
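
A brief sketch of the key shapes these generators produce; the ``__main__`` module name below is simply what a function defined in a script reports::

    from dogpile.cache.util import function_key_generator, kwarg_function_key_generator

    def lookup(a, b=2):
        return a + b

    keygen = function_key_generator(None, lookup)
    print(keygen(1, 2))       # '__main__:lookup|1 2'

    kw_keygen = kwarg_function_key_generator(None, lookup)
    print(kw_keygen(1, b=5))  # '__main__:lookup|1 5'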

17
libs/dogpile2.7/core.py Normal file
View File

@ -0,0 +1,17 @@
"""Compatibility namespace for those using dogpile.core.
As of dogpile.cache 0.6.0, dogpile.core as a separate package
is no longer used by dogpile.cache.
Note that this namespace will not take effect if an actual
dogpile.core installation is present.
"""
from .util import nameregistry # noqa
from .util import readwrite_lock # noqa
from .util.readwrite_lock import ReadWriteMutex # noqa
from .util.nameregistry import NameRegistry # noqa
from .lock import Lock # noqa
from .lock import NeedRegenerationException # noqa
from . import __version__ # noqa

158
libs/dogpile2.7/lock.py Normal file
View File

@ -0,0 +1,158 @@
import time
import logging
log = logging.getLogger(__name__)
class NeedRegenerationException(Exception):
"""An exception that when raised in the 'with' block,
forces the 'has_value' flag to False and incurs a
regeneration of the value.
"""
NOT_REGENERATED = object()
class Lock(object):
"""Dogpile lock class.
Provides an interface around an arbitrary mutex
that allows one thread/process to be elected as
the creator of a new value, while other threads/processes
continue to return the previous version
of that value.
:param mutex: A mutex object that provides ``acquire()``
and ``release()`` methods.
:param creator: Callable which returns a tuple of the form
(new_value, creation_time). "new_value" should be a newly
generated value representing completed state. "creation_time"
should be a floating point time value which is relative
to Python's ``time.time()`` call, representing the time
at which the value was created. This time value should
be associated with the created value.
:param value_and_created_fn: Callable which returns
a tuple of the form (existing_value, creation_time). This
basically should return what the last local call to the ``creator()``
callable has returned, i.e. the value and the creation time,
which would be assumed here to be from a cache. If the
value is not available, the :class:`.NeedRegenerationException`
exception should be thrown.
:param expiretime: Expiration time in seconds. Set to
``None`` for never expires. This timestamp is compared
to the creation_time result and ``time.time()`` to determine if
the value returned by value_and_created_fn is "expired".
:param async_creator: A callable. If specified, this callable will be
passed the mutex as an argument and is responsible for releasing the mutex
after it finishes some asynchronous value creation. The intent is for
this to be used to defer invocation of the creator callable until some
later time.
"""
def __init__(
self,
mutex,
creator,
value_and_created_fn,
expiretime,
async_creator=None,
):
self.mutex = mutex
self.creator = creator
self.value_and_created_fn = value_and_created_fn
self.expiretime = expiretime
self.async_creator = async_creator
def _is_expired(self, createdtime):
"""Return true if the expiration time is reached, or no
value is available."""
return not self._has_value(createdtime) or \
(
self.expiretime is not None and
time.time() - createdtime > self.expiretime
)
def _has_value(self, createdtime):
"""Return true if the creation function has proceeded
at least once."""
return createdtime > 0
def _enter(self):
value_fn = self.value_and_created_fn
try:
value = value_fn()
value, createdtime = value
except NeedRegenerationException:
log.debug("NeedRegenerationException")
value = NOT_REGENERATED
createdtime = -1
generated = self._enter_create(createdtime)
if generated is not NOT_REGENERATED:
generated, createdtime = generated
return generated
elif value is NOT_REGENERATED:
try:
value, createdtime = value_fn()
return value
except NeedRegenerationException:
raise Exception("Generation function should "
"have just been called by a concurrent "
"thread.")
else:
return value
def _enter_create(self, createdtime):
if not self._is_expired(createdtime):
return NOT_REGENERATED
async = False
if self._has_value(createdtime):
if not self.mutex.acquire(False):
log.debug("creation function in progress "
"elsewhere, returning")
return NOT_REGENERATED
else:
log.debug("no value, waiting for create lock")
self.mutex.acquire()
try:
log.debug("value creation lock %r acquired" % self.mutex)
# see if someone created the value already
try:
value, createdtime = self.value_and_created_fn()
except NeedRegenerationException:
pass
else:
if not self._is_expired(createdtime):
log.debug("value already present")
return value, createdtime
elif self.async_creator:
log.debug("Passing creation lock to async runner")
self.async_creator(self.mutex)
async = True
return value, createdtime
log.debug("Calling creation function")
created = self.creator()
return created
finally:
if not async:
self.mutex.release()
log.debug("Released creation lock")
def __enter__(self):
return self._enter()
def __exit__(self, type, value, traceback):
pass
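
To make the protocol concrete, a single-threaded sketch that drives :class:`.Lock` with a plain dictionary standing in for a cache (the names ``_store``, ``get_value`` and ``create_value`` are invented for the example)::

    import time
    import threading
    from dogpile.lock import Lock, NeedRegenerationException

    _store = {}  # stands in for a cache backend

    def get_value():
        # return what the last create_value() call produced, or signal "no value"
        if 'value' not in _store:
            raise NeedRegenerationException()
        return _store['value']

    def create_value():
        _store['value'] = ('fresh', time.time())
        return _store['value']

    with Lock(threading.Lock(), create_value, get_value, expiretime=10) as value:
        print(value)  # 'fresh' - this block was elected to create the value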

View File

@ -0,0 +1,4 @@
from .nameregistry import NameRegistry # noqa
from .readwrite_lock import ReadWriteMutex # noqa
from .langhelpers import PluginLoader, memoized_property, \
coerce_string_conf, to_list, KeyReentrantMutex # noqa

View File

@ -0,0 +1,65 @@
import sys
py2k = sys.version_info < (3, 0)
py3k = sys.version_info >= (3, 0)
py32 = sys.version_info >= (3, 2)
py27 = sys.version_info >= (2, 7)
jython = sys.platform.startswith('java')
win32 = sys.platform.startswith('win')
try:
import threading
except ImportError:
import dummy_threading as threading # noqa
if py3k: # pragma: no cover
string_types = str,
text_type = str
string_type = str
if py32:
callable = callable
else:
def callable(fn):
return hasattr(fn, '__call__')
def u(s):
return s
def ue(s):
return s
import configparser
import io
import _thread as thread
else:
string_types = basestring,
text_type = unicode
string_type = str
def u(s):
return unicode(s, "utf-8")
def ue(s):
return unicode(s, "unicode_escape")
import ConfigParser as configparser # noqa
import StringIO as io # noqa
callable = callable # noqa
import thread # noqa
if py3k or jython:
import pickle
else:
import cPickle as pickle # noqa
def timedelta_total_seconds(td):
if py27:
return td.total_seconds()
else:
return (td.microseconds + (
td.seconds + td.days * 24 * 3600) * 1e6) / 1e6

View File

@ -0,0 +1,123 @@
import re
import collections
from . import compat
def coerce_string_conf(d):
result = {}
for k, v in d.items():
if not isinstance(v, compat.string_types):
result[k] = v
continue
v = v.strip()
if re.match(r'^[-+]?\d+$', v):
result[k] = int(v)
elif re.match(r'^[-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?$', v):
result[k] = float(v)
elif v.lower() in ('false', 'true'):
result[k] = v.lower() == 'true'
elif v == 'None':
result[k] = None
else:
result[k] = v
return result
class PluginLoader(object):
def __init__(self, group):
self.group = group
self.impls = {}
def load(self, name):
if name in self.impls:
return self.impls[name]()
else: # pragma NO COVERAGE
import pkg_resources
for impl in pkg_resources.iter_entry_points(
self.group, name):
self.impls[name] = impl.load
return impl.load()
else:
raise self.NotFound(
"Can't load plugin %s %s" % (self.group, name)
)
def register(self, name, modulepath, objname):
def load():
mod = __import__(modulepath, fromlist=[objname])
return getattr(mod, objname)
self.impls[name] = load
class NotFound(Exception):
"""The specified plugin could not be found."""
class memoized_property(object):
"""A read-only @property that is only evaluated once."""
def __init__(self, fget, doc=None):
self.fget = fget
self.__doc__ = doc or fget.__doc__
self.__name__ = fget.__name__
def __get__(self, obj, cls):
if obj is None:
return self
obj.__dict__[self.__name__] = result = self.fget(obj)
return result
def to_list(x, default=None):
"""Coerce to a list."""
if x is None:
return default
if not isinstance(x, (list, tuple)):
return [x]
else:
return x
class KeyReentrantMutex(object):
def __init__(self, key, mutex, keys):
self.key = key
self.mutex = mutex
self.keys = keys
@classmethod
def factory(cls, mutex):
# this collection holds zero or one
# thread idents as the key; a set of
# keynames held as the value.
keystore = collections.defaultdict(set)
def fac(key):
return KeyReentrantMutex(key, mutex, keystore)
return fac
def acquire(self, wait=True):
current_thread = compat.threading.current_thread().ident
keys = self.keys.get(current_thread)
if keys is not None and \
self.key not in keys:
# current lockholder, new key. add it in
keys.add(self.key)
return True
elif self.mutex.acquire(wait=wait):
# after acquire, create new set and add our key
self.keys[current_thread].add(self.key)
return True
else:
return False
def release(self):
current_thread = compat.threading.current_thread().ident
keys = self.keys.get(current_thread)
assert keys is not None, "this thread didn't do the acquire"
assert self.key in keys, "No acquire held for key '%s'" % self.key
keys.remove(self.key)
if not keys:
# when list of keys empty, remove
# the thread ident and unlock.
del self.keys[current_thread]
self.mutex.release()
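
``memoized_property`` computes on first access and then shadows itself with a plain instance attribute; a small sketch::

    from dogpile.util import memoized_property

    class Dataset(object):
        @memoized_property
        def rows(self):
            print('loading rows...')
            return [1, 2, 3]

    d = Dataset()
    d.rows  # prints 'loading rows...' and stores the list in d.__dict__
    d.rows  # served from the instance dict; fget is not called again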

View File

@ -0,0 +1,84 @@
from .compat import threading
import weakref
class NameRegistry(object):
"""Generates and return an object, keeping it as a
singleton for a certain identifier for as long as its
strongly referenced.
e.g.::
class MyFoo(object):
"some important object."
def __init__(self, identifier):
self.identifier = identifier
registry = NameRegistry(MyFoo)
# thread 1:
my_foo = registry.get("foo1")
# thread 2
my_foo = registry.get("foo1")
Above, ``my_foo`` in both thread #1 and #2 will
be *the same object*. The constructor for
``MyFoo`` will be called once, passing the
identifier ``foo1`` as the argument.
When thread 1 and thread 2 both complete or
otherwise delete references to ``my_foo``, the
object is *removed* from the :class:`.NameRegistry` as
a result of Python garbage collection.
:param creator: A function that will create a new
value, given the identifier passed to the :meth:`.NameRegistry.get`
method.
"""
_locks = weakref.WeakValueDictionary()
_mutex = threading.RLock()
def __init__(self, creator):
"""Create a new :class:`.NameRegistry`.
"""
self._values = weakref.WeakValueDictionary()
self._mutex = threading.RLock()
self.creator = creator
def get(self, identifier, *args, **kw):
"""Get and possibly create the value.
:param identifier: Hash key for the value.
If the creation function is called, this identifier
will also be passed to the creation function.
:param \*args, \**kw: Additional arguments which will
also be passed to the creation function if it is
called.
"""
try:
if identifier in self._values:
return self._values[identifier]
else:
return self._sync_get(identifier, *args, **kw)
except KeyError:
return self._sync_get(identifier, *args, **kw)
def _sync_get(self, identifier, *args, **kw):
self._mutex.acquire()
try:
try:
if identifier in self._values:
return self._values[identifier]
else:
self._values[identifier] = value = self.creator(identifier, *args, **kw)
return value
except KeyError:
self._values[identifier] = value = self.creator(identifier, *args, **kw)
return value
finally:
self._mutex.release()
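
A runnable variant of the docstring example above (``Connection`` is an invented stand-in for the "important object")::

    from dogpile.util.nameregistry import NameRegistry

    class Connection(object):
        def __init__(self, identifier):
            self.identifier = identifier

    registry = NameRegistry(Connection)
    c1 = registry.get('db1')
    c2 = registry.get('db1')
    print(c1 is c2)  # True, for as long as a strong reference is held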

View File

@ -0,0 +1,132 @@
from .compat import threading
import logging
log = logging.getLogger(__name__)
class LockError(Exception):
pass
class ReadWriteMutex(object):
"""A mutex which allows multiple readers, single writer.
:class:`.ReadWriteMutex` uses a Python ``threading.Condition``
to provide this functionality across threads within a process.
The Beaker package also contained a file-lock based version
of this concept, so that readers/writers could be synchronized
across processes with a common filesystem. A future Dogpile
release may include this additional class at some point.
"""
def __init__(self):
# counts how many asynchronous methods are executing
self.async = 0
# pointer to thread that is the current sync operation
self.current_sync_operation = None
# condition object to lock on
self.condition = threading.Condition(threading.Lock())
def acquire_read_lock(self, wait = True):
"""Acquire the 'read' lock."""
self.condition.acquire()
try:
# see if a synchronous operation is waiting to start
# or is already running, in which case we wait (or just
# give up and return)
if wait:
while self.current_sync_operation is not None:
self.condition.wait()
else:
if self.current_sync_operation is not None:
return False
self.async += 1
log.debug("%s acquired read lock", self)
finally:
self.condition.release()
if not wait:
return True
def release_read_lock(self):
"""Release the 'read' lock."""
self.condition.acquire()
try:
self.async -= 1
# check if we are the last asynchronous reader thread
# out the door.
if self.async == 0:
# yes. so if a sync operation is waiting, notifyAll to wake
# it up
if self.current_sync_operation is not None:
self.condition.notifyAll()
elif self.async < 0:
raise LockError("Synchronizer error - too many "
"release_read_locks called")
log.debug("%s released read lock", self)
finally:
self.condition.release()
def acquire_write_lock(self, wait = True):
"""Acquire the 'write' lock."""
self.condition.acquire()
try:
# here, we are not a synchronous reader, and after returning,
# assuming waiting or immediate availability, we will be.
if wait:
# if another sync is working, wait
while self.current_sync_operation is not None:
self.condition.wait()
else:
# if another sync is working,
# we dont want to wait, so forget it
if self.current_sync_operation is not None:
return False
# establish ourselves as the current sync
# this indicates to other read/write operations
# that they should wait until this is None again
self.current_sync_operation = threading.currentThread()
# now wait again for asyncs to finish
if self.async > 0:
if wait:
# wait
self.condition.wait()
else:
# we dont want to wait, so forget it
self.current_sync_operation = None
return False
log.debug("%s acquired write lock", self)
finally:
self.condition.release()
if not wait:
return True
def release_write_lock(self):
"""Release the 'write' lock."""
self.condition.acquire()
try:
if self.current_sync_operation is not threading.currentThread():
raise LockError("Synchronizer error - current thread doesn't "
"have the write lock")
# reset the current sync operation so
# another can get it
self.current_sync_operation = None
# tell everyone to get ready
self.condition.notifyAll()
log.debug("%s released write lock", self)
finally:
# everyone go !!
self.condition.release()
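
A short single-threaded sketch of the semantics (note this vendored copy uses ``async`` as an attribute name, so it targets the Python 2 interpreter this tree is vendored for)::

    from dogpile.util.readwrite_lock import ReadWriteMutex

    mutex = ReadWriteMutex()
    mutex.acquire_read_lock()
    # a reader is active, so a non-blocking write attempt fails
    print(mutex.acquire_write_lock(wait=False))  # False
    mutex.release_read_lock()
    print(mutex.acquire_write_lock(wait=False))  # True
    mutex.release_write_lock()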

127
libs/soupsieve/__init__.py Normal file
View File

@ -0,0 +1,127 @@
"""
Soup Sieve.
A CSS selector filter for BeautifulSoup4.
MIT License
Copyright (c) 2018 Isaac Muse
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
from __future__ import unicode_literals
from .__meta__ import __version__, __version_info__ # noqa: F401
from . import css_parser as cp
from . import css_match as cm
from . import css_types as ct
from .util import DEBUG, deprecated, SelectorSyntaxError # noqa: F401
__all__ = (
'DEBUG', 'SelectorSyntaxError', 'SoupSieve',
'closest', 'comments', 'compile', 'filter', 'icomments',
'iselect', 'match', 'select', 'select_one'
)
SoupSieve = cm.SoupSieve
def compile(pattern, namespaces=None, flags=0, **kwargs): # noqa: A001
"""Compile CSS pattern."""
if namespaces is not None:
namespaces = ct.Namespaces(**namespaces)
custom = kwargs.get('custom')
if custom is not None:
custom = ct.CustomSelectors(**custom)
if isinstance(pattern, SoupSieve):
if flags:
raise ValueError("Cannot process 'flags' argument on a compiled selector list")
elif namespaces is not None:
raise ValueError("Cannot process 'namespaces' argument on a compiled selector list")
elif custom is not None:
raise ValueError("Cannot process 'custom' argument on a compiled selector list")
return pattern
return cp._cached_css_compile(pattern, namespaces, custom, flags)
def purge():
"""Purge cached patterns."""
cp._purge_cache()
def closest(select, tag, namespaces=None, flags=0, **kwargs):
"""Match closest ancestor."""
return compile(select, namespaces, flags, **kwargs).closest(tag)
def match(select, tag, namespaces=None, flags=0, **kwargs):
"""Match node."""
return compile(select, namespaces, flags, **kwargs).match(tag)
def filter(select, iterable, namespaces=None, flags=0, **kwargs): # noqa: A001
"""Filter list of nodes."""
return compile(select, namespaces, flags, **kwargs).filter(iterable)
@deprecated("'comments' is not related to CSS selectors and will be removed in the future.")
def comments(tag, limit=0, flags=0, **kwargs):
"""Get comments only."""
return [comment for comment in cm.CommentsMatch(tag).get_comments(limit)]
@deprecated("'icomments' is not related to CSS selectors and will be removed in the future.")
def icomments(tag, limit=0, flags=0, **kwargs):
"""Iterate comments only."""
for comment in cm.CommentsMatch(tag).get_comments(limit):
yield comment
def select_one(select, tag, namespaces=None, flags=0, **kwargs):
"""Select a single tag."""
return compile(select, namespaces, flags, **kwargs).select_one(tag)
def select(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
"""Select the specified tags."""
return compile(select, namespaces, flags, **kwargs).select(tag, limit)
def iselect(select, tag, namespaces=None, limit=0, flags=0, **kwargs):
"""Iterate the specified tags."""
for el in compile(select, namespaces, flags, **kwargs).iselect(tag, limit):
yield el
def escape(ident):
"""Escape identifier."""
return cp.escape(ident)
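
A minimal sketch of the public API against a BeautifulSoup document (requires ``beautifulsoup4``)::

    from bs4 import BeautifulSoup
    import soupsieve as sv

    soup = BeautifulSoup('<div><p class="a">one</p><p>two</p></div>', 'html.parser')
    print(sv.select('p.a', soup))       # [<p class="a">one</p>]
    print(sv.match('div > p', soup.p))  # True
    print(sv.escape('a.b'))             # a\.b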

190
libs/soupsieve/__meta__.py Normal file
View File

@ -0,0 +1,190 @@
"""Meta related things."""
from __future__ import unicode_literals
from collections import namedtuple
import re
RE_VER = re.compile(
r'''(?x)
(?P<major>\d+)(?:\.(?P<minor>\d+))?(?:\.(?P<micro>\d+))?
(?:(?P<type>a|b|rc)(?P<pre>\d+))?
(?:\.post(?P<post>\d+))?
(?:\.dev(?P<dev>\d+))?
'''
)
REL_MAP = {
".dev": "",
".dev-alpha": "a",
".dev-beta": "b",
".dev-candidate": "rc",
"alpha": "a",
"beta": "b",
"candidate": "rc",
"final": ""
}
DEV_STATUS = {
".dev": "2 - Pre-Alpha",
".dev-alpha": "2 - Pre-Alpha",
".dev-beta": "2 - Pre-Alpha",
".dev-candidate": "2 - Pre-Alpha",
"alpha": "3 - Alpha",
"beta": "4 - Beta",
"candidate": "4 - Beta",
"final": "5 - Production/Stable"
}
PRE_REL_MAP = {"a": 'alpha', "b": 'beta', "rc": 'candidate'}
class Version(namedtuple("Version", ["major", "minor", "micro", "release", "pre", "post", "dev"])):
"""
Get the version (PEP 440).
A biased approach to the PEP 440 semantic version.
Provides a tuple structure which is sorted for comparisons `v1 > v2` etc.
(major, minor, micro, release type, pre-release build, post-release build, development release build)
    Release types are named in such a way that they are easily comparable.
    Provides accessors to check whether the build is a development, pre-release,
    or post-release build, as well as an accessor to get the development status
    for setup files.
How it works (currently):
- You must specify a release type as either `final`, `alpha`, `beta`, or `candidate`.
- To define a development release, you can use either `.dev`, `.dev-alpha`, `.dev-beta`, or `.dev-candidate`.
The dot is used to ensure all development specifiers are sorted before `alpha`.
You can specify a `dev` number for development builds, but do not have to as implicit development releases
are allowed.
- You must specify a `pre` value greater than zero if using a prerelease as this project (not PEP 440) does not
allow implicit prereleases.
- You can optionally set `post` to a value greater than zero to make the build a post release. While post releases
are technically allowed in prereleases, it is strongly discouraged, so we are rejecting them. It should be
noted that we do not allow `post0` even though PEP 440 does not restrict this. This project specifically
does not allow implicit post releases.
- It should be noted that we do not support epochs `1!` or local versions `+some-custom.version-1`.
Acceptable version releases:
```
Version(1, 0, 0, "final") 1.0
Version(1, 2, 0, "final") 1.2
Version(1, 2, 3, "final") 1.2.3
Version(1, 2, 0, ".dev-alpha", pre=4) 1.2a4
Version(1, 2, 0, ".dev-beta", pre=4) 1.2b4
Version(1, 2, 0, ".dev-candidate", pre=4) 1.2rc4
Version(1, 2, 0, "final", post=1) 1.2.post1
Version(1, 2, 3, ".dev") 1.2.3.dev0
Version(1, 2, 3, ".dev", dev=1) 1.2.3.dev1
```
"""
def __new__(cls, major, minor, micro, release="final", pre=0, post=0, dev=0):
"""Validate version info."""
# Ensure all parts are positive integers.
for value in (major, minor, micro, pre, post):
if not (isinstance(value, int) and value >= 0):
raise ValueError("All version parts except 'release' should be integers.")
if release not in REL_MAP:
raise ValueError("'{}' is not a valid release type.".format(release))
# Ensure valid pre-release (we do not allow implicit pre-releases).
if ".dev-candidate" < release < "final":
if pre == 0:
raise ValueError("Implicit pre-releases not allowed.")
elif dev:
raise ValueError("Version is not a development release.")
elif post:
raise ValueError("Post-releases are not allowed with pre-releases.")
# Ensure valid development or development/pre release
elif release < "alpha":
if release > ".dev" and pre == 0:
raise ValueError("Implicit pre-release not allowed.")
elif post:
raise ValueError("Post-releases are not allowed with pre-releases.")
# Ensure a valid normal release
else:
if pre:
raise ValueError("Version is not a pre-release.")
elif dev:
raise ValueError("Version is not a development release.")
return super(Version, cls).__new__(cls, major, minor, micro, release, pre, post, dev)
def _is_pre(self):
"""Is prerelease."""
return self.pre > 0
def _is_dev(self):
"""Is development."""
return bool(self.release < "alpha")
def _is_post(self):
"""Is post."""
return self.post > 0
def _get_dev_status(self): # pragma: no cover
"""Get development status string."""
return DEV_STATUS[self.release]
def _get_canonical(self):
"""Get the canonical output string."""
        # Assemble major, minor, micro version and append `pre`, `post`, or `dev` if needed.
if self.micro == 0:
ver = "{}.{}".format(self.major, self.minor)
else:
ver = "{}.{}.{}".format(self.major, self.minor, self.micro)
if self._is_pre():
ver += '{}{}'.format(REL_MAP[self.release], self.pre)
if self._is_post():
ver += ".post{}".format(self.post)
if self._is_dev():
ver += ".dev{}".format(self.dev)
return ver
def parse_version(ver, pre=False):
"""Parse version into a comparable Version tuple."""
m = RE_VER.match(ver)
# Handle major, minor, micro
major = int(m.group('major'))
minor = int(m.group('minor')) if m.group('minor') else 0
micro = int(m.group('micro')) if m.group('micro') else 0
# Handle pre releases
if m.group('type'):
release = PRE_REL_MAP[m.group('type')]
pre = int(m.group('pre'))
else:
release = "final"
pre = 0
# Handle development releases
    if m.group('dev'):
        dev = int(m.group('dev'))
        release = '.dev-' + release if pre else '.dev'
    else:
        dev = 0
# Handle post
post = int(m.group('post')) if m.group('post') else 0
return Version(major, minor, micro, release, pre, post, dev)
__version_info__ = Version(1, 9, 3, "final")
__version__ = __version_info__._get_canonical()
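
A sketch of round-tripping through the helpers above::

    from soupsieve.__meta__ import Version, __version_info__, parse_version

    print(Version(1, 9, 3, 'final')._get_canonical())        # '1.9.3'
    print(Version(1, 2, 0, 'beta', pre=4)._get_canonical())  # '1.2b4'
    print(parse_version('1.9.3') == __version_info__)        # True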

1472
libs/soupsieve/css_match.py Normal file

File diff suppressed because it is too large Load Diff

1218
libs/soupsieve/css_parser.py Normal file

File diff suppressed because it is too large Load Diff

344
libs/soupsieve/css_types.py Normal file
View File

@ -0,0 +1,344 @@
"""CSS selector structure items."""
from __future__ import unicode_literals
from . import util
__all__ = (
'Selector',
'SelectorNull',
'SelectorTag',
'SelectorAttribute',
'SelectorContains',
'SelectorNth',
'SelectorLang',
'SelectorList',
'Namespaces',
'CustomSelectors'
)
SEL_EMPTY = 0x1
SEL_ROOT = 0x2
SEL_DEFAULT = 0x4
SEL_INDETERMINATE = 0x8
SEL_SCOPE = 0x10
SEL_DIR_LTR = 0x20
SEL_DIR_RTL = 0x40
SEL_IN_RANGE = 0x80
SEL_OUT_OF_RANGE = 0x100
SEL_DEFINED = 0x200
class Immutable(object):
"""Immutable."""
__slots__ = ('_hash',)
def __init__(self, **kwargs):
"""Initialize."""
temp = []
for k, v in kwargs.items():
temp.append(type(v))
temp.append(v)
super(Immutable, self).__setattr__(k, v)
super(Immutable, self).__setattr__('_hash', hash(tuple(temp)))
@classmethod
def __base__(cls):
"""Get base class."""
return cls
def __eq__(self, other):
"""Equal."""
return (
isinstance(other, self.__base__()) and
all([getattr(other, key) == getattr(self, key) for key in self.__slots__ if key != '_hash'])
)
def __ne__(self, other):
"""Equal."""
return (
not isinstance(other, self.__base__()) or
any([getattr(other, key) != getattr(self, key) for key in self.__slots__ if key != '_hash'])
)
def __hash__(self):
"""Hash."""
return self._hash
def __setattr__(self, name, value):
"""Prevent mutability."""
raise AttributeError("'{}' is immutable".format(self.__class__.__name__))
def __repr__(self): # pragma: no cover
"""Representation."""
return "{}({})".format(
self.__base__(), ', '.join(["{}={!r}".format(k, getattr(self, k)) for k in self.__slots__[:-1]])
)
__str__ = __repr__
class ImmutableDict(util.Mapping):
"""Hashable, immutable dictionary."""
def __init__(self, *args, **kwargs):
"""Initialize."""
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if (
is_dict and not all([isinstance(v, util.Hashable) for v in arg.values()]) or
not is_dict and not all([isinstance(k, util.Hashable) and isinstance(v, util.Hashable) for k, v in arg])
):
raise TypeError('All values must be hashable')
self._d = dict(*args, **kwargs)
self._hash = hash(tuple([(type(x), x, type(y), y) for x, y in sorted(self._d.items())]))
def __iter__(self):
"""Iterator."""
return iter(self._d)
def __len__(self):
"""Length."""
return len(self._d)
def __getitem__(self, key):
"""Get item: `namespace['key']`."""
return self._d[key]
def __hash__(self):
"""Hash."""
return self._hash
def __repr__(self): # pragma: no cover
"""Representation."""
return "{!r}".format(self._d)
__str__ = __repr__
class Namespaces(ImmutableDict):
"""Namespaces."""
def __init__(self, *args, **kwargs):
"""Initialize."""
# If there are arguments, check the first index.
# `super` should fail if the user gave multiple arguments,
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
raise TypeError('Namespace keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
raise TypeError('Namespace keys and values must be Unicode strings')
super(Namespaces, self).__init__(*args, **kwargs)
class CustomSelectors(ImmutableDict):
"""Custom selectors."""
def __init__(self, *args, **kwargs):
"""Initialize."""
# If there are arguments, check the first index.
# `super` should fail if the user gave multiple arguments,
# so don't bother checking that.
arg = args[0] if args else kwargs
is_dict = isinstance(arg, dict)
if is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg.items()]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
elif not is_dict and not all([isinstance(k, util.string) and isinstance(v, util.string) for k, v in arg]):
raise TypeError('CustomSelectors keys and values must be Unicode strings')
super(CustomSelectors, self).__init__(*args, **kwargs)
class Selector(Immutable):
"""Selector."""
__slots__ = (
'tag', 'ids', 'classes', 'attributes', 'nth', 'selectors',
'relation', 'rel_type', 'contains', 'lang', 'flags', '_hash'
)
def __init__(
self, tag, ids, classes, attributes, nth, selectors,
relation, rel_type, contains, lang, flags
):
"""Initialize."""
super(Selector, self).__init__(
tag=tag,
ids=ids,
classes=classes,
attributes=attributes,
nth=nth,
selectors=selectors,
relation=relation,
rel_type=rel_type,
contains=contains,
lang=lang,
flags=flags
)
class SelectorNull(Immutable):
"""Null Selector."""
def __init__(self):
"""Initialize."""
super(SelectorNull, self).__init__()
class SelectorTag(Immutable):
"""Selector tag."""
__slots__ = ("name", "prefix", "_hash")
def __init__(self, name, prefix):
"""Initialize."""
super(SelectorTag, self).__init__(
name=name,
prefix=prefix
)
class SelectorAttribute(Immutable):
"""Selector attribute rule."""
__slots__ = ("attribute", "prefix", "pattern", "xml_type_pattern", "_hash")
def __init__(self, attribute, prefix, pattern, xml_type_pattern):
"""Initialize."""
super(SelectorAttribute, self).__init__(
attribute=attribute,
prefix=prefix,
pattern=pattern,
xml_type_pattern=xml_type_pattern
)
class SelectorContains(Immutable):
"""Selector contains rule."""
__slots__ = ("text", "_hash")
def __init__(self, text):
"""Initialize."""
super(SelectorContains, self).__init__(
text=text
)
class SelectorNth(Immutable):
"""Selector nth type."""
__slots__ = ("a", "n", "b", "of_type", "last", "selectors", "_hash")
def __init__(self, a, n, b, of_type, last, selectors):
"""Initialize."""
super(SelectorNth, self).__init__(
a=a,
n=n,
b=b,
of_type=of_type,
last=last,
selectors=selectors
)
class SelectorLang(Immutable):
"""Selector language rules."""
__slots__ = ("languages", "_hash",)
def __init__(self, languages):
"""Initialize."""
super(SelectorLang, self).__init__(
languages=tuple(languages)
)
def __iter__(self):
"""Iterator."""
return iter(self.languages)
def __len__(self): # pragma: no cover
"""Length."""
return len(self.languages)
def __getitem__(self, index): # pragma: no cover
"""Get item."""
return self.languages[index]
class SelectorList(Immutable):
"""Selector list."""
__slots__ = ("selectors", "is_not", "is_html", "_hash")
def __init__(self, selectors=tuple(), is_not=False, is_html=False):
"""Initialize."""
super(SelectorList, self).__init__(
selectors=tuple(selectors),
is_not=is_not,
is_html=is_html
)
def __iter__(self):
"""Iterator."""
return iter(self.selectors)
def __len__(self):
"""Length."""
return len(self.selectors)
def __getitem__(self, index):
"""Get item."""
return self.selectors[index]
def _pickle(p):
return p.__base__(), tuple([getattr(p, s) for s in p.__slots__[:-1]])
def pickle_register(obj):
"""Allow object to be pickled."""
util.copyreg.pickle(obj, _pickle)
pickle_register(Selector)
pickle_register(SelectorNull)
pickle_register(SelectorTag)
pickle_register(SelectorAttribute)
pickle_register(SelectorContains)
pickle_register(SelectorNth)
pickle_register(SelectorLang)
pickle_register(SelectorList)

170
libs/soupsieve/util.py Normal file
View File

@ -0,0 +1,170 @@
"""Utility."""
from __future__ import unicode_literals
from functools import wraps
import warnings
import sys
import struct
import os
import re
MODULE = os.path.dirname(__file__)
PY3 = sys.version_info >= (3, 0)
PY35 = sys.version_info >= (3, 5)
PY37 = sys.version_info >= (3, 7)
if PY3:
from functools import lru_cache # noqa F401
import copyreg # noqa F401
from collections.abc import Hashable, Mapping # noqa F401
ustr = str
bstr = bytes
unichar = chr
string = str
else:
from backports.functools_lru_cache import lru_cache # noqa F401
import copy_reg as copyreg # noqa F401
from collections import Hashable, Mapping # noqa F401
ustr = unicode # noqa: F821
bstr = str
unichar = unichr # noqa: F821
string = basestring # noqa: F821
DEBUG = 0x00001
RE_PATTERN_LINE_SPLIT = re.compile(r'(?:\r\n|(?!\r\n)[\n\r])|$')
LC_A = ord('a')
LC_Z = ord('z')
UC_A = ord('A')
UC_Z = ord('Z')
def lower(string):
"""Lower."""
new_string = []
for c in string:
o = ord(c)
new_string.append(chr(o + 32) if UC_A <= o <= UC_Z else c)
return ''.join(new_string)
def upper(string): # pragma: no cover
"""Lower."""
new_string = []
for c in string:
o = ord(c)
new_string.append(chr(o - 32) if LC_A <= o <= LC_Z else c)
return ''.join(new_string)
def uchr(i):
"""Allow getting Unicode character on narrow python builds."""
try:
return unichar(i)
except ValueError: # pragma: no cover
return struct.pack('i', i).decode('utf-32')
def uord(c):
"""Get Unicode ordinal."""
if len(c) == 2: # pragma: no cover
high, low = [ord(p) for p in c]
ordinal = (high - 0xD800) * 0x400 + low - 0xDC00 + 0x10000
else:
ordinal = ord(c)
return ordinal
class SelectorSyntaxError(SyntaxError):
"""Syntax error in a CSS selector."""
def __init__(self, msg, pattern=None, index=None):
"""Initialize."""
self.line = None
self.col = None
self.context = None
if pattern is not None and index is not None:
# Format pattern to show line and column position
self.context, self.line, self.col = get_pattern_context(pattern, index)
msg = '{}\n line {}:\n{}'.format(msg, self.line, self.context)
super(SelectorSyntaxError, self).__init__(msg)
def deprecated(message, stacklevel=2): # pragma: no cover
"""
Raise a `DeprecationWarning` when wrapped function/method is called.
Borrowed from https://stackoverflow.com/a/48632082/866026
"""
def _decorator(func):
@wraps(func)
def _func(*args, **kwargs):
warnings.warn(
"'{}' is deprecated. {}".format(func.__name__, message),
category=DeprecationWarning,
stacklevel=stacklevel
)
return func(*args, **kwargs)
return _func
return _decorator
def warn_deprecated(message, stacklevel=2): # pragma: no cover
"""Warn deprecated."""
warnings.warn(
message,
category=DeprecationWarning,
stacklevel=stacklevel
)
def get_pattern_context(pattern, index):
"""Get the pattern context."""
last = 0
current_line = 1
col = 1
text = []
line = 1
# Split pattern by newline and handle the text before the newline
for m in RE_PATTERN_LINE_SPLIT.finditer(pattern):
linetext = pattern[last:m.start(0)]
if not len(m.group(0)) and not len(text):
indent = ''
offset = -1
col = index - last + 1
elif last <= index < m.end(0):
indent = '--> '
offset = (-1 if index > m.start(0) else 0) + 3
col = index - last + 1
else:
indent = ' '
offset = None
if len(text):
# Regardless of whether we are presented with `\r\n`, `\r`, or `\n`,
# we will render the output with just `\n`. We will still log the column
# correctly though.
text.append('\n')
text.append('{}{}'.format(indent, linetext))
if offset is not None:
text.append('\n')
text.append(' ' * (col + offset) + '^')
line = current_line
current_line += 1
last = m.end(0)
return ''.join(text), line, col
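
``get_pattern_context`` is what builds the annotated snippet shown in :class:`SelectorSyntaxError` messages; a sketch of its output for an error at index 6 of a two-line pattern::

    from soupsieve.util import get_pattern_context

    context, line, col = get_pattern_context('div >\n<span', 6)
    print('line %d, col %d' % (line, col))  # line 2, col 1
    print(context)
    # prints, with an arrow on the offending line and a caret at the column:
    #     div >
    # --> <span
    #     ^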

View File

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
from datetime import time
class Component(object):
"""Base class for cue text.
:param list components: sub-components of this one.
"""
tag_name = 'Component'
def __init__(self, components=None):
if components is None:
self.components = []
elif isinstance(components, list):
self.components = components
else:
self.components = [components]
def __iter__(self):
return iter(self.components)
def __len__(self):
return len(self.components)
def __str__(self):
return ''.join(str(c) for c in self.components)
def __repr__(self):
return '<{name}>{components}</{name}>'.format(name=self.tag_name,
components=''.join(repr(c) for c in self.components))
class Bold(Component):
"""Bold :class:`Component`."""
tag_name = 'b'
class Italic(Component):
"""Italic :class:`Component`."""
tag_name = 'i'
class Underline(Component):
"""Underline :class:`Component`."""
tag_name = 'u'
class Strikethrough(Component):
"""Strikethrough :class:`Component`."""
tag_name = 's'
class Font(Component):
"""Font :class:`Component`."""
tag_name = 'font'
def __init__(self, color, *args, **kwargs):
super(Font, self).__init__(*args, **kwargs)
self.color = color
def __repr__(self):
return '<{name} "{color}">{components}</{name}>'.format(name=self.tag_name, color=self.color,
components=''.join(repr(c) for c in self.components))
class Cue(object):
"""A single subtitle cue with timings and components.
:param datetime.time start_time: start time.
:param datetime.time end_time: end time.
:param list components: cue components.
"""
def __init__(self, start_time, end_time, components):
self.start_time = start_time
self.end_time = end_time
self.components = components
def __repr__(self):
return '<Cue [{start_time}->{end_time}] "{text}">'.format(start_time=self.start_time, end_time=self.end_time,
text=''.join(repr(c) for c in self.components))
if __name__ == '__main__':
cue = Cue(time(), time(1), [Bold('Hello')])
print repr(cue)

View File

@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
import re
from datetime import time
from subliminal.subtitles import Cue
index_re = re.compile(r'(?P<index>\d+)')
timing_re = re.compile(r'(?P<hour>\d{2}):(?P<minute>\d{2}):(?P<second>\d{2}),(?P<milliseconds>\d{3})')
class SubripReadError(Exception):
pass
class SubripReadIndexError(SubripReadError):
pass
class SubripReader(object):
INDEX = 1
TIMINGS = 2
TEXT = 3
def __init__(self):
self.state = self.INDEX
def read(self, content):
pass
def read_line(self, line):
if self.state == self.INDEX:
            if not index_re.match(line):
                raise SubripReadIndexError
def read_cue(stream):
"""Attempt to parse a complete Cue from the stream"""
# skip blank lines
line = ''
while not line:
line = stream.readline()
# parse index
if not index_re.match(line):
raise SubripReadIndexError
# parse timings
line = stream.readline()
if '-->' not in line:
raise SubripReadError
timings = line.split('-->')
    if len(timings) != 2:
        raise SubripReadError
    # parse start time
    match = timing_re.match(timings[0].strip())
    if not match:
        raise SubripReadError
    start_time = time(int(match.group('hour')), int(match.group('minute')),
                      int(match.group('second')), int(match.group('milliseconds')) * 1000)
    # parse end time
    match = timing_re.match(timings[1].strip())
    if not match:
        raise SubripReadError
    end_time = time(int(match.group('hour')), int(match.group('minute')),
                    int(match.group('second')), int(match.group('milliseconds')) * 1000)
    # text parsing is not implemented yet; return a cue with empty components
    return Cue(start_time, end_time, [])
class SubripSubtitle(object):
def __init__(self):
self.cues = []
if __name__ == '__main__':
    from io import StringIO
    # minimal smoke test with a single well-formed cue
    cue = read_cue(StringIO(u'1\n00:00:00,000 --> 00:00:02,500\nHello\n'))
    print repr(cue)

View File

@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
__title__ = 'subliminal'
__version__ = '2.1.0.dev'
__short_version__ = '.'.join(__version__.split('.')[:2])
__author__ = 'Antoine Bertin'
__license__ = 'MIT'
__copyright__ = 'Copyright 2016, Antoine Bertin'
import logging
from .core import (AsyncProviderPool, ProviderPool, check_video, download_best_subtitles, download_subtitles,
list_subtitles, refine, save_subtitles, scan_video, scan_videos)
from .cache import region
from .exceptions import Error, ProviderError
from .extensions import provider_manager, refiner_manager
from .providers import Provider
from .score import compute_score, get_scores
from .subtitle import SUBTITLE_EXTENSIONS, Subtitle
from .video import VIDEO_EXTENSIONS, Episode, Movie, Video
logging.getLogger(__name__).addHandler(logging.NullHandler())

View File

@ -0,0 +1,16 @@
# -*- coding: utf-8 -*-
import datetime
from dogpile.cache import make_region
#: Expiration time for show caching
SHOW_EXPIRATION_TIME = datetime.timedelta(weeks=3).total_seconds()
#: Expiration time for episode caching
EPISODE_EXPIRATION_TIME = datetime.timedelta(days=3).total_seconds()
#: Expiration time for scraper searches
REFINER_EXPIRATION_TIME = datetime.timedelta(weeks=1).total_seconds()
region = make_region()
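# Usage sketch (assumption: the backend name is one of dogpile.cache's
# built-ins); the application configures the region once at startup, e.g.:
# region.configure('dogpile.cache.memory')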

458
libs/subliminal2.7/cli.py Normal file
View File

@ -0,0 +1,458 @@
# -*- coding: utf-8 -*-
"""
Subliminal uses `click <http://click.pocoo.org>`_ to provide a powerful :abbr:`CLI (command-line interface)`.
"""
from __future__ import division
from collections import defaultdict
from datetime import timedelta
import glob
import json
import logging
import os
import re
from appdirs import AppDirs
from babelfish import Error as BabelfishError, Language
import click
from dogpile.cache.backends.file import AbstractFileLock
from dogpile.util.readwrite_lock import ReadWriteMutex
from six.moves import configparser
from subliminal import (AsyncProviderPool, Episode, Movie, Video, __version__, check_video, compute_score, get_scores,
provider_manager, refine, refiner_manager, region, save_subtitles, scan_video, scan_videos)
from subliminal.core import ARCHIVE_EXTENSIONS, search_external_subtitles
logger = logging.getLogger(__name__)
class MutexLock(AbstractFileLock):
""":class:`MutexLock` is a thread-based rw lock based on :class:`dogpile.util.readwrite_lock.ReadWriteMutex`."""
def __init__(self, filename):
self.mutex = ReadWriteMutex()
def acquire_read_lock(self, wait):
ret = self.mutex.acquire_read_lock(wait)
return wait or ret
def acquire_write_lock(self, wait):
ret = self.mutex.acquire_write_lock(wait)
return wait or ret
def release_read_lock(self):
return self.mutex.release_read_lock()
def release_write_lock(self):
return self.mutex.release_write_lock()
class Config(object):
"""A :class:`~configparser.ConfigParser` wrapper to store configuration.
Interaction with the configuration is done with the properties.
:param str path: path to the configuration file.
"""
def __init__(self, path):
#: Path to the configuration file
self.path = path
#: The underlying configuration object
self.config = configparser.SafeConfigParser()
self.config.add_section('general')
self.config.set('general', 'languages', json.dumps(['en']))
self.config.set('general', 'providers', json.dumps(sorted([p.name for p in provider_manager])))
self.config.set('general', 'refiners', json.dumps(sorted([r.name for r in refiner_manager])))
self.config.set('general', 'single', str(0))
self.config.set('general', 'embedded_subtitles', str(1))
self.config.set('general', 'age', str(int(timedelta(weeks=2).total_seconds())))
self.config.set('general', 'hearing_impaired', str(1))
self.config.set('general', 'min_score', str(0))
def read(self):
"""Read the configuration from :attr:`path`"""
self.config.read(self.path)
def write(self):
"""Write the configuration to :attr:`path`"""
with open(self.path, 'w') as f:
self.config.write(f)
@property
def languages(self):
return {Language.fromietf(l) for l in json.loads(self.config.get('general', 'languages'))}
@languages.setter
def languages(self, value):
self.config.set('general', 'languages', json.dumps(sorted([str(l) for l in value])))
@property
def providers(self):
return json.loads(self.config.get('general', 'providers'))
@providers.setter
def providers(self, value):
self.config.set('general', 'providers', json.dumps(sorted([p.lower() for p in value])))
@property
def refiners(self):
return json.loads(self.config.get('general', 'refiners'))
@refiners.setter
def refiners(self, value):
self.config.set('general', 'refiners', json.dumps([r.lower() for r in value]))
@property
def single(self):
return self.config.getboolean('general', 'single')
@single.setter
def single(self, value):
self.config.set('general', 'single', str(int(value)))
@property
def embedded_subtitles(self):
return self.config.getboolean('general', 'embedded_subtitles')
@embedded_subtitles.setter
def embedded_subtitles(self, value):
self.config.set('general', 'embedded_subtitles', str(int(value)))
@property
def age(self):
return timedelta(seconds=self.config.getint('general', 'age'))
@age.setter
def age(self, value):
self.config.set('general', 'age', str(int(value.total_seconds())))
@property
def hearing_impaired(self):
return self.config.getboolean('general', 'hearing_impaired')
@hearing_impaired.setter
def hearing_impaired(self, value):
self.config.set('general', 'hearing_impaired', str(int(value)))
@property
def min_score(self):
return self.config.getfloat('general', 'min_score')
@min_score.setter
def min_score(self, value):
self.config.set('general', 'min_score', str(value))
@property
def provider_configs(self):
rv = {}
for provider in provider_manager:
if self.config.has_section(provider.name):
rv[provider.name] = {k: v for k, v in self.config.items(provider.name)}
return rv
@provider_configs.setter
def provider_configs(self, value):
# loop over provider configurations
for provider, config in value.items():
# create the corresponding section if necessary
if not self.config.has_section(provider):
self.config.add_section(provider)
# add config options
for k, v in config.items():
self.config.set(provider, k, v)
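# Usage sketch (the path is hypothetical): defaults are populated in
# __init__, so a Config can be customized and written out before any file exists.
# c = Config('/tmp/subliminal-config.ini')
# c.single = True
# c.min_score = 50
# c.write()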
class LanguageParamType(click.ParamType):
""":class:`~click.ParamType` for languages that returns a :class:`~babelfish.language.Language`"""
name = 'language'
def convert(self, value, param, ctx):
try:
return Language.fromietf(value)
except BabelfishError:
self.fail('%s is not a valid language' % value)
LANGUAGE = LanguageParamType()
class AgeParamType(click.ParamType):
""":class:`~click.ParamType` for age strings that returns a :class:`~datetime.timedelta`
An age string is in the form `number + identifier` with possible identifiers:
* ``w`` for weeks
* ``d`` for days
* ``h`` for hours
The form can be specified multiple times but only with that identifier ordering. For example:
* ``1w2d4h`` for 1 week, 2 days and 4 hours
* ``2w`` for 2 weeks
* ``3w6h`` for 3 weeks and 6 hours
"""
name = 'age'
def convert(self, value, param, ctx):
match = re.match(r'^(?:(?P<weeks>\d+?)w)?(?:(?P<days>\d+?)d)?(?:(?P<hours>\d+?)h)?$', value)
if not match:
self.fail('%s is not a valid age' % value)
return timedelta(**{k: int(v) for k, v in match.groupdict(0).items()})
AGE = AgeParamType()
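# Illustrative example (outside of an actual click invocation):
# AGE.convert('1w2d4h', None, None) -> timedelta of 9 days and 4 hours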
PROVIDER = click.Choice(sorted(provider_manager.names()))
REFINER = click.Choice(sorted(refiner_manager.names()))
dirs = AppDirs('subliminal')
cache_file = 'subliminal.dbm'
config_file = 'config.ini'
@click.group(context_settings={'max_content_width': 100}, epilog='Suggestions and bug reports are greatly appreciated: '
'https://github.com/Diaoul/subliminal/')
@click.option('--addic7ed', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='Addic7ed configuration.')
@click.option('--legendastv', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD', help='LegendasTV configuration.')
@click.option('--opensubtitles', type=click.STRING, nargs=2, metavar='USERNAME PASSWORD',
help='OpenSubtitles configuration.')
@click.option('--cache-dir', type=click.Path(writable=True, file_okay=False), default=dirs.user_cache_dir,
show_default=True, expose_value=True, help='Path to the cache directory.')
@click.option('--debug', is_flag=True, help='Print useful information for debugging subliminal and for reporting bugs.')
@click.version_option(__version__)
@click.pass_context
def subliminal(ctx, addic7ed, legendastv, opensubtitles, cache_dir, debug):
"""Subtitles, faster than your thoughts."""
# create cache directory
try:
os.makedirs(cache_dir)
except OSError:
if not os.path.isdir(cache_dir):
raise
# configure cache
region.configure('dogpile.cache.dbm', expiration_time=timedelta(days=30),
arguments={'filename': os.path.join(cache_dir, cache_file), 'lock_factory': MutexLock})
# configure logging
if debug:
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter(logging.BASIC_FORMAT))
logging.getLogger('subliminal').addHandler(handler)
logging.getLogger('subliminal').setLevel(logging.DEBUG)
# provider configs
ctx.obj = {'provider_configs': {}}
if addic7ed:
ctx.obj['provider_configs']['addic7ed'] = {'username': addic7ed[0], 'password': addic7ed[1]}
if legendastv:
ctx.obj['provider_configs']['legendastv'] = {'username': legendastv[0], 'password': legendastv[1]}
if opensubtitles:
ctx.obj['provider_configs']['opensubtitles'] = {'username': opensubtitles[0], 'password': opensubtitles[1]}
@subliminal.command()
@click.option('--clear-subliminal', is_flag=True, help='Clear subliminal\'s cache. Use this ONLY if your cache is '
'corrupted or if you experience issues.')
@click.pass_context
def cache(ctx, clear_subliminal):
"""Cache management."""
if clear_subliminal:
for file in glob.glob(os.path.join(ctx.parent.params['cache_dir'], cache_file) + '*'):
os.remove(file)
click.echo('Subliminal\'s cache cleared.')
else:
click.echo('Nothing done.')
@subliminal.command()
@click.option('-l', '--language', type=LANGUAGE, required=True, multiple=True, help='Language as IETF code, '
'e.g. en, pt-BR (can be used multiple times).')
@click.option('-p', '--provider', type=PROVIDER, multiple=True, help='Provider to use (can be used multiple times).')
@click.option('-r', '--refiner', type=REFINER, multiple=True, help='Refiner to use (can be used multiple times).')
@click.option('-a', '--age', type=AGE, help='Filter videos newer than AGE, e.g. 12h, 1w2d.')
@click.option('-d', '--directory', type=click.STRING, metavar='DIR', help='Directory where to save subtitles, '
'default is next to the video file.')
@click.option('-e', '--encoding', type=click.STRING, metavar='ENC', help='Subtitle file encoding, default is to '
'preserve original encoding.')
@click.option('-s', '--single', is_flag=True, default=False, help='Save subtitle without language code in the file '
'name, i.e. use .srt extension. Do not use this unless your media player requires it.')
@click.option('-f', '--force', is_flag=True, default=False, help='Force download even if a subtitle already exists.')
@click.option('-hi', '--hearing-impaired', is_flag=True, default=False, help='Prefer hearing impaired subtitles.')
@click.option('-m', '--min-score', type=click.IntRange(0, 100), default=0, help='Minimum score for a subtitle '
'to be downloaded (0 to 100).')
@click.option('-w', '--max-workers', type=click.IntRange(1, 50), default=None, help='Maximum number of threads to use.')
@click.option('-z/-Z', '--archives/--no-archives', default=True, show_default=True, help='Scan archives for videos '
'(supported extensions: %s).' % ', '.join(ARCHIVE_EXTENSIONS))
@click.option('-v', '--verbose', count=True, help='Increase verbosity.')
@click.argument('path', type=click.Path(), required=True, nargs=-1)
@click.pass_obj
def download(obj, provider, refiner, language, age, directory, encoding, single, force, hearing_impaired, min_score,
max_workers, archives, verbose, path):
"""Download best subtitles.
PATH can be a directory containing videos, a video file path or a video file name. It can be used multiple times.
If an existing subtitle is detected (external or embedded) in the correct language, the download is skipped for
the associated video.
"""
# process parameters
language = set(language)
# scan videos
videos = []
ignored_videos = []
errored_paths = []
with click.progressbar(path, label='Collecting videos', item_show_func=lambda p: p or '') as bar:
for p in bar:
logger.debug('Collecting path %s', p)
# non-existing
if not os.path.exists(p):
try:
video = Video.fromname(p)
except:
logger.exception('Unexpected error while collecting non-existing path %s', p)
errored_paths.append(p)
continue
if not force:
video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values())
refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
videos.append(video)
continue
# directories
if os.path.isdir(p):
try:
scanned_videos = scan_videos(p, age=age, archives=archives)
except:
logger.exception('Unexpected error while collecting directory path %s', p)
errored_paths.append(p)
continue
for video in scanned_videos:
if not force:
video.subtitle_languages |= set(search_external_subtitles(video.name,
directory=directory).values())
if check_video(video, languages=language, age=age, undefined=single):
refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
videos.append(video)
else:
ignored_videos.append(video)
continue
# other inputs
try:
video = scan_video(p)
except:
logger.exception('Unexpected error while collecting path %s', p)
errored_paths.append(p)
continue
if not force:
video.subtitle_languages |= set(search_external_subtitles(video.name, directory=directory).values())
if check_video(video, languages=language, age=age, undefined=single):
refine(video, episode_refiners=refiner, movie_refiners=refiner, embedded_subtitles=not force)
videos.append(video)
else:
ignored_videos.append(video)
# output errored paths
if verbose > 0:
for p in errored_paths:
click.secho('%s errored' % p, fg='red')
# output ignored videos
if verbose > 1:
for video in ignored_videos:
click.secho('%s ignored - subtitles: %s / age: %d day%s' % (
os.path.split(video.name)[1],
', '.join(str(s) for s in video.subtitle_languages) or 'none',
video.age.days,
's' if video.age.days > 1 else ''
), fg='yellow')
# report collected videos
click.echo('%s video%s collected / %s video%s ignored / %s error%s' % (
click.style(str(len(videos)), bold=True, fg='green' if videos else None),
's' if len(videos) > 1 else '',
click.style(str(len(ignored_videos)), bold=True, fg='yellow' if ignored_videos else None),
's' if len(ignored_videos) > 1 else '',
click.style(str(len(errored_paths)), bold=True, fg='red' if errored_paths else None),
's' if len(errored_paths) > 1 else '',
))
# exit if no video collected
if not videos:
return
# download best subtitles
downloaded_subtitles = defaultdict(list)
with AsyncProviderPool(max_workers=max_workers, providers=provider, provider_configs=obj['provider_configs']) as p:
with click.progressbar(videos, label='Downloading subtitles',
item_show_func=lambda v: os.path.split(v.name)[1] if v is not None else '') as bar:
for v in bar:
scores = get_scores(v)
subtitles = p.download_best_subtitles(p.list_subtitles(v, language - v.subtitle_languages),
v, language, min_score=scores['hash'] * min_score / 100,
hearing_impaired=hearing_impaired, only_one=single)
downloaded_subtitles[v] = subtitles
if p.discarded_providers:
click.secho('Some providers have been discarded due to unexpected errors: %s' %
', '.join(p.discarded_providers), fg='yellow')
# save subtitles
total_subtitles = 0
for v, subtitles in downloaded_subtitles.items():
saved_subtitles = save_subtitles(v, subtitles, single=single, directory=directory, encoding=encoding)
total_subtitles += len(saved_subtitles)
if verbose > 0:
click.echo('%s subtitle%s downloaded for %s' % (click.style(str(len(saved_subtitles)), bold=True),
's' if len(saved_subtitles) > 1 else '',
os.path.split(v.name)[1]))
if verbose > 1:
for s in saved_subtitles:
matches = s.get_matches(v)
score = compute_score(s, v)
# score color
score_color = None
scores = get_scores(v)
if isinstance(v, Movie):
if score < scores['title']:
score_color = 'red'
elif score < scores['title'] + scores['year'] + scores['release_group']:
score_color = 'yellow'
else:
score_color = 'green'
elif isinstance(v, Episode):
if score < scores['series'] + scores['season'] + scores['episode']:
score_color = 'red'
elif score < scores['series'] + scores['season'] + scores['episode'] + scores['release_group']:
score_color = 'yellow'
else:
score_color = 'green'
# scale score from 0 to 100 taking out preferences
scaled_score = score
if s.hearing_impaired == hearing_impaired:
scaled_score -= scores['hearing_impaired']
scaled_score *= 100 / scores['hash']
# echo some nice colored output
click.echo(' - [{score}] {language} subtitle from {provider_name} (match on {matches})'.format(
score=click.style('{:5.1f}'.format(scaled_score), fg=score_color, bold=score >= scores['hash']),
language=s.language.name if s.language.country is None else '%s (%s)' % (s.language.name,
s.language.country.name),
provider_name=s.provider_name,
matches=', '.join(sorted(matches, key=scores.get, reverse=True))
))
if verbose == 0:
click.echo('Downloaded %s subtitle%s' % (click.style(str(total_subtitles), bold=True),
's' if total_subtitles > 1 else ''))
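# Example invocation (illustrative paths and options):
#   subliminal --debug download -l en -l pt-BR -m 80 /path/to/videos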

View File

@ -0,0 +1,32 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter, language_converters
class Addic7edConverter(LanguageReverseConverter):
def __init__(self):
self.name_converter = language_converters['name']
self.from_addic7ed = {u'Català': ('cat',), 'Chinese (Simplified)': ('zho',), 'Chinese (Traditional)': ('zho',),
'Euskera': ('eus',), 'Galego': ('glg',), 'Greek': ('ell',), 'Malay': ('msa',),
'Portuguese (Brazilian)': ('por', 'BR'), 'Serbian (Cyrillic)': ('srp', None, 'Cyrl'),
'Serbian (Latin)': ('srp',), 'Spanish (Latin America)': ('spa',),
'Spanish (Spain)': ('spa',)}
self.to_addic7ed = {('cat',): 'Català', ('zho',): 'Chinese (Simplified)', ('eus',): 'Euskera',
('glg',): 'Galego', ('ell',): 'Greek', ('msa',): 'Malay',
('por', 'BR'): 'Portuguese (Brazilian)', ('srp', None, 'Cyrl'): 'Serbian (Cyrillic)'}
self.codes = self.name_converter.codes | set(self.from_addic7ed.keys())
def convert(self, alpha3, country=None, script=None):
if (alpha3, country, script) in self.to_addic7ed:
return self.to_addic7ed[(alpha3, country, script)]
if (alpha3, country) in self.to_addic7ed:
return self.to_addic7ed[(alpha3, country)]
if (alpha3,) in self.to_addic7ed:
return self.to_addic7ed[(alpha3,)]
return self.name_converter.convert(alpha3, country, script)
def reverse(self, addic7ed):
if addic7ed in self.from_addic7ed:
return self.from_addic7ed[addic7ed]
return self.name_converter.reverse(addic7ed)
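# Round-trip sketch (illustrative): special cases hit the custom tables,
# anything else falls back to the name converter.
# c = Addic7edConverter()
# c.convert('por', 'BR') -> 'Portuguese (Brazilian)'
# c.reverse('Euskera') -> ('eus',)
# c.convert('fra') -> 'French' (via the name converter)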

View File

@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter
from ..exceptions import ConfigurationError
class LegendasTVConverter(LanguageReverseConverter):
def __init__(self):
self.from_legendastv = {1: ('por', 'BR'), 2: ('eng',), 3: ('spa',), 4: ('fra',), 5: ('deu',), 6: ('jpn',),
7: ('dan',), 8: ('nor',), 9: ('swe',), 10: ('por',), 11: ('ara',), 12: ('ces',),
13: ('zho',), 14: ('kor',), 15: ('bul',), 16: ('ita',), 17: ('pol',)}
self.to_legendastv = {v: k for k, v in self.from_legendastv.items()}
self.codes = set(self.from_legendastv.keys())
def convert(self, alpha3, country=None, script=None):
if (alpha3, country) in self.to_legendastv:
return self.to_legendastv[(alpha3, country)]
if (alpha3,) in self.to_legendastv:
return self.to_legendastv[(alpha3,)]
raise ConfigurationError('Unsupported language code for legendastv: %s, %s, %s' % (alpha3, country, script))
def reverse(self, legendastv):
if legendastv in self.from_legendastv:
return self.from_legendastv[legendastv]
raise ConfigurationError('Unsupported language number for legendastv: %s' % legendastv)
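# Sketch (illustrative): numeric codes map both ways, unsupported languages
# raise ConfigurationError.
# c = LegendasTVConverter()
# c.convert('por', 'BR') -> 1
# c.reverse(2) -> ('eng',)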

View File

@ -0,0 +1,23 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter
from ..exceptions import ConfigurationError
class ShooterConverter(LanguageReverseConverter):
def __init__(self):
self.from_shooter = {'chn': ('zho',), 'eng': ('eng',)}
self.to_shooter = {v: k for k, v in self.from_shooter.items()}
self.codes = set(self.from_shooter.keys())
def convert(self, alpha3, country=None, script=None):
if (alpha3,) in self.to_shooter:
return self.to_shooter[(alpha3,)]
raise ConfigurationError('Unsupported language for shooter: %s, %s, %s' % (alpha3, country, script))
def reverse(self, shooter):
if shooter in self.from_shooter:
return self.from_shooter[shooter]
raise ConfigurationError('Unsupported language code for shooter: %s' % shooter)

View File

@ -0,0 +1,26 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter
from ..exceptions import ConfigurationError
class TheSubDBConverter(LanguageReverseConverter):
def __init__(self):
self.from_thesubdb = {'en': ('eng',), 'es': ('spa',), 'fr': ('fra',), 'it': ('ita',), 'nl': ('nld',),
'pl': ('pol',), 'pt': ('por', 'BR'), 'ro': ('ron',), 'sv': ('swe',), 'tr': ('tur',)}
self.to_thesubdb = {v: k for k, v in self.from_thesubdb.items()}
self.codes = set(self.from_thesubdb.keys())
def convert(self, alpha3, country=None, script=None):
if (alpha3, country) in self.to_thesubdb:
return self.to_thesubdb[(alpha3, country)]
if (alpha3,) in self.to_thesubdb:
return self.to_thesubdb[(alpha3,)]
raise ConfigurationError('Unsupported language for thesubdb: %s, %s, %s' % (alpha3, country, script))
def reverse(self, thesubdb):
if thesubdb in self.from_thesubdb:
return self.from_thesubdb[thesubdb]
raise ConfigurationError('Unsupported language code for thesubdb: %s' % thesubdb)

View File

@ -0,0 +1,25 @@
# -*- coding: utf-8 -*-
from babelfish import LanguageReverseConverter, language_converters
class TVsubtitlesConverter(LanguageReverseConverter):
def __init__(self):
self.alpha2_converter = language_converters['alpha2']
self.from_tvsubtitles = {'br': ('por', 'BR'), 'ua': ('ukr',), 'gr': ('ell',), 'cn': ('zho',), 'jp': ('jpn',),
'cz': ('ces',)}
self.to_tvsubtitles = {v: k for k, v in self.from_tvsubtitles.items()}
self.codes = self.alpha2_converter.codes | set(self.from_tvsubtitles.keys())
def convert(self, alpha3, country=None, script=None):
if (alpha3, country) in self.to_tvsubtitles:
return self.to_tvsubtitles[(alpha3, country)]
if (alpha3,) in self.to_tvsubtitles:
return self.to_tvsubtitles[(alpha3,)]
return self.alpha2_converter.convert(alpha3, country, script)
def reverse(self, tvsubtitles):
if tvsubtitles in self.from_tvsubtitles:
return self.from_tvsubtitles[tvsubtitles]
return self.alpha2_converter.reverse(tvsubtitles)

777
libs/subliminal2.7/core.py Normal file
View File

@ -0,0 +1,777 @@
# -*- coding: utf-8 -*-
from collections import defaultdict
import platform
is_windows_special_path = False
if platform.system() == "Windows":
try:
__file__.decode("ascii")
except UnicodeDecodeError:
is_windows_special_path = True
if not is_windows_special_path:
from concurrent.futures import ThreadPoolExecutor
else:
ThreadPoolExecutor = object
from datetime import datetime
import io
import itertools
import logging
import operator
import os
import socket
from babelfish import Language, LanguageReverseError
from guessit import guessit
from six.moves.xmlrpc_client import ProtocolError
from rarfile import BadRarFile, NotRarFile, RarCannotExec, RarFile
from zipfile import BadZipfile
from ssl import SSLError
import requests
from .exceptions import ServiceUnavailable
from .extensions import provider_manager, refiner_manager
from .score import compute_score as default_compute_score
from .subtitle import SUBTITLE_EXTENSIONS, get_subtitle_path
from .utils import hash_napiprojekt, hash_opensubtitles, hash_shooter, hash_thesubdb
from .video import VIDEO_EXTENSIONS, Episode, Movie, Video
#: Supported archive extensions
ARCHIVE_EXTENSIONS = ('.rar',)
logger = logging.getLogger(__name__)
class ProviderPool(object):
"""A pool of providers with the same API as a single :class:`~subliminal.providers.Provider`.
It has a few extra features:
* Lazy loads providers when needed and supports the `with` statement to :meth:`terminate`
the providers on exit.
* Automatically discard providers on failure.
:param list providers: name of providers to use, if not all.
:param dict provider_configs: provider configuration as keyword arguments per provider name to pass when
instantiating the :class:`~subliminal.providers.Provider`.
"""
def __init__(self, providers=None, provider_configs=None):
#: Name of providers to use
self.providers = providers or provider_manager.names()
#: Provider configuration
self.provider_configs = provider_configs or {}
#: Initialized providers
self.initialized_providers = {}
#: Discarded providers
self.discarded_providers = set()
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
self.terminate()
def __getitem__(self, name):
if name not in self.providers:
raise KeyError
if name not in self.initialized_providers:
logger.info('Initializing provider %s', name)
provider = provider_manager[name].plugin(**self.provider_configs.get(name, {}))
provider.initialize()
self.initialized_providers[name] = provider
return self.initialized_providers[name]
def __delitem__(self, name):
if name not in self.initialized_providers:
raise KeyError(name)
try:
logger.info('Terminating provider %s', name)
self.initialized_providers[name].terminate()
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out, improperly terminated', name)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, improperly terminated', name)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r http error %r, improperly terminated', name, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, improperly terminated', name)
else:
logger.exception('Provider %r SSL error %r, improperly terminated', name, e.args[0])
except:
logger.exception('Provider %r terminated unexpectedly', name)
del self.initialized_providers[name]
def __iter__(self):
return iter(self.initialized_providers)
def list_subtitles_provider(self, provider, video, languages):
"""List subtitles with a single provider.
The video and languages are checked against the provider.
:param str provider: name of the provider.
:param video: video to list subtitles for.
:type video: :class:`~subliminal.video.Video`
:param languages: languages to search for.
:type languages: set of :class:`~babelfish.language.Language`
:return: found subtitles.
:rtype: list of :class:`~subliminal.subtitle.Subtitle` or None
"""
# check video validity
if not provider_manager[provider].plugin.check(video):
logger.info('Skipping provider %r: not a valid video', provider)
return []
# check supported languages
provider_languages = provider_manager[provider].plugin.languages & languages
if not provider_languages:
logger.info('Skipping provider %r: no language to search for', provider)
return []
# list subtitles
logger.info('Listing subtitles with provider %r and languages %r', provider, provider_languages)
try:
return self[provider].list_subtitles(video, provider_languages)
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out', provider)
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable', provider)
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r http error %r', provider, e.response.status_code)
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable', provider)
else:
logger.exception('Provider %r SSL error %r', provider, e.args[0])
except:
logger.exception('Unexpected error in provider %r', provider)
def list_subtitles(self, video, languages):
"""List subtitles.
:param video: video to list subtitles for.
:type video: :class:`~subliminal.video.Video`
:param languages: languages to search for.
:type languages: set of :class:`~babelfish.language.Language`
:return: found subtitles.
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
"""
subtitles = []
for name in self.providers:
# check discarded providers
if name in self.discarded_providers:
logger.debug('Skipping discarded provider %r', name)
continue
# list subtitles
provider_subtitles = self.list_subtitles_provider(name, video, languages)
if provider_subtitles is None:
logger.info('Discarding provider %s', name)
self.discarded_providers.add(name)
continue
# add the subtitles
subtitles.extend(provider_subtitles)
return subtitles
def download_subtitle(self, subtitle):
"""Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`.
:param subtitle: subtitle to download.
:type subtitle: :class:`~subliminal.subtitle.Subtitle`
:return: `True` if the subtitle has been successfully downloaded, `False` otherwise.
:rtype: bool
"""
# check discarded providers
if subtitle.provider_name in self.discarded_providers:
logger.warning('Provider %r is discarded', subtitle.provider_name)
return False
logger.info('Downloading subtitle %r', subtitle)
try:
self[subtitle.provider_name].download_subtitle(subtitle)
except (requests.Timeout, socket.timeout):
logger.error('Provider %r timed out, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
except (ServiceUnavailable, ProtocolError): # OpenSubtitles raises xmlrpclib.ProtocolError when unavailable
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
except requests.exceptions.HTTPError as e:
if e.response.status_code in range(500, 600):
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r http error %r, discarding it', subtitle.provider_name,
e.response.status_code)
self.discarded_providers.add(subtitle.provider_name)
return False
except SSLError as e:
if e.args[0] == 'The read operation timed out':
logger.error('Provider %r unavailable, discarding it', subtitle.provider_name)
else:
logger.exception('Provider %r SSL error %r, discarding it', subtitle.provider_name, e.args[0])
self.discarded_providers.add(subtitle.provider_name)
return False
except (BadRarFile, BadZipfile):
logger.error('Bad archive for %r', subtitle)
return False
except:
logger.exception('Unexpected error in provider %r, discarding it', subtitle.provider_name)
self.discarded_providers.add(subtitle.provider_name)
return False
# check subtitle validity
if not subtitle.is_valid():
logger.error('Invalid subtitle')
return False
return True
def download_best_subtitles(self, subtitles, video, languages, min_score=0, hearing_impaired=False, only_one=False,
compute_score=None):
"""Download the best matching subtitles.
:param subtitles: the subtitles to use.
:type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
:param video: video to download subtitles for.
:type video: :class:`~subliminal.video.Video`
:param languages: languages to download.
:type languages: set of :class:`~babelfish.language.Language`
:param int min_score: minimum score for a subtitle to be downloaded.
:param bool hearing_impaired: hearing impaired preference.
:param bool only_one: download only one subtitle, not one per language.
:param compute_score: function that takes `subtitle` and `video` as positional arguments,
`hearing_impaired` as keyword argument and returns the score.
:return: downloaded subtitles.
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
"""
compute_score = compute_score or default_compute_score
# sort subtitles by score
scored_subtitles = sorted([(s, compute_score(s, video, hearing_impaired=hearing_impaired))
for s in subtitles], key=operator.itemgetter(1), reverse=True)
# download best subtitles, falling back on the next on error
downloaded_subtitles = []
for subtitle, score in scored_subtitles:
# check score
if score < min_score:
logger.info('Score %d is below min_score (%d)', score, min_score)
break
# check downloaded languages
if subtitle.language in set(s.language for s in downloaded_subtitles):
logger.debug('Skipping subtitle: %r already downloaded', subtitle.language)
continue
# download
if self.download_subtitle(subtitle):
downloaded_subtitles.append(subtitle)
# stop when all languages are downloaded
if set(s.language for s in downloaded_subtitles) == languages:
logger.debug('All languages downloaded')
break
# stop if only one subtitle is requested
if only_one:
logger.debug('Only one subtitle downloaded')
break
return downloaded_subtitles
def terminate(self):
"""Terminate all the :attr:`initialized_providers`."""
logger.debug('Terminating initialized providers')
for name in list(self.initialized_providers):
del self[name]
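# Usage sketch (provider name, video and language are assumptions): the pool
# is a context manager, so providers are terminated on exit.
# with ProviderPool(providers=['podnapisi']) as pool:
#     subtitles = pool.list_subtitles(video, {Language('eng')})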
class AsyncProviderPool(ProviderPool):
"""Subclass of :class:`ProviderPool` with asynchronous support for :meth:`~ProviderPool.list_subtitles`.
:param int max_workers: maximum number of threads to use. If `None`, :attr:`max_workers` will be set
to the number of :attr:`~ProviderPool.providers`.
"""
def __init__(self, max_workers=None, *args, **kwargs):
super(AsyncProviderPool, self).__init__(*args, **kwargs)
#: Maximum number of threads to use
self.max_workers = max_workers or len(self.providers)
def list_subtitles_provider(self, provider, video, languages):
return provider, super(AsyncProviderPool, self).list_subtitles_provider(provider, video, languages)
def list_subtitles(self, video, languages):
subtitles = []
with ThreadPoolExecutor(self.max_workers) as executor:
for provider, provider_subtitles in executor.map(self.list_subtitles_provider, self.providers,
itertools.repeat(video, len(self.providers)),
itertools.repeat(languages, len(self.providers))):
# discard provider that failed
if provider_subtitles is None:
logger.info('Discarding provider %s', provider)
self.discarded_providers.add(provider)
continue
# add subtitles
subtitles.extend(provider_subtitles)
return subtitles
def check_video(video, languages=None, age=None, undefined=False):
"""Perform some checks on the `video`.
All the checks are optional. Return `False` if any of these checks fails:
* `languages` already exist in `video`'s :attr:`~subliminal.video.Video.subtitle_languages`.
* `video` is older than `age`.
* `video` has an `undefined` language in :attr:`~subliminal.video.Video.subtitle_languages`.
:param video: video to check.
:type video: :class:`~subliminal.video.Video`
:param languages: desired languages.
:type languages: set of :class:`~babelfish.language.Language`
:param datetime.timedelta age: maximum age of the video.
:param bool undefined: fail on existing undefined language.
:return: `True` if the video passes the checks, `False` otherwise.
:rtype: bool
"""
# language test
if languages and not (languages - video.subtitle_languages):
logger.debug('All languages %r exist', languages)
return False
# age test
if age and video.age > age:
logger.debug('Video is older than %r', age)
return False
# undefined test
if undefined and Language('und') in video.subtitle_languages:
logger.debug('Undefined language found')
return False
return True
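# Illustrative example: a video that already has an English subtitle fails
# the language check for English but passes for another language.
# video.subtitle_languages = {Language('eng')}
# check_video(video, languages={Language('eng')}) -> False
# check_video(video, languages={Language('fra')}) -> True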
def search_external_subtitles(path, directory=None):
"""Search for external subtitles from a video `path` and their associated language.
Unless `directory` is provided, search will be made in the same directory as the video file.
:param str path: path to the video.
:param str directory: directory to search for subtitles.
:return: found subtitles with their languages.
:rtype: dict
"""
# split path
dirpath, filename = os.path.split(path)
dirpath = dirpath or '.'
fileroot, fileext = os.path.splitext(filename)
# search for subtitles
subtitles = {}
for p in os.listdir(directory or dirpath):
# keep only valid subtitle filenames
if not p.startswith(fileroot) or not p.endswith(SUBTITLE_EXTENSIONS):
continue
# extract the potential language code
language = Language('und')
language_code = p[len(fileroot):-len(os.path.splitext(p)[1])].replace(fileext, '').replace('_', '-')[1:]
if language_code:
try:
language = Language.fromietf(language_code)
except (ValueError, LanguageReverseError):
logger.error('Cannot parse language code %r', language_code)
subtitles[p] = language
logger.debug('Found subtitles %r', subtitles)
return subtitles
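# Illustrative example (hypothetical filenames): for /data/Movie.mkv, a
# sibling Movie.en.srt maps to Language('eng') and Movie.srt to Language('und').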
def scan_video(path):
"""Scan a video from a `path`.
:param str path: existing path to the video.
:return: the scanned video.
:rtype: :class:`~subliminal.video.Video`
"""
# check for non-existing path
if not os.path.exists(path):
raise ValueError('Path does not exist')
# check video extension
if not path.endswith(VIDEO_EXTENSIONS):
raise ValueError('%r is not a valid video extension' % os.path.splitext(path)[1])
dirpath, filename = os.path.split(path)
logger.info('Scanning video %r in %r', filename, dirpath)
# guess
video = Video.fromguess(path, guessit(path))
# size and hashes
video.size = os.path.getsize(path)
if video.size > 10485760:
logger.debug('Size is %d', video.size)
video.hashes['opensubtitles'] = hash_opensubtitles(path)
video.hashes['shooter'] = hash_shooter(path)
video.hashes['thesubdb'] = hash_thesubdb(path)
video.hashes['napiprojekt'] = hash_napiprojekt(path)
logger.debug('Computed hashes %r', video.hashes)
else:
logger.warning('Size is lower than 10MB: hashes not computed')
return video
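# Usage sketch (the path is hypothetical; the file must exist and exceed 10MB
# for hashes to be computed):
# video = scan_video('/data/Series.S01E01.720p.HDTV.x264-GRP.mkv')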
def scan_archive(path):
"""Scan an archive from a `path`.
:param str path: existing path to the archive.
:return: the scanned video.
:rtype: :class:`~subliminal.video.Video`
"""
# check for non-existing path
if not os.path.exists(path):
raise ValueError('Path does not exist')
# check video extension
if not path.endswith(ARCHIVE_EXTENSIONS):
raise ValueError('%r is not a valid archive extension' % os.path.splitext(path)[1])
dirpath, filename = os.path.split(path)
logger.info('Scanning archive %r in %r', filename, dirpath)
# rar extension
if filename.endswith('.rar'):
rar = RarFile(path)
# filter on video extensions
rar_filenames = [f for f in rar.namelist() if f.endswith(VIDEO_EXTENSIONS)]
# no video found
if not rar_filenames:
raise ValueError('No video in archive')
# more than one video found
if len(rar_filenames) > 1:
raise ValueError('More than one video in archive')
# guess
rar_filename = rar_filenames[0]
rar_filepath = os.path.join(dirpath, rar_filename)
video = Video.fromguess(rar_filepath, guessit(rar_filepath))
# size
video.size = rar.getinfo(rar_filename).file_size
else:
raise ValueError('Unsupported extension %r' % os.path.splitext(path)[1])
return video
def scan_videos(path, age=None, archives=True):
"""Scan `path` for videos and their subtitles.
See :func:`refine` to find additional information for the video.
:param str path: existing directory path to scan.
:param datetime.timedelta age: maximum age of the video or archive.
:param bool archives: scan videos in archives.
:return: the scanned videos.
:rtype: list of :class:`~subliminal.video.Video`
"""
# check for non-existing path
if not os.path.exists(path):
raise ValueError('Path does not exist')
# check for non-directory path
if not os.path.isdir(path):
raise ValueError('Path is not a directory')
# walk the path
videos = []
for dirpath, dirnames, filenames in os.walk(path):
logger.debug('Walking directory %r', dirpath)
# remove badly encoded and hidden dirnames
for dirname in list(dirnames):
if dirname.startswith('.'):
logger.debug('Skipping hidden dirname %r in %r', dirname, dirpath)
dirnames.remove(dirname)
# scan for videos
for filename in filenames:
# filter on videos and archives
if not (filename.endswith(VIDEO_EXTENSIONS) or archives and filename.endswith(ARCHIVE_EXTENSIONS)):
continue
# skip hidden files
if filename.startswith('.'):
logger.debug('Skipping hidden filename %r in %r', filename, dirpath)
continue
# reconstruct the file path
filepath = os.path.join(dirpath, filename)
# skip links
if os.path.islink(filepath):
logger.debug('Skipping link %r in %r', filename, dirpath)
continue
# skip old files
try:
file_age = datetime.utcfromtimestamp(os.path.getmtime(filepath))
except ValueError:
logger.warning('Could not get age of file %r in %r', filename, dirpath)
continue
else:
if age and datetime.utcnow() - file_age > age:
logger.debug('Skipping old file %r in %r', filename, dirpath)
continue
# scan
if filename.endswith(VIDEO_EXTENSIONS): # video
try:
video = scan_video(filepath)
except ValueError: # pragma: no cover
logger.exception('Error scanning video')
continue
elif archives and filename.endswith(ARCHIVE_EXTENSIONS): # archive
try:
video = scan_archive(filepath)
except (NotRarFile, RarCannotExec, ValueError): # pragma: no cover
logger.exception('Error scanning archive')
continue
else: # pragma: no cover
raise ValueError('Unsupported file %r' % filename)
videos.append(video)
return videos
def refine(video, episode_refiners=None, movie_refiners=None, **kwargs):
"""Refine a video using :ref:`refiners`.
.. note::
Exceptions raised in refiners are silently passed and logged.
:param video: the video to refine.
:type video: :class:`~subliminal.video.Video`
:param tuple episode_refiners: refiners to use for episodes.
:param tuple movie_refiners: refiners to use for movies.
:param \*\*kwargs: additional parameters for the :func:`~subliminal.refiners.refine` functions.
"""
refiners = ()
if isinstance(video, Episode):
refiners = episode_refiners or ('metadata', 'tvdb', 'omdb')
elif isinstance(video, Movie):
refiners = movie_refiners or ('metadata', 'omdb')
for refiner in refiners:
logger.info('Refining video with %s', refiner)
try:
refiner_manager[refiner].plugin(video, **kwargs)
except:
logger.error('Failed to refine video %r', video.name)
logger.debug('Refiner exception:', exc_info=True)
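# Usage sketch (refiner availability depends on the installed extras; extra
# keyword arguments are forwarded to each refiner):
# refine(video, episode_refiners=('metadata',), embedded_subtitles=False)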
def list_subtitles(videos, languages, pool_class=ProviderPool, **kwargs):
"""List subtitles.
The `videos` must pass the `languages` check of :func:`check_video`.
:param videos: videos to list subtitles for.
:type videos: set of :class:`~subliminal.video.Video`
:param languages: languages to search for.
:type languages: set of :class:`~babelfish.language.Language`
:param pool_class: class to use as provider pool.
:type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
:param \*\*kwargs: additional parameters for the provided `pool_class` constructor.
:return: found subtitles per video.
:rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle`
"""
listed_subtitles = defaultdict(list)
# check videos
checked_videos = []
for video in videos:
if not check_video(video, languages=languages):
logger.info('Skipping video %r', video)
continue
checked_videos.append(video)
# return immediately if no video passed the checks
if not checked_videos:
return listed_subtitles
# list subtitles
with pool_class(**kwargs) as pool:
for video in checked_videos:
logger.info('Listing subtitles for %r', video)
subtitles = pool.list_subtitles(video, languages - video.subtitle_languages)
listed_subtitles[video].extend(subtitles)
logger.info('Found %d subtitle(s)', len(subtitles))
return listed_subtitles
def download_subtitles(subtitles, pool_class=ProviderPool, **kwargs):
"""Download :attr:`~subliminal.subtitle.Subtitle.content` of `subtitles`.
:param subtitles: subtitles to download.
:type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
:param pool_class: class to use as provider pool.
:type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
:param \*\*kwargs: additional parameters for the provided `pool_class` constructor.
"""
with pool_class(**kwargs) as pool:
for subtitle in subtitles:
logger.info('Downloading subtitle %r', subtitle)
pool.download_subtitle(subtitle)
def download_best_subtitles(videos, languages, min_score=0, hearing_impaired=False, only_one=False, compute_score=None,
pool_class=ProviderPool, **kwargs):
"""List and download the best matching subtitles.
The `videos` must pass the `languages` and `undefined` (`only_one`) checks of :func:`check_video`.
:param videos: videos to download subtitles for.
:type videos: set of :class:`~subliminal.video.Video`
:param languages: languages to download.
:type languages: set of :class:`~babelfish.language.Language`
:param int min_score: minimum score for a subtitle to be downloaded.
:param bool hearing_impaired: hearing impaired preference.
:param bool only_one: download only one subtitle, not one per language.
:param compute_score: function that takes `subtitle` and `video` as positional arguments,
`hearing_impaired` as keyword argument and returns the score.
:param pool_class: class to use as provider pool.
:type pool_class: :class:`ProviderPool`, :class:`AsyncProviderPool` or similar
:param \*\*kwargs: additional parameters for the provided `pool_class` constructor.
:return: downloaded subtitles per video.
:rtype: dict of :class:`~subliminal.video.Video` to list of :class:`~subliminal.subtitle.Subtitle`
"""
downloaded_subtitles = defaultdict(list)
# check videos
checked_videos = []
for video in videos:
if not check_video(video, languages=languages, undefined=only_one):
logger.info('Skipping video %r', video)
continue
checked_videos.append(video)
# return immediately if no video passed the checks
if not checked_videos:
return downloaded_subtitles
# download best subtitles
with pool_class(**kwargs) as pool:
for video in checked_videos:
logger.info('Downloading best subtitles for %r', video)
subtitles = pool.download_best_subtitles(pool.list_subtitles(video, languages - video.subtitle_languages),
video, languages, min_score=min_score,
hearing_impaired=hearing_impaired, only_one=only_one,
compute_score=compute_score)
logger.info('Downloaded %d subtitle(s)', len(subtitles))
downloaded_subtitles[video].extend(subtitles)
return downloaded_subtitles
def save_subtitles(video, subtitles, single=False, directory=None, encoding=None):
"""Save subtitles on filesystem.
Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other subtitles
with the same language are silently ignored.
The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.
:param video: video of the subtitles.
:type video: :class:`~subliminal.video.Video`
:param subtitles: subtitles to save.
:type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
:param bool single: save a single subtitle, default is to save one subtitle per language.
:param str directory: path to directory where to save the subtitles, default is next to the video.
:param str encoding: encoding in which to save the subtitles, default is to keep original encoding.
:return: the saved subtitles
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
"""
saved_subtitles = []
for subtitle in subtitles:
# check content
if subtitle.content is None:
logger.error('Skipping subtitle %r: no content', subtitle)
continue
# check language
if subtitle.language in set(s.language for s in saved_subtitles):
logger.debug('Skipping subtitle %r: language already saved', subtitle)
continue
# create subtitle path
subtitle_path = get_subtitle_path(video.name, None if single else subtitle.language)
if directory is not None:
subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1])
# save content as is or in the specified encoding
logger.info('Saving %r to %r', subtitle, subtitle_path)
if encoding is None:
with io.open(subtitle_path, 'wb') as f:
f.write(subtitle.content)
else:
with io.open(subtitle_path, 'w', encoding=encoding) as f:
f.write(subtitle.text)
saved_subtitles.append(subtitle)
# check single
if single:
break
return saved_subtitles
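# End-to-end sketch (illustrative path and language):
# video = scan_video('/data/Movie.2016.1080p.mkv')
# subtitles = download_best_subtitles({video}, {Language('eng')})
# save_subtitles(video, subtitles[video])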

View File

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
class Error(Exception):
"""Base class for exceptions in subliminal."""
pass
class ProviderError(Error):
"""Exception raised by providers."""
pass
class ConfigurationError(ProviderError):
"""Exception raised by providers when badly configured."""
pass
class AuthenticationError(ProviderError):
"""Exception raised by providers when authentication failed."""
pass
class ServiceUnavailable(ProviderError):
"""Exception raised when status is '503 Service Unavailable'."""
pass
class DownloadLimitExceeded(ProviderError):
"""Exception raised by providers when download limit is exceeded."""
pass

View File

@ -0,0 +1,105 @@
# -*- coding: utf-8 -*-
from pkg_resources import EntryPoint
from stevedore import ExtensionManager
class RegistrableExtensionManager(ExtensionManager):
""":class:`~stevedore.extensions.ExtensionManager` with support for registration.
It allows loading of internal extensions without setup and registering/unregistering additional extensions.
Loading is done in this order:
* Entry point extensions
* Internal extensions
* Registered extensions
:param str namespace: namespace argument for :class:`~stevedore.extensions.ExtensionManager`.
:param list internal_extensions: internal extensions to use with entry point syntax.
:param \*\*kwargs: additional parameters for the :class:`~stevedore.extensions.ExtensionManager` constructor.
"""
def __init__(self, namespace, internal_extensions, **kwargs):
#: Registered extensions with entry point syntax
self.registered_extensions = []
#: Internal extensions with entry point syntax
self.internal_extensions = internal_extensions
super(RegistrableExtensionManager, self).__init__(namespace, **kwargs)
def list_entry_points(self):
# copy of default extensions
eps = list(super(RegistrableExtensionManager, self).list_entry_points())
# internal extensions
for iep in self.internal_extensions:
ep = EntryPoint.parse(iep)
if ep.name not in [e.name for e in eps]:
eps.append(ep)
# registered extensions
for rep in self.registered_extensions:
ep = EntryPoint.parse(rep)
if ep.name not in [e.name for e in eps]:
eps.append(ep)
return eps
def register(self, entry_point):
"""Register an extension
:param str entry_point: extension to register (entry point syntax).
:raise: ValueError if already registered.
"""
if entry_point in self.registered_extensions:
raise ValueError('Extension already registered')
ep = EntryPoint.parse(entry_point)
if ep.name in self.names():
raise ValueError('An extension with the same name already exists')
ext = self._load_one_plugin(ep, False, (), {}, False)
self.extensions.append(ext)
if self._extensions_by_name is not None:
self._extensions_by_name[ext.name] = ext
self.registered_extensions.insert(0, entry_point)
def unregister(self, entry_point):
"""Unregister an extension.
:param str entry_point: extension to unregister (entry point syntax).
"""
if entry_point not in self.registered_extensions:
raise ValueError('Extension not registered')
ep = EntryPoint.parse(entry_point)
self.registered_extensions.remove(entry_point)
if self._extensions_by_name is not None:
del self._extensions_by_name[ep.name]
for i, ext in enumerate(self.extensions):
if ext.name == ep.name:
del self.extensions[i]
break
#: Provider manager
provider_manager = RegistrableExtensionManager('subliminal.providers', [
'addic7ed = subliminal.providers.addic7ed:Addic7edProvider',
'legendastv = subliminal.providers.legendastv:LegendasTVProvider',
'opensubtitles = subliminal.providers.opensubtitles:OpenSubtitlesProvider',
'podnapisi = subliminal.providers.podnapisi:PodnapisiProvider',
'shooter = subliminal.providers.shooter:ShooterProvider',
'thesubdb = subliminal.providers.thesubdb:TheSubDBProvider',
'tvsubtitles = subliminal.providers.tvsubtitles:TVsubtitlesProvider'
])
#: Refiner manager
refiner_manager = RegistrableExtensionManager('subliminal.refiners', [
'metadata = subliminal.refiners.metadata:refine',
'omdb = subliminal.refiners.omdb:refine',
'tvdb = subliminal.refiners.tvdb:refine'
])
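# Registration sketch (the entry point below is hypothetical):
# provider_manager.register('mysubs = mypackage.providers.mysubs:MySubsProvider')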

View File

@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
import logging
from bs4 import BeautifulSoup, FeatureNotFound
from six.moves.xmlrpc_client import SafeTransport
from ..video import Episode, Movie
logger = logging.getLogger(__name__)
class TimeoutSafeTransport(SafeTransport):
"""Timeout support for ``xmlrpc.client.SafeTransport``."""
def __init__(self, timeout, *args, **kwargs):
SafeTransport.__init__(self, *args, **kwargs)
self.timeout = timeout
def make_connection(self, host):
c = SafeTransport.make_connection(self, host)
c.timeout = self.timeout
return c
class ParserBeautifulSoup(BeautifulSoup):
"""A ``bs4.BeautifulSoup`` that picks the first parser available in `parsers`.
:param markup: markup for the ``bs4.BeautifulSoup``.
:param list parsers: parser names, in order of preference.
"""
def __init__(self, markup, parsers, **kwargs):
# reject features
if set(parsers).intersection({'fast', 'permissive', 'strict', 'xml', 'html', 'html5'}):
raise ValueError('Features not allowed, only parser names')
# reject some kwargs
if 'features' in kwargs:
raise ValueError('Cannot use features kwarg')
if 'builder' in kwargs:
raise ValueError('Cannot use builder kwarg')
# pick the first parser available
for parser in parsers:
try:
super(ParserBeautifulSoup, self).__init__(markup, parser, **kwargs)
return
except FeatureNotFound:
pass
raise FeatureNotFound
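# Usage sketch (parser availability depends on the installed libraries):
# soup = ParserBeautifulSoup(markup, ['lxml', 'html.parser'])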
class Provider(object):
"""Base class for providers.
If any configuration is possible for the provider, like credentials, it must take place during instantiation.
:raise: :class:`~subliminal.exceptions.ConfigurationError` if there is a configuration error
"""
#: Supported set of :class:`~babelfish.language.Language`
languages = set()
#: Supported video types
video_types = (Episode, Movie)
#: Required hash, if any
required_hash = None
#: Subtitle class to use
subtitle_class = None
def __enter__(self):
self.initialize()
return self
def __exit__(self, exc_type, exc_value, traceback):
self.terminate()
def initialize(self):
"""Initialize the provider.
Must be called when starting to work with the provider. This is the place for network initialization
or login operations.
.. note::
This is called automatically when entering the `with` statement
"""
raise NotImplementedError
def terminate(self):
"""Terminate the provider.
Must be called when done with the provider. This is the place for network shutdown or logout operations.
.. note::
This is called automatically when exiting the `with` statement
"""
raise NotImplementedError
@classmethod
def check(cls, video):
"""Check if the `video` can be processed.
The `video` is considered invalid if not an instance of :attr:`video_types` or if the :attr:`required_hash` is
not present in the :attr:`~subliminal.video.Video.hashes` attribute of the `video`.
:param video: the video to check.
:type video: :class:`~subliminal.video.Video`
:return: `True` if the `video` is valid, `False` otherwise.
:rtype: bool
"""
if not isinstance(video, cls.video_types):
return False
if cls.required_hash is not None and cls.required_hash not in video.hashes:
return False
return True
def query(self, *args, **kwargs):
"""Query the provider for subtitles.
Arguments should match as closely as possible the actual parameters for querying the provider.
:return: found subtitles.
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
:raise: :class:`~subliminal.exceptions.ProviderError`
"""
raise NotImplementedError
def list_subtitles(self, video, languages):
"""List subtitles for the `video` with the given `languages`.
This will call the :meth:`query` method internally. The parameters passed to the :meth:`query` method may
vary depending on the amount of information available in the `video`.
:param video: video to list subtitles for.
:type video: :class:`~subliminal.video.Video`
:param languages: languages to search for.
:type languages: set of :class:`~babelfish.language.Language`
:return: found subtitles.
:rtype: list of :class:`~subliminal.subtitle.Subtitle`
:raise: :class:`~subliminal.exceptions.ProviderError`
"""
raise NotImplementedError
def download_subtitle(self, subtitle):
"""Download `subtitle`'s :attr:`~subliminal.subtitle.Subtitle.content`.
:param subtitle: subtitle to download.
:type subtitle: :class:`~subliminal.subtitle.Subtitle`
:raise: :class:`~subliminal.exceptions.ProviderError`
"""
raise NotImplementedError
def __repr__(self):
return '<%s [%r]>' % (self.__class__.__name__, self.video_types)
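# Minimal subclass sketch (illustrative, assumes babelfish.Language is
# imported): only the abstract methods need to be implemented.
# class NoopProvider(Provider):
#     languages = {Language('eng')}
#     def initialize(self): pass
#     def terminate(self): pass
#     def list_subtitles(self, video, languages): return []
#     def download_subtitle(self, subtitle): pass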

View File

@ -0,0 +1,321 @@
# -*- coding: utf-8 -*-
import logging
import re
from babelfish import Language, language_converters
from guessit import guessit
from requests import Session
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, DownloadLimitExceeded
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group
from ..video import Episode
logger = logging.getLogger(__name__)
language_converters.register('addic7ed = subliminal.converters.addic7ed:Addic7edConverter')
# Series cell matching regex
show_cells_re = re.compile(b'<td class="version">.*?</td>', re.DOTALL)
#: Series header parsing regex
series_year_re = re.compile(r'^(?P<series>[ \w\'.:(),*&!?-]+?)(?: \((?P<year>\d{4})\))?$')
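# Illustrative example: series_year_re splits an optional trailing year.
# series_year_re.match('Game of Thrones (2011)').groupdict()
# -> {'series': 'Game of Thrones', 'year': '2011'}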
class Addic7edSubtitle(Subtitle):
"""Addic7ed Subtitle."""
provider_name = 'addic7ed'
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, year, version,
download_link):
super(Addic7edSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
self.series = series
self.season = season
self.episode = episode
self.title = title
self.year = year
self.version = version
self.download_link = download_link
@property
def id(self):
return self.download_link
def get_matches(self, video):
matches = set()
# series name
if video.series and sanitize(self.series) in (
sanitize(name) for name in [video.series] + video.alternative_series):
matches.add('series')
# season
if video.season and self.season == video.season:
matches.add('season')
# episode
if video.episode and self.episode == video.episode:
matches.add('episode')
# title of the episode
if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title')
# year
if (video.original_series and self.year is None) or (video.year and video.year == self.year):
matches.add('year')
# release_group
if (video.release_group and self.version and
any(r in sanitize_release_group(self.version)
for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
matches.add('release_group')
# resolution
if video.resolution and self.version and video.resolution in self.version.lower():
matches.add('resolution')
# format
if video.format and self.version and video.format.lower() in self.version.lower():
matches.add('format')
# other properties
matches |= guess_matches(video, guessit(self.version), partial=True)
return matches
class Addic7edProvider(Provider):
"""Addic7ed Provider."""
languages = {Language('por', 'BR')} | {Language(l) for l in [
'ara', 'aze', 'ben', 'bos', 'bul', 'cat', 'ces', 'dan', 'deu', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg',
'heb', 'hrv', 'hun', 'hye', 'ind', 'ita', 'jpn', 'kor', 'mkd', 'msa', 'nld', 'nor', 'pol', 'por', 'ron', 'rus',
'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
]}
video_types = (Episode,)
server_url = 'http://www.addic7ed.com/'
subtitle_class = Addic7edSubtitle
def __init__(self, username=None, password=None):
if any((username, password)) and not all((username, password)):
raise ConfigurationError('Username and password must be specified')
self.username = username
self.password = password
self.logged_in = False
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login
if self.username and self.password:
logger.info('Logging in')
data = {'username': self.username, 'password': self.password, 'Submit': 'Log in'}
r = self.session.post(self.server_url + 'dologin.php', data, allow_redirects=False, timeout=10)
if r.status_code != 302:
raise AuthenticationError(self.username)
logger.debug('Logged in')
self.logged_in = True
def terminate(self):
# logout
if self.logged_in:
logger.info('Logging out')
r = self.session.get(self.server_url + 'logout.php', timeout=10)
r.raise_for_status()
logger.debug('Logged out')
self.logged_in = False
self.session.close()
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def _get_show_ids(self):
"""Get the ``dict`` of show ids per series by querying the `shows.php` page.
:return: show id per series, lower case and without quotes.
:rtype: dict
"""
# get the show page
logger.info('Getting show ids')
r = self.session.get(self.server_url + 'shows.php', timeout=10)
r.raise_for_status()
# The lxml parser seems to fail when parsing Addic7ed.com HTML markup.
# The last known version to work properly is 3.6.4 (the next version, 3.7.0, fails).
# The site's markup appears to be malformed, so strip it down to only what's needed.
show_cells = re.findall(show_cells_re, r.content)
if show_cells:
soup = ParserBeautifulSoup(b''.join(show_cells), ['lxml', 'html.parser'])
else:
# If RegEx fails, fall back to original r.content and use 'html.parser'
soup = ParserBeautifulSoup(r.content, ['html.parser'])
# populate the show ids
show_ids = {}
for show in soup.select('td.version > h3 > a[href^="/show/"]'):
show_ids[sanitize(show.text)] = int(show['href'][6:])
logger.debug('Found %d show ids', len(show_ids))
return show_ids
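# Example shape of the returned mapping (illustrative ids, not from the
# original file); sanitized names are lowercase and quote-free:
#     {'game of thrones': 1245, 'the walking dead': 1183}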
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def _search_show_id(self, series, year=None):
"""Search the show id from the `series` and `year`.
:param str series: series of the episode.
:param year: year of the series, if any.
:type year: int
:return: the show id, if found.
:rtype: int
"""
# addic7ed doesn't support search with quotes
series = series.replace('\'', ' ')
# build the params
series_year = '%s %d' % (series, year) if year is not None else series
params = {'search': series_year, 'Submit': 'Search'}
# make the search
logger.info('Searching show ids with %r', params)
r = self.session.get(self.server_url + 'search.php', params=params, timeout=10)
r.raise_for_status()
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# get the suggestion
suggestion = soup.select('span.titulo > a[href^="/show/"]')
if not suggestion:
logger.warning('Show id not found: no suggestion')
return None
if sanitize(suggestion[0].i.text.replace('\'', ' ')) != sanitize(series_year):
logger.warning('Show id not found: suggestion does not match')
return None
show_id = int(suggestion[0]['href'][6:])
logger.debug('Found show id %d', show_id)
return show_id
def get_show_id(self, series, year=None, country_code=None):
"""Get the best matching show id for `series`, `year` and `country_code`.
First search in the result of :meth:`_get_show_ids` and fallback on a search with :meth:`_search_show_id`.
:param str series: series of the episode.
:param year: year of the series, if any.
:type year: int
:param country_code: country code of the series, if any.
:type country_code: str
:return: the show id, if found.
:rtype: int
"""
series_sanitized = sanitize(series).lower()
show_ids = self._get_show_ids()
show_id = None
# attempt with country
if not show_id and country_code:
logger.debug('Getting show id with country')
show_id = show_ids.get('%s %s' % (series_sanitized, country_code.lower()))
# attempt with year
if not show_id and year:
logger.debug('Getting show id with year')
show_id = show_ids.get('%s %d' % (series_sanitized, year))
# attempt clean
if not show_id:
logger.debug('Getting show id')
show_id = show_ids.get(series_sanitized)
# search as last resort
if not show_id:
logger.warning('Series %s not found in show ids', series)
show_id = self._search_show_id(series)
return show_id
def query(self, show_id, series, season, year=None, country=None):
# get the page of the season of the show
logger.info('Getting the page of show id %d, season %d', show_id, season)
r = self.session.get(self.server_url + 'show/%d' % show_id, params={'season': season}, timeout=10)
r.raise_for_status()
if not r.content:
# Provider returns a status of 304 Not Modified with empty content;
# raise_for_status won't raise an exception for that status code
logger.debug('No data returned from provider')
return []
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# parse the series and year from the page header
match = series_year_re.match(soup.select('#header font')[0].text.strip()[:-10])
series = match.group('series')
year = int(match.group('year')) if match.group('year') else None
subtitles = []
# loop over subtitle rows
for row in soup.select('tr.epeven'):
cells = row('td')
# ignore incomplete subtitles
status = cells[5].text
if status != 'Completed':
logger.debug('Ignoring subtitle with status %s', status)
continue
# read the item
language = Language.fromaddic7ed(cells[3].text)
hearing_impaired = bool(cells[6].text)
page_link = self.server_url + cells[2].a['href'][1:]
season = int(cells[0].text)
episode = int(cells[1].text)
title = cells[2].text
version = cells[4].text
download_link = cells[9].a['href'][1:]
subtitle = self.subtitle_class(language, hearing_impaired, page_link, series, season, episode, title, year,
version, download_link)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
return subtitles
def list_subtitles(self, video, languages):
# lookup show_id
titles = [video.series] + video.alternative_series
show_id = None
for title in titles:
show_id = self.get_show_id(title, video.year)
if show_id is not None:
break
# query for subtitles with the show_id
if show_id is not None:
subtitles = [s for s in self.query(show_id, title, video.season, video.year)
if s.language in languages and s.episode == video.episode]
if subtitles:
return subtitles
else:
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
return []
def download_subtitle(self, subtitle):
# download the subtitle
logger.info('Downloading subtitle %r', subtitle)
r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link},
timeout=10)
r.raise_for_status()
if not r.content:
# Provider returns a status of 304 Not Modified with empty content;
# raise_for_status won't raise an exception for that status code
logger.debug('Unable to download subtitle. No data returned from provider')
return
# detect download limit exceeded
if r.headers['Content-Type'] == 'text/html':
raise DownloadLimitExceeded
subtitle.content = fix_line_ending(r.content)

View File

@ -0,0 +1,522 @@
# -*- coding: utf-8 -*-
import io
import json
import logging
import os
import re
from babelfish import Language, language_converters
from datetime import datetime, timedelta
from dogpile.cache.api import NO_VALUE
from guessit import guessit
import pytz
import rarfile
from rarfile import RarFile, is_rarfile
from requests import Session
from zipfile import ZipFile, is_zipfile
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError, ServiceUnavailable
from ..subtitle import SUBTITLE_EXTENSIONS, Subtitle, fix_line_ending, guess_matches, sanitize
from ..video import Episode, Movie
logger = logging.getLogger(__name__)
language_converters.register('legendastv = subliminal.converters.legendastv:LegendasTVConverter')
# Configure :mod:`rarfile` to use the same path separator as :mod:`zipfile`
rarfile.PATH_SEP = '/'
#: Conversion map for types
type_map = {'M': 'movie', 'S': 'episode', 'C': 'episode'}
#: BR title season parsing regex
season_re = re.compile(r' - (?P<season>\d+)(\xaa|a|st|nd|rd|th) (temporada|season)', re.IGNORECASE)
#: Downloads parsing regex
downloads_re = re.compile(r'(?P<downloads>\d+) downloads')
#: Rating parsing regex
rating_re = re.compile(r'nota (?P<rating>\d+)')
#: Timestamp parsing regex
timestamp_re = re.compile(r'(?P<day>\d+)/(?P<month>\d+)/(?P<year>\d+) - (?P<hour>\d+):(?P<minute>\d+)')
#: Title with year/country regex
title_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
#: Cache key for releases
releases_key = __name__ + ':releases|{archive_id}|{archive_name}'
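# Illustrative examples (not part of the original file) of the parsing regexes:
# >>> title_re.match('Lost (2004)').groups()
# ('Lost', '2004', None)
# >>> title_re.match('The Office (US)').groups()
# ('The Office', None, 'US')
# >>> season_re.search('Lost - 4a Temporada').group('season')
# '4'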
class LegendasTVArchive(object):
"""LegendasTV Archive.
:param str id: identifier.
:param str name: name.
:param bool pack: contains subtitles for multiple episodes.
:param bool featured: whether the archive is featured.
:param str link: link.
:param int downloads: download count.
:param int rating: rating (0-10).
:param timestamp: timestamp.
:type timestamp: datetime.datetime
"""
def __init__(self, id, name, pack, featured, link, downloads=0, rating=0, timestamp=None):
#: Identifier
self.id = id
#: Name
self.name = name
#: Pack
self.pack = pack
#: Featured
self.featured = featured
#: Link
self.link = link
#: Download count
self.downloads = downloads
#: Rating (0-10)
self.rating = rating
#: Timestamp
self.timestamp = timestamp
#: Compressed content as :class:`rarfile.RarFile` or :class:`zipfile.ZipFile`
self.content = None
def __repr__(self):
return '<%s [%s] %r>' % (self.__class__.__name__, self.id, self.name)
class LegendasTVSubtitle(Subtitle):
"""LegendasTV Subtitle."""
provider_name = 'legendastv'
def __init__(self, language, type, title, year, imdb_id, season, archive, name):
super(LegendasTVSubtitle, self).__init__(language, page_link=archive.link)
self.type = type
self.title = title
self.year = year
self.imdb_id = imdb_id
self.season = season
self.archive = archive
self.name = name
@property
def id(self):
return '%s-%s' % (self.archive.id, self.name.lower())
def get_matches(self, video, hearing_impaired=False):
matches = set()
# episode
if isinstance(video, Episode) and self.type == 'episode':
# series
if video.series and (sanitize(self.title) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
matches.add('series')
# year
if (video.original_series and self.year is None) or (video.year and video.year == self.year):
matches.add('year')
# imdb_id
if video.series_imdb_id and self.imdb_id == video.series_imdb_id:
matches.add('series_imdb_id')
# movie
elif isinstance(video, Movie) and self.type == 'movie':
# title
if video.title and (sanitize(self.title) in (
sanitize(name) for name in [video.title] + video.alternative_titles)):
matches.add('title')
# year
if video.year and self.year == video.year:
matches.add('year')
# imdb_id
if video.imdb_id and self.imdb_id == video.imdb_id:
matches.add('imdb_id')
# name
matches |= guess_matches(video, guessit(self.name, {'type': self.type}))
return matches
class LegendasTVProvider(Provider):
"""LegendasTV Provider.
:param str username: username.
:param str password: password.
"""
languages = {Language.fromlegendastv(l) for l in language_converters['legendastv'].codes}
server_url = 'http://legendas.tv/'
subtitle_class = LegendasTVSubtitle
def __init__(self, username=None, password=None):
# the provider needs UNRAR installed; if it is not available, raise ConfigurationError
try:
rarfile.custom_check(rarfile.UNRAR_TOOL)
except rarfile.RarExecError:
raise ConfigurationError('UNRAR tool not available')
if any((username, password)) and not all((username, password)):
raise ConfigurationError('Username and password must be specified')
self.username = username
self.password = password
self.logged_in = False
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
# login
if self.username and self.password:
logger.info('Logging in')
data = {'_method': 'POST', 'data[User][username]': self.username, 'data[User][password]': self.password}
r = self.session.post(self.server_url + 'login', data, allow_redirects=False, timeout=10)
raise_for_status(r)
soup = ParserBeautifulSoup(r.content, ['html.parser'])
if soup.find('div', {'class': 'alert-error'}, string=re.compile(u'Usuário ou senha inválidos')):
raise AuthenticationError(self.username)
logger.debug('Logged in')
self.logged_in = True
def terminate(self):
# logout
if self.logged_in:
logger.info('Logging out')
r = self.session.get(self.server_url + 'users/logout', allow_redirects=False, timeout=10)
raise_for_status(r)
logger.debug('Logged out')
self.logged_in = False
self.session.close()
@staticmethod
def is_valid_title(title, title_id, sanitized_title, season, year):
"""Check if is a valid title."""
sanitized_result = sanitize(title['title'])
if sanitized_result != sanitized_title:
logger.debug("Mismatched title, discarding title %d (%s)",
title_id, sanitized_result)
return
# episode type
if season:
# discard mismatches on type
if title['type'] != 'episode':
logger.debug("Mismatched 'episode' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on season
if 'season' not in title or title['season'] != season:
logger.debug('Mismatched season %s, discarding title %d (%s)',
title.get('season'), title_id, sanitized_result)
return
# movie type
else:
# discard mismatches on type
if title['type'] != 'movie':
logger.debug("Mismatched 'movie' type, discarding title %d (%s)", title_id, sanitized_result)
return
# discard mismatches on year
if year is not None and 'year' in title and title['year'] != year:
logger.debug("Mismatched movie year, discarding title %d (%s)", title_id, sanitized_result)
return
return True
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME, should_cache_fn=lambda value: value)
def search_titles(self, title, season, title_year):
"""Search for titles matching the `title`.
For episodes, each season has its own title.
:param str title: the title to search for.
:param int season: season of the title.
:param int title_year: year of the title.
:return: found titles.
:rtype: dict
"""
titles = {}
sanitized_titles = [sanitize(title)]
ignore_characters = {'\'', '.'}
if any(c in title for c in ignore_characters):
sanitized_titles.append(sanitize(title, ignore_characters=ignore_characters))
for sanitized_title in sanitized_titles:
# make the query
if season:
logger.info('Searching episode title %r for season %r', sanitized_title, season)
else:
logger.info('Searching movie title %r', sanitized_title)
r = self.session.get(self.server_url + 'legenda/sugestao/{}'.format(sanitized_title), timeout=10)
raise_for_status(r)
results = json.loads(r.text)
# loop over results
for result in results:
source = result['_source']
# extract id
title_id = int(source['id_filme'])
# extract type
title = {'type': type_map[source['tipo']]}
# extract title, year and country
name, year, country = title_re.match(source['dsc_nome']).groups()
title['title'] = name
# extract imdb_id
if source['id_imdb'] != '0':
if not source['id_imdb'].startswith('tt'):
title['imdb_id'] = 'tt' + source['id_imdb'].zfill(7)
else:
title['imdb_id'] = source['id_imdb']
# extract season
if title['type'] == 'episode':
if source['temporada'] and source['temporada'].isdigit():
title['season'] = int(source['temporada'])
else:
match = season_re.search(source['dsc_nome_br'])
if match:
title['season'] = int(match.group('season'))
else:
logger.debug('No season detected for title %d (%s)', title_id, name)
# extract year
if year:
title['year'] = int(year)
elif source['dsc_data_lancamento'] and source['dsc_data_lancamento'].isdigit():
# year is based on season air date hence the adjustment
title['year'] = int(source['dsc_data_lancamento']) - title.get('season', 1) + 1
# add the title only if it is valid
# check against the title without ignored characters
if self.is_valid_title(title, title_id, sanitized_titles[0], season, title_year):
titles[title_id] = title
logger.debug('Found %d titles', len(titles))
return titles
@region.cache_on_arguments(expiration_time=timedelta(minutes=15).total_seconds())
def get_archives(self, title_id, language_code, title_type, season, episode):
"""Get the archive list from a given `title_id`, `language_code`, `title_type`, `season` and `episode`.
:param int title_id: title id.
:param int language_code: language code.
:param str title_type: episode or movie
:param int season: season
:param int episode: episode
:return: the archives.
:rtype: list of :class:`LegendasTVArchive`
"""
archives = []
page = 0
while True:
# get the archive page
url = self.server_url + 'legenda/busca/-/{language}/-/{page}/{title}'.format(
language=language_code, page=page, title=title_id)
r = self.session.get(url)
raise_for_status(r)
# parse the results
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
for archive_soup in soup.select('div.list_element > article > div > div.f_left'):
# create archive
archive = LegendasTVArchive(archive_soup.a['href'].split('/')[2],
archive_soup.a.text,
'pack' in archive_soup.parent['class'],
'destaque' in archive_soup.parent['class'],
self.server_url + archive_soup.a['href'][1:])
# clean name of path separators and pack flags
clean_name = archive.name.replace('/', '-')
if archive.pack and clean_name.startswith('(p)'):
clean_name = clean_name[3:]
# guess from name
guess = guessit(clean_name, {'type': title_type})
# episode
if season and episode:
# discard mismatches on episode in non-pack archives
# Guessit may return an int for a single episode or a list for a multi-episode release.
# Check if the archive name contains multiple episode releases.
if not archive.pack and 'episode' in guess:
wanted_episode = set(episode) if isinstance(episode, list) else {episode}
archive_episode = set(guess['episode']) if isinstance(guess['episode'], list) else {guess['episode']}
if not wanted_episode.intersection(archive_episode):
logger.debug('Mismatched episode %s, discarding archive: %s', guess['episode'], clean_name)
continue
# extract text containing downloads, rating and timestamp
data_text = archive_soup.find('p', class_='data').text
# match downloads
archive.downloads = int(downloads_re.search(data_text).group('downloads'))
# match rating
match = rating_re.search(data_text)
if match:
archive.rating = int(match.group('rating'))
# match timestamp and validate it
time_data = {k: int(v) for k, v in timestamp_re.search(data_text).groupdict().items()}
archive.timestamp = pytz.timezone('America/Sao_Paulo').localize(datetime(**time_data))
if archive.timestamp > datetime.utcnow().replace(tzinfo=pytz.utc):
raise ProviderError('Archive timestamp is in the future')
# add archive
logger.info('Found archive for title %d and language %d at page %s: %s',
title_id, language_code, page, archive)
archives.append(archive)
# stop on last page
if soup.find('a', attrs={'class': 'load_more'}, string='carregar mais') is None:
break
# increment page count
page += 1
logger.debug('Found %d archives', len(archives))
return archives
def download_archive(self, archive):
"""Download an archive's :attr:`~LegendasTVArchive.content`.
:param archive: the archive to download :attr:`~LegendasTVArchive.content` of.
:type archive: :class:`LegendasTVArchive`
"""
logger.info('Downloading archive %s', archive.id)
r = self.session.get(self.server_url + 'downloadarquivo/{}'.format(archive.id))
raise_for_status(r)
# open the archive
archive_stream = io.BytesIO(r.content)
if is_rarfile(archive_stream):
logger.debug('Identified rar archive')
archive.content = RarFile(archive_stream)
elif is_zipfile(archive_stream):
logger.debug('Identified zip archive')
archive.content = ZipFile(archive_stream)
else:
raise ValueError('Not a valid archive')
def query(self, language, title, season=None, episode=None, year=None):
# search for titles
titles = self.search_titles(title, season, year)
subtitles = []
# iterate over titles
for title_id, t in titles.items():
logger.info('Getting archives for title %d and language %d', title_id, language.legendastv)
archives = self.get_archives(title_id, language.legendastv, t['type'], season, episode)
if not archives:
logger.info('No archives found for title %d and language %d', title_id, language.legendastv)
# iterate over title's archives
for a in archives:
# compute an expiration time based on the archive timestamp
expiration_time = (datetime.utcnow().replace(tzinfo=pytz.utc) - a.timestamp).total_seconds()
# attempt to get the releases from the cache
cache_key = releases_key.format(archive_id=a.id, archive_name=a.name)
releases = region.get(cache_key, expiration_time=expiration_time)
# the releases are not in cache or cache is expired
if releases == NO_VALUE:
logger.info('Releases not found in cache')
# download archive
self.download_archive(a)
# extract the releases
releases = []
for name in a.content.namelist():
# discard the legendastv file
if name.startswith('Legendas.tv'):
continue
# discard hidden files
if os.path.split(name)[-1].startswith('.'):
continue
# discard non-subtitle files
if not name.lower().endswith(SUBTITLE_EXTENSIONS):
continue
releases.append(name)
# cache the releases
region.set(cache_key, releases)
# iterate over releases
for r in releases:
subtitle = self.subtitle_class(language, t['type'], t['title'], t.get('year'), t.get('imdb_id'),
t.get('season'), a, r)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
return subtitles
def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode):
titles = [video.series] + video.alternative_series
season = video.season
episode = video.episode
else:
titles = [video.title] + video.alternative_titles
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
return []
def download_subtitle(self, subtitle):
# download archive in case we previously hit the releases cache and didn't download it
if subtitle.archive.content is None:
self.download_archive(subtitle.archive)
# extract subtitle's content
subtitle.content = fix_line_ending(subtitle.archive.content.read(subtitle.name))
def raise_for_status(r):
# When the site is under maintenance, it responds with HTTP status code 200 and a maintenance message.
if 'Em breve estaremos de volta' in r.text:
raise ServiceUnavailable
else:
r.raise_for_status()

View File

@ -0,0 +1,108 @@
# -*- coding: utf-8 -*-
import logging
from babelfish import Language
from requests import Session
from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle
logger = logging.getLogger(__name__)
def get_subhash(hash):
"""Get a second hash based on napiprojekt's hash.
:param str hash: napiprojekt's hash.
:return: the subhash.
:rtype: str
"""
idx = [0xe, 0x3, 0x6, 0x8, 0x2]
mul = [2, 2, 5, 4, 3]
add = [0, 0xd, 0x10, 0xb, 0x5]
b = []
for i in range(len(idx)):
a = add[i]  # additive offset for this round
m = mul[i]  # multiplier for this round
i = idx[i]  # position in the hash holding the slice offset
t = a + int(hash[i], 16)  # position of the two-character slice to read
v = int(hash[t:t + 2], 16)  # hex value of that slice
b.append(('%x' % (v * m))[-1])  # keep the last hex digit of the product
return ''.join(b)
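# Illustrative example with a made-up hash (not part of the original file):
# >>> get_subhash('abcdef0123456789abcdef0123456789')
# '6a704'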
class NapiProjektSubtitle(Subtitle):
"""NapiProjekt Subtitle."""
provider_name = 'napiprojekt'
def __init__(self, language, hash):
super(NapiProjektSubtitle, self).__init__(language)
self.hash = hash
self.content = None
@property
def id(self):
return self.hash
def get_matches(self, video):
matches = set()
# hash
if 'napiprojekt' in video.hashes and video.hashes['napiprojekt'] == self.hash:
matches.add('hash')
return matches
class NapiProjektProvider(Provider):
"""NapiProjekt Provider."""
languages = {Language.fromalpha2(l) for l in ['pl']}
required_hash = 'napiprojekt'
server_url = 'http://napiprojekt.pl/unit_napisy/dl.php'
subtitle_class = NapiProjektSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
def terminate(self):
self.session.close()
def query(self, language, hash):
params = {
'v': 'dreambox',
'kolejka': 'false',
'nick': '',
'pass': '',
'napios': 'Linux',
'l': language.alpha2.upper(),
'f': hash,
't': get_subhash(hash)}
logger.info('Searching subtitle %r', params)
r = self.session.get(self.server_url, params=params, timeout=10)
r.raise_for_status()
# handle subtitles not found and errors
if r.content[:4] == b'NPc0':
logger.debug('No subtitles found')
return None
subtitle = self.subtitle_class(language, hash)
subtitle.content = r.content
logger.debug('Found subtitle %r', subtitle)
return subtitle
def list_subtitles(self, video, languages):
return [s for s in [self.query(l, video.hashes['napiprojekt']) for l in languages] if s is not None]
def download_subtitle(self, subtitle):
# there is no download step, content is already filled from listing subtitles
pass

View File

@ -0,0 +1,297 @@
# -*- coding: utf-8 -*-
import base64
import logging
import os
import re
import zlib
from babelfish import Language, language_converters
from guessit import guessit
from six.moves.xmlrpc_client import ServerProxy
from . import Provider, TimeoutSafeTransport
from .. import __short_version__
from ..exceptions import (AuthenticationError, ConfigurationError, DownloadLimitExceeded, ProviderError,
ServiceUnavailable)
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie
logger = logging.getLogger(__name__)
class OpenSubtitlesSubtitle(Subtitle):
"""OpenSubtitles Subtitle."""
provider_name = 'opensubtitles'
series_re = re.compile(r'^"(?P<series_name>.*)" (?P<series_title>.*)$')
def __init__(self, language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind, hash, movie_name,
movie_release_name, movie_year, movie_imdb_id, series_season, series_episode, filename, encoding):
super(OpenSubtitlesSubtitle, self).__init__(language, hearing_impaired=hearing_impaired,
page_link=page_link, encoding=encoding)
self.subtitle_id = subtitle_id
self.matched_by = matched_by
self.movie_kind = movie_kind
self.hash = hash
self.movie_name = movie_name
self.movie_release_name = movie_release_name
self.movie_year = movie_year
self.movie_imdb_id = movie_imdb_id
self.series_season = series_season
self.series_episode = series_episode
self.filename = filename
@property
def id(self):
return str(self.subtitle_id)
@property
def series_name(self):
return self.series_re.match(self.movie_name).group('series_name')
@property
def series_title(self):
return self.series_re.match(self.movie_name).group('series_title')
def get_matches(self, video):
matches = set()
# episode
if isinstance(video, Episode) and self.movie_kind == 'episode':
# tag match, assume series, year, season and episode match
if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
matches |= {'series', 'year', 'season', 'episode'}
# series
if video.series and sanitize(self.series_name) == sanitize(video.series):
matches.add('series')
# year
if (video.original_series and self.movie_year is None) or (video.year and video.year == self.movie_year):
matches.add('year')
# season
if video.season and self.series_season == video.season:
matches.add('season')
# episode
if video.episode and self.series_episode == video.episode:
matches.add('episode')
# title
if video.title and sanitize(self.series_title) == sanitize(video.title):
matches.add('title')
# guess
matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'episode'}))
matches |= guess_matches(video, guessit(self.filename, {'type': 'episode'}))
# hash
if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']:
if 'series' in matches and 'season' in matches and 'episode' in matches:
matches.add('hash')
else:
logger.debug('Match on hash discarded')
# movie
elif isinstance(video, Movie) and self.movie_kind == 'movie':
# tag match, assume title and year match
if self.matched_by == 'tag':
if not video.imdb_id or self.movie_imdb_id == video.imdb_id:
matches |= {'title', 'year'}
# title
if video.title and sanitize(self.movie_name) == sanitize(video.title):
matches.add('title')
# year
if video.year and self.movie_year == video.year:
matches.add('year')
# guess
matches |= guess_matches(video, guessit(self.movie_release_name, {'type': 'movie'}))
matches |= guess_matches(video, guessit(self.filename, {'type': 'movie'}))
# hash
if 'opensubtitles' in video.hashes and self.hash == video.hashes['opensubtitles']:
if 'title' in matches:
matches.add('hash')
else:
logger.debug('Match on hash discarded')
else:
logger.info('%r is not a valid movie_kind', self.movie_kind)
return matches
# imdb_id
if video.imdb_id and self.movie_imdb_id == video.imdb_id:
matches.add('imdb_id')
return matches
class OpenSubtitlesProvider(Provider):
"""OpenSubtitles Provider.
:param str username: username.
:param str password: password.
"""
languages = {Language.fromopensubtitles(l) for l in language_converters['opensubtitles'].codes}
subtitle_class = OpenSubtitlesSubtitle
def __init__(self, username=None, password=None):
self.server = ServerProxy('https://api.opensubtitles.org/xml-rpc', TimeoutSafeTransport(10))
if any((username, password)) and not all((username, password)):
raise ConfigurationError('Username and password must be specified')
# None values are not allowed for logging in, so replace them with ''
self.username = username or ''
self.password = password or ''
self.token = None
def initialize(self):
logger.info('Logging in')
response = checked(self.server.LogIn(self.username, self.password, 'eng',
'subliminal v%s' % __short_version__))
self.token = response['token']
logger.debug('Logged in with token %r', self.token)
def terminate(self):
logger.info('Logging out')
checked(self.server.LogOut(self.token))
self.server.close()
self.token = None
logger.debug('Logged out')
def no_operation(self):
logger.debug('No operation')
checked(self.server.NoOperation(self.token))
def query(self, languages, hash=None, size=None, imdb_id=None, query=None, season=None, episode=None, tag=None):
# fill the search criteria
criteria = []
if hash and size:
criteria.append({'moviehash': hash, 'moviebytesize': str(size)})
if imdb_id:
if season and episode:
criteria.append({'imdbid': imdb_id[2:], 'season': season, 'episode': episode})
else:
criteria.append({'imdbid': imdb_id[2:]})
if tag:
criteria.append({'tag': tag})
if query and season and episode:
criteria.append({'query': query.replace('\'', ''), 'season': season, 'episode': episode})
elif query:
criteria.append({'query': query.replace('\'', '')})
if not criteria:
raise ValueError('Not enough information')
# add the language
for criterion in criteria:
criterion['sublanguageid'] = ','.join(sorted(l.opensubtitles for l in languages))
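# Example of the criteria sent for an episode searched by hash and IMDB id
# (illustrative values, not from the original file):
#     [{'moviehash': '18379ac9af039390', 'moviebytesize': '366876694',
#       'sublanguageid': 'eng'},
#      {'imdbid': '0898266', 'season': 1, 'episode': 1, 'sublanguageid': 'eng'}]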
# query the server
logger.info('Searching subtitles %r', criteria)
response = checked(self.server.SearchSubtitles(self.token, criteria))
subtitles = []
# exit if no data
if not response['data']:
logger.debug('No subtitles found')
return subtitles
# loop over subtitle items
for subtitle_item in response['data']:
# read the item
language = Language.fromopensubtitles(subtitle_item['SubLanguageID'])
hearing_impaired = bool(int(subtitle_item['SubHearingImpaired']))
page_link = subtitle_item['SubtitlesLink']
subtitle_id = int(subtitle_item['IDSubtitleFile'])
matched_by = subtitle_item['MatchedBy']
movie_kind = subtitle_item['MovieKind']
hash = subtitle_item['MovieHash']
movie_name = subtitle_item['MovieName']
movie_release_name = subtitle_item['MovieReleaseName']
movie_year = int(subtitle_item['MovieYear']) if subtitle_item['MovieYear'] else None
movie_imdb_id = 'tt' + subtitle_item['IDMovieImdb']
series_season = int(subtitle_item['SeriesSeason']) if subtitle_item['SeriesSeason'] else None
series_episode = int(subtitle_item['SeriesEpisode']) if subtitle_item['SeriesEpisode'] else None
filename = subtitle_item['SubFileName']
encoding = subtitle_item.get('SubEncoding') or None
subtitle = self.subtitle_class(language, hearing_impaired, page_link, subtitle_id, matched_by, movie_kind,
hash, movie_name, movie_release_name, movie_year, movie_imdb_id,
series_season, series_episode, filename, encoding)
logger.debug('Found subtitle %r by %s', subtitle, matched_by)
subtitles.append(subtitle)
return subtitles
def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode):
query = video.series
season = video.season
episode = video.episode
else:
query = video.title
return self.query(languages, hash=video.hashes.get('opensubtitles'), size=video.size, imdb_id=video.imdb_id,
query=query, season=season, episode=episode, tag=os.path.basename(video.name))
def download_subtitle(self, subtitle):
logger.info('Downloading subtitle %r', subtitle)
response = checked(self.server.DownloadSubtitles(self.token, [str(subtitle.subtitle_id)]))
subtitle.content = fix_line_ending(zlib.decompress(base64.b64decode(response['data'][0]['data']), 47))
class OpenSubtitlesError(ProviderError):
"""Base class for non-generic :class:`OpenSubtitlesProvider` exceptions."""
pass
class Unauthorized(OpenSubtitlesError, AuthenticationError):
"""Exception raised when status is '401 Unauthorized'."""
pass
class NoSession(OpenSubtitlesError, AuthenticationError):
"""Exception raised when status is '406 No session'."""
pass
class DownloadLimitReached(OpenSubtitlesError, DownloadLimitExceeded):
"""Exception raised when status is '407 Download limit reached'."""
pass
class InvalidImdbid(OpenSubtitlesError):
"""Exception raised when status is '413 Invalid ImdbID'."""
pass
class UnknownUserAgent(OpenSubtitlesError, AuthenticationError):
"""Exception raised when status is '414 Unknown User Agent'."""
pass
class DisabledUserAgent(OpenSubtitlesError, AuthenticationError):
"""Exception raised when status is '415 Disabled user agent'."""
pass
def checked(response):
"""Check a response status before returning it.
:param response: a response from an XMLRPC call to OpenSubtitles.
:return: the response.
:raise: :class:`OpenSubtitlesError`
"""
status_code = int(response['status'][:3])
if status_code == 401:
raise Unauthorized
if status_code == 406:
raise NoSession
if status_code == 407:
raise DownloadLimitReached
if status_code == 413:
raise InvalidImdbid
if status_code == 414:
raise UnknownUserAgent
if status_code == 415:
raise DisabledUserAgent
if status_code == 503:
raise ServiceUnavailable
if status_code != 200:
raise OpenSubtitlesError(response['status'])
return response
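# Usage note (illustrative, not part of the original file): every XMLRPC
# response goes through checked(), e.g.
#     response = checked(self.server.NoOperation(self.token))
# so a '407 Download limit reached' status surfaces as DownloadLimitReached.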

View File

@ -0,0 +1,197 @@
# -*- coding: utf-8 -*-
import io
import logging
import re
from babelfish import Language, language_converters
from guessit import guessit
try:
from lxml import etree
except ImportError:
try:
import xml.etree.cElementTree as etree
except ImportError:
import xml.etree.ElementTree as etree
from requests import Session
from zipfile import ZipFile
from . import Provider
from .. import __short_version__
from ..exceptions import ProviderError
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie
logger = logging.getLogger(__name__)
class PodnapisiSubtitle(Subtitle):
"""Podnapisi Subtitle."""
provider_name = 'podnapisi'
def __init__(self, language, hearing_impaired, page_link, pid, releases, title, season=None, episode=None,
year=None):
super(PodnapisiSubtitle, self).__init__(language, hearing_impaired=hearing_impaired, page_link=page_link)
self.pid = pid
self.releases = releases
self.title = title
self.season = season
self.episode = episode
self.year = year
@property
def id(self):
return self.pid
def get_matches(self, video):
matches = set()
# episode
if isinstance(video, Episode):
# series
if video.series and (sanitize(self.title) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
matches.add('series')
# year
if (video.original_series and self.year is None) or (video.year and video.year == self.year):
matches.add('year')
# season
if video.season and self.season == video.season:
matches.add('season')
# episode
if video.episode and self.episode == video.episode:
matches.add('episode')
# guess
for release in self.releases:
matches |= guess_matches(video, guessit(release, {'type': 'episode'}))
# movie
elif isinstance(video, Movie):
# title
if video.title and (sanitize(self.title) in (
sanitize(name) for name in [video.title] + video.alternative_titles)):
matches.add('title')
# year
if video.year and self.year == video.year:
matches.add('year')
# guess
for release in self.releases:
matches |= guess_matches(video, guessit(release, {'type': 'movie'}))
return matches
class PodnapisiProvider(Provider):
"""Podnapisi Provider."""
languages = ({Language('por', 'BR'), Language('srp', script='Latn')} |
{Language.fromalpha2(l) for l in language_converters['alpha2'].codes})
server_url = 'https://www.podnapisi.net/subtitles/'
subtitle_class = PodnapisiSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
def terminate(self):
self.session.close()
def query(self, language, keyword, season=None, episode=None, year=None):
# set parameters, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164#p212652
params = {'sXML': 1, 'sL': str(language), 'sK': keyword}
is_episode = False
if season and episode:
is_episode = True
params['sTS'] = season
params['sTE'] = episode
if year:
params['sY'] = year
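# Example parameters for an episode search (illustrative, not from the
# original file):
#     {'sXML': 1, 'sL': 'en', 'sK': 'Lost', 'sTS': 4, 'sTE': 8, 'sY': 2004}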
# loop over paginated results
logger.info('Searching subtitles %r', params)
subtitles = []
pids = set()
while True:
# query the server
r = self.session.get(self.server_url + 'search/old', params=params, timeout=10)
r.raise_for_status()
xml = etree.fromstring(r.content)
# exit if no results
if not int(xml.find('pagination/results').text):
logger.debug('No subtitles found')
break
# loop over subtitles
for subtitle_xml in xml.findall('subtitle'):
# read xml elements
pid = subtitle_xml.find('pid').text
# ignore duplicates, see http://www.podnapisi.net/forum/viewtopic.php?f=62&t=26164&start=10#p213321
if pid in pids:
continue
language = Language.fromietf(subtitle_xml.find('language').text)
hearing_impaired = 'n' in (subtitle_xml.find('flags').text or '')
page_link = subtitle_xml.find('url').text
releases = []
if subtitle_xml.find('release').text:
for release in subtitle_xml.find('release').text.split():
release = re.sub(r'\.+$', '', release) # remove trailing dots
release = ''.join(filter(lambda x: ord(x) < 128, release)) # remove non-ascii characters
releases.append(release)
title = subtitle_xml.find('title').text
season = int(subtitle_xml.find('tvSeason').text)
episode = int(subtitle_xml.find('tvEpisode').text)
year = int(subtitle_xml.find('year').text)
if is_episode:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
season=season, episode=episode, year=year)
else:
subtitle = self.subtitle_class(language, hearing_impaired, page_link, pid, releases, title,
year=year)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
pids.add(pid)
# stop on last page
if int(xml.find('pagination/current').text) >= int(xml.find('pagination/count').text):
break
# increment current page
params['page'] = int(xml.find('pagination/current').text) + 1
logger.debug('Getting page %d', params['page'])
return subtitles
def list_subtitles(self, video, languages):
season = episode = None
if isinstance(video, Episode):
titles = [video.series] + video.alternative_series
season = video.season
episode = video.episode
else:
titles = [video.title] + video.alternative_titles
for title in titles:
subtitles = [s for l in languages for s in
self.query(l, title, season=season, episode=episode, year=video.year)]
if subtitles:
return subtitles
return []
def download_subtitle(self, subtitle):
# download as a zip
logger.info('Downloading subtitle %r', subtitle)
r = self.session.get(self.server_url + subtitle.pid + '/download', params={'container': 'zip'}, timeout=10)
r.raise_for_status()
# open the zip
with ZipFile(io.BytesIO(r.content)) as zf:
if len(zf.namelist()) > 1:
raise ProviderError('More than one file to unzip')
subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))

View File

@ -0,0 +1,83 @@
# -*- coding: utf-8 -*-
import json
import logging
import os
from babelfish import Language, language_converters
from requests import Session
from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle, fix_line_ending
logger = logging.getLogger(__name__)
language_converters.register('shooter = subliminal.converters.shooter:ShooterConverter')
class ShooterSubtitle(Subtitle):
"""Shooter Subtitle."""
provider_name = 'shooter'
def __init__(self, language, hash, download_link):
super(ShooterSubtitle, self).__init__(language)
self.hash = hash
self.download_link = download_link
@property
def id(self):
return self.download_link
def get_matches(self, video):
matches = set()
# hash
if 'shooter' in video.hashes and video.hashes['shooter'] == self.hash:
matches.add('hash')
return matches
class ShooterProvider(Provider):
"""Shooter Provider."""
languages = {Language(l) for l in ['eng', 'zho']}
server_url = 'https://www.shooter.cn/api/subapi.php'
subtitle_class = ShooterSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
def terminate(self):
self.session.close()
def query(self, language, filename, hash=None):
# query the server
params = {'filehash': hash, 'pathinfo': os.path.realpath(filename), 'format': 'json', 'lang': language.shooter}
logger.debug('Searching subtitles %r', params)
r = self.session.post(self.server_url, params=params, timeout=10)
r.raise_for_status()
# handle subtitles not found
if r.content == b'\xff':
logger.debug('No subtitles found')
return []
# parse the subtitles
results = json.loads(r.text)
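# Example response shape (illustrative, not from the original file); only the
# 'Files' list and each file's 'Link' are used below:
#     [{'Files': [{'Link': 'https://www.shooter.cn/api/...'}]}]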
subtitles = [self.subtitle_class(language, hash, t['Link']) for s in results for t in s['Files']]
return subtitles
def list_subtitles(self, video, languages):
return [s for l in languages for s in self.query(l, video.name, video.hashes.get('shooter'))]
def download_subtitle(self, subtitle):
logger.info('Downloading subtitle %r', subtitle)
r = self.session.get(subtitle.download_link, timeout=10)
r.raise_for_status()
subtitle.content = fix_line_ending(r.content)

View File

@ -0,0 +1,243 @@
# -*- coding: utf-8 -*-
import bisect
from collections import defaultdict
import io
import json
import logging
import zipfile
from babelfish import Language
from guessit import guessit
from requests import Session
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import SHOW_EXPIRATION_TIME, region
from ..exceptions import AuthenticationError, ConfigurationError, ProviderError
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize
from ..video import Episode, Movie
logger = logging.getLogger(__name__)
class SubsCenterSubtitle(Subtitle):
"""SubsCenter Subtitle."""
provider_name = 'subscenter'
def __init__(self, language, hearing_impaired, page_link, series, season, episode, title, subtitle_id, subtitle_key,
subtitle_version, downloaded, releases):
super(SubsCenterSubtitle, self).__init__(language, hearing_impaired, page_link)
self.series = series
self.season = season
self.episode = episode
self.title = title
self.subtitle_id = subtitle_id
self.subtitle_key = subtitle_key
self.subtitle_version = subtitle_version
self.downloaded = downloaded
self.releases = releases
@property
def id(self):
return str(self.subtitle_id)
def get_matches(self, video):
matches = set()
# episode
if isinstance(video, Episode):
# series
if video.series and sanitize(self.series) == sanitize(video.series):
matches.add('series')
# season
if video.season and self.season == video.season:
matches.add('season')
# episode
if video.episode and self.episode == video.episode:
matches.add('episode')
# guess
for release in self.releases:
matches |= guess_matches(video, guessit(release, {'type': 'episode'}))
# movie
elif isinstance(video, Movie):
# guess
for release in self.releases:
matches |= guess_matches(video, guessit(release, {'type': 'movie'}))
# title
if video.title and sanitize(self.title) == sanitize(video.title):
matches.add('title')
return matches
class SubsCenterProvider(Provider):
"""SubsCenter Provider."""
languages = {Language.fromalpha2(l) for l in ['he']}
server_url = 'http://www.subscenter.org/he/'
subtitle_class = SubsCenterSubtitle
def __init__(self, username=None, password=None):
if (username is not None and password is None) or (username is None and password is not None):
raise ConfigurationError('Username and password must be specified')
self.session = None
self.username = username
self.password = password
self.logged_in = False
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/{}'.format(__short_version__)
# login
if self.username is not None and self.password is not None:
logger.debug('Logging in')
url = self.server_url + 'subscenter/accounts/login/'
# retrieve CSRF token
self.session.get(url)
csrf_token = self.session.cookies['csrftoken']
# actual login
data = {'username': self.username, 'password': self.password, 'csrfmiddlewaretoken': csrf_token}
r = self.session.post(url, data, allow_redirects=False, timeout=10)
if r.status_code != 302:
raise AuthenticationError(self.username)
logger.info('Logged in')
self.logged_in = True
def terminate(self):
# logout
if self.logged_in:
logger.info('Logging out')
r = self.session.get(self.server_url + 'subscenter/accounts/logout/', timeout=10)
r.raise_for_status()
logger.info('Logged out')
self.logged_in = False
self.session.close()
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def _search_url_titles(self, title):
"""Search the URL titles by kind for the given `title`.
:param str title: title to search for.
:return: the URL titles by kind.
:rtype: collections.defaultdict
"""
# make the search
logger.info('Searching title name for %r', title)
r = self.session.get(self.server_url + 'subtitle/search/', params={'q': title}, timeout=10)
r.raise_for_status()
# check for redirections
if r.history and all([h.status_code == 302 for h in r.history]):
logger.debug('Redirected to the subtitles page')
links = [r.url]
else:
# get the suggestions (if needed)
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
links = [link.attrs['href'] for link in soup.select('#processes div.generalWindowTop a')]
logger.debug('Found %d suggestions', len(links))
url_titles = defaultdict(list)
for link in links:
parts = link.split('/')
url_titles[parts[-3]].append(parts[-2])
return url_titles
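# Example shape of the returned mapping (illustrative slugs, not from the
# original file):
#     defaultdict(list, {'series': ['the-big-bang-theory'],
#                        'movie': ['the-dark-knight']})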
def query(self, title, season=None, episode=None):
# search for the url title
url_titles = self._search_url_titles(title)
# episode
if season and episode:
if 'series' not in url_titles:
logger.error('No URL title found for series %r', title)
return []
url_title = url_titles['series'][0]
logger.debug('Using series title %r', url_title)
url = self.server_url + 'cst/data/series/sb/{}/{}/{}/'.format(url_title, season, episode)
page_link = self.server_url + 'subtitle/series/{}/{}/{}/'.format(url_title, season, episode)
else:
if 'movie' not in url_titles:
logger.error('No URL title found for movie %r', title)
return []
url_title = url_titles['movie'][0]
logger.debug('Using movie title %r', url_title)
url = self.server_url + 'cst/data/movie/sb/{}/'.format(url_title)
page_link = self.server_url + 'subtitle/movie/{}/'.format(url_title)
# get the list of subtitles
logger.debug('Getting the list of subtitles')
r = self.session.get(url)
r.raise_for_status()
results = json.loads(r.text)
# loop over results
subtitles = {}
for language_code, language_data in results.items():
for quality_data in language_data.values():
for quality, subtitles_data in quality_data.items():
for subtitle_item in subtitles_data.values():
# read the item
language = Language.fromalpha2(language_code)
hearing_impaired = bool(subtitle_item['hearing_impaired'])
subtitle_id = subtitle_item['id']
subtitle_key = subtitle_item['key']
subtitle_version = subtitle_item['h_version']
downloaded = subtitle_item['downloaded']
release = subtitle_item['subtitle_version']
# add the release and increment downloaded count if we already have the subtitle
if subtitle_id in subtitles:
logger.debug('Found additional release %r for subtitle %d', release, subtitle_id)
bisect.insort_left(subtitles[subtitle_id].releases, release) # deterministic order
subtitles[subtitle_id].downloaded += downloaded
continue
# otherwise create it
subtitle = self.subtitle_class(language, hearing_impaired, page_link, title, season, episode,
title, subtitle_id, subtitle_key, subtitle_version, downloaded,
[release])
logger.debug('Found subtitle %r', subtitle)
subtitles[subtitle_id] = subtitle
return subtitles.values()
def list_subtitles(self, video, languages):
season = episode = None
title = video.title
if isinstance(video, Episode):
title = video.series
season = video.season
episode = video.episode
return [s for s in self.query(title, season, episode) if s.language in languages]
def download_subtitle(self, subtitle):
# download
url = self.server_url + 'subtitle/download/{}/{}/'.format(subtitle.language.alpha2, subtitle.subtitle_id)
params = {'v': subtitle.subtitle_version, 'key': subtitle.subtitle_key}
r = self.session.get(url, params=params, headers={'Referer': subtitle.page_link}, timeout=10)
r.raise_for_status()
# open the zip
try:
with zipfile.ZipFile(io.BytesIO(r.content)) as zf:
# remove some filenames from the namelist
namelist = [n for n in zf.namelist() if not n.endswith('.txt')]
if len(namelist) > 1:
raise ProviderError('More than one file to unzip')
subtitle.content = fix_line_ending(zf.read(namelist[0]))
except zipfile.BadZipfile:
# if no zip file was retrieved, the daily download limit has been exceeded
raise ProviderError('Daily limit exceeded')

View File

@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
import logging
from babelfish import Language, language_converters
from requests import Session
from . import Provider
from .. import __short_version__
from ..subtitle import Subtitle, fix_line_ending
logger = logging.getLogger(__name__)
language_converters.register('thesubdb = subliminal.converters.thesubdb:TheSubDBConverter')
class TheSubDBSubtitle(Subtitle):
"""TheSubDB Subtitle."""
provider_name = 'thesubdb'
def __init__(self, language, hash):
super(TheSubDBSubtitle, self).__init__(language)
self.hash = hash
@property
def id(self):
return self.hash + '-' + str(self.language)
def get_matches(self, video):
matches = set()
# hash
if 'thesubdb' in video.hashes and video.hashes['thesubdb'] == self.hash:
matches.add('hash')
return matches
class TheSubDBProvider(Provider):
"""TheSubDB Provider."""
languages = {Language.fromthesubdb(l) for l in language_converters['thesubdb'].codes}
required_hash = 'thesubdb'
server_url = 'http://api.thesubdb.com/'
subtitle_class = TheSubDBSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = ('SubDB/1.0 (subliminal/%s; https://github.com/Diaoul/subliminal)' %
__short_version__)
def terminate(self):
self.session.close()
def query(self, hash):
# make the query
params = {'action': 'search', 'hash': hash}
logger.info('Searching subtitles %r', params)
r = self.session.get(self.server_url, params=params, timeout=10)
# handle subtitles not found and errors
if r.status_code == 404:
logger.debug('No subtitles found')
return []
r.raise_for_status()
# loop over languages
subtitles = []
for language_code in r.text.split(','):
language = Language.fromthesubdb(language_code)
subtitle = self.subtitle_class(language, hash)
logger.debug('Found subtitle %r', subtitle)
subtitles.append(subtitle)
return subtitles
def list_subtitles(self, video, languages):
return [s for s in self.query(video.hashes['thesubdb']) if s.language in languages]
def download_subtitle(self, subtitle):
logger.info('Downloading subtitle %r', subtitle)
params = {'action': 'download', 'hash': subtitle.hash, 'language': subtitle.language.alpha2}
r = self.session.get(self.server_url, params=params, timeout=10)
r.raise_for_status()
subtitle.content = fix_line_ending(r.content)

View File

@ -0,0 +1,226 @@
# -*- coding: utf-8 -*-
import io
import logging
import re
from zipfile import ZipFile
from babelfish import Language, language_converters
from guessit import guessit
from requests import Session
from . import ParserBeautifulSoup, Provider
from .. import __short_version__
from ..cache import EPISODE_EXPIRATION_TIME, SHOW_EXPIRATION_TIME, region
from ..exceptions import ProviderError
from ..score import get_equivalent_release_groups
from ..subtitle import Subtitle, fix_line_ending, guess_matches
from ..utils import sanitize, sanitize_release_group
from ..video import Episode
logger = logging.getLogger(__name__)
language_converters.register('tvsubtitles = subliminal.converters.tvsubtitles:TVsubtitlesConverter')
link_re = re.compile(r'^(?P<series>.+?)(?: \(?\d{4}\)?| \((?:US|UK)\))? \((?P<first_year>\d{4})-\d{4}\)$')
episode_id_re = re.compile(r'^episode-\d+\.html$')
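# Illustrative example (not part of the original file) of the show link regex:
# >>> link_re.match('Lost (2004-2010)').group('series', 'first_year')
# ('Lost', '2004')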
class TVsubtitlesSubtitle(Subtitle):
"""TVsubtitles Subtitle."""
provider_name = 'tvsubtitles'
def __init__(self, language, page_link, subtitle_id, series, season, episode, year, rip, release):
super(TVsubtitlesSubtitle, self).__init__(language, page_link=page_link)
self.subtitle_id = subtitle_id
self.series = series
self.season = season
self.episode = episode
self.year = year
self.rip = rip
self.release = release
@property
def id(self):
return str(self.subtitle_id)
def get_matches(self, video):
matches = set()
# series
if video.series and (sanitize(self.series) in (
sanitize(name) for name in [video.series] + video.alternative_series)):
matches.add('series')
# season
if video.season and self.season == video.season:
matches.add('season')
# episode
if video.episode and self.episode == video.episode:
matches.add('episode')
# year
if (video.original_series and self.year is None) or (video.year and video.year == self.year):
matches.add('year')
# release_group
if (video.release_group and self.release and
any(r in sanitize_release_group(self.release)
for r in get_equivalent_release_groups(sanitize_release_group(video.release_group)))):
matches.add('release_group')
# other properties
if self.release:
matches |= guess_matches(video, guessit(self.release, {'type': 'episode'}), partial=True)
if self.rip:
matches |= guess_matches(video, guessit(self.rip), partial=True)
return matches
class TVsubtitlesProvider(Provider):
"""TVsubtitles Provider."""
languages = {Language('por', 'BR')} | {Language(l) for l in [
'ara', 'bul', 'ces', 'dan', 'deu', 'ell', 'eng', 'fin', 'fra', 'hun', 'ita', 'jpn', 'kor', 'nld', 'pol', 'por',
'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho'
]}
video_types = (Episode,)
server_url = 'http://www.tvsubtitles.net/'
subtitle_class = TVsubtitlesSubtitle
def __init__(self):
self.session = None
def initialize(self):
self.session = Session()
self.session.headers['User-Agent'] = 'Subliminal/%s' % __short_version__
def terminate(self):
self.session.close()
@region.cache_on_arguments(expiration_time=SHOW_EXPIRATION_TIME)
def search_show_id(self, series, year=None):
"""Search the show id from the `series` and `year`.
:param str series: series of the episode.
:param year: year of the series, if any.
:type year: int
:return: the show id, if any.
:rtype: int
"""
# make the search
logger.info('Searching show id for %r', series)
r = self.session.post(self.server_url + 'search.php', data={'q': series}, timeout=10)
r.raise_for_status()
# get the series out of the suggestions
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
show_id = None
for suggestion in soup.select('div.left li div a[href^="/tvshow-"]'):
match = link_re.match(suggestion.text)
if not match:
logger.error('Failed to match %s', suggestion.text)
continue
if match.group('series').lower() == series.lower():
if year is not None and int(match.group('first_year')) != year:
logger.debug('Year does not match')
continue
show_id = int(suggestion['href'][8:-5])
logger.debug('Found show id %d', show_id)
break
return show_id
@region.cache_on_arguments(expiration_time=EPISODE_EXPIRATION_TIME)
def get_episode_ids(self, show_id, season):
"""Get episode ids from the show id and the season.
:param int show_id: show id.
:param int season: season of the episode.
:return: episode ids per episode number.
:rtype: dict
"""
# get the page of the season of the show
logger.info('Getting the page of show id %d, season %d', show_id, season)
r = self.session.get(self.server_url + 'tvshow-%d-%d.html' % (show_id, season), timeout=10)
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# loop over episode rows
episode_ids = {}
for row in soup.select('table#table5 tr'):
# skip rows that do not have a link to the episode page
if not row('a', href=episode_id_re):
continue
# extract data from the cells
cells = row('td')
episode = int(cells[0].text.split('x')[1])
episode_id = int(cells[1].a['href'][8:-5])
episode_ids[episode] = episode_id
if episode_ids:
logger.debug('Found episode ids %r', episode_ids)
else:
logger.warning('No episode ids found')
return episode_ids
def query(self, show_id, series, season, episode, year=None):
# get the episode ids
episode_ids = self.get_episode_ids(show_id, season)
if episode not in episode_ids:
logger.error('Episode %d not found', episode)
return []
# get the episode page
logger.info('Getting the page for episode %d', episode_ids[episode])
r = self.session.get(self.server_url + 'episode-%d.html' % episode_ids[episode], timeout=10)
soup = ParserBeautifulSoup(r.content, ['lxml', 'html.parser'])
# loop over subtitles rows
subtitles = []
for row in soup.select('.subtitlen'):
# read the item
language = Language.fromtvsubtitles(row.h5.img['src'][13:-4])
subtitle_id = int(row.parent['href'][10:-5])
page_link = self.server_url + 'subtitle-%d.html' % subtitle_id
rip = row.find('p', title='rip').text.strip() or None
release = row.find('h5').text.strip() or None
subtitle = self.subtitle_class(language, page_link, subtitle_id, series, season, episode, year, rip,
release)
logger.debug('Found subtitle %s', subtitle)
subtitles.append(subtitle)
return subtitles
def list_subtitles(self, video, languages):
# lookup show_id
titles = [video.series] + video.alternative_series
show_id = None
for title in titles:
show_id = self.search_show_id(title, video.year)
if show_id is not None:
break
# query for subtitles with the show_id
if show_id is not None:
subtitles = [s for s in self.query(show_id, title, video.season, video.episode, video.year)
if s.language in languages and s.episode == video.episode]
if subtitles:
return subtitles
else:
logger.error('No show id found for %r (%r)', video.series, {'year': video.year})
return []
def download_subtitle(self, subtitle):
# download as a zip
logger.info('Downloading subtitle %r', subtitle)
r = self.session.get(self.server_url + 'download-%d.html' % subtitle.subtitle_id, timeout=10)
r.raise_for_status()
# open the zip
with ZipFile(io.BytesIO(r.content)) as zf:
if len(zf.namelist()) > 1:
raise ProviderError('More than one file to unzip')
subtitle.content = fix_line_ending(zf.read(zf.namelist()[0]))
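# A hedged usage sketch for the provider above (an editor's illustration, not
# part of the vendored module); the release name below is hypothetical.
# Episode and Language are already imported at the top of this module.
if __name__ == '__main__':
    video = Episode.fromname('The.Big.Bang.Theory.S07E05.720p.HDTV.x264-DIMENSION.mkv')
    provider = TVsubtitlesProvider()
    provider.initialize()
    try:
        for subtitle in provider.list_subtitles(video, {Language('eng')}):
            print(subtitle.id, subtitle.language, subtitle.release)
    finally:
        provider.terminate()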

View File

@ -0,0 +1,12 @@
"""
Refiners enrich a :class:`~subliminal.video.Video` object by adding information to it.
A refiner is a simple function:
.. py:function:: refine(video, **kwargs)
:param video: the video to refine.
:type video: :class:`~subliminal.video.Video`
:param \*\*kwargs: additional parameters for refiners.
"""

View File

@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
import logging
import os
from babelfish import Error as BabelfishError, Language
from enzyme import MKV
logger = logging.getLogger(__name__)
def refine(video, embedded_subtitles=True, **kwargs):
"""Refine a video by searching its metadata.
Several :class:`~subliminal.video.Video` attributes can be found:
* :attr:`~subliminal.video.Video.resolution`
* :attr:`~subliminal.video.Video.video_codec`
* :attr:`~subliminal.video.Video.audio_codec`
* :attr:`~subliminal.video.Video.subtitle_languages`
:param bool embedded_subtitles: search for embedded subtitles.
"""
# skip non-existing videos
if not video.exists:
return
# check extensions
extension = os.path.splitext(video.name)[1]
if extension == '.mkv':
with open(video.name, 'rb') as f:
mkv = MKV(f)
# main video track
if mkv.video_tracks:
video_track = mkv.video_tracks[0]
# resolution
if video_track.height in (480, 720, 1080):
if video_track.interlaced:
video.resolution = '%di' % video_track.height
else:
video.resolution = '%dp' % video_track.height
logger.debug('Found resolution %s', video.resolution)
# video codec
if video_track.codec_id == 'V_MPEG4/ISO/AVC':
video.video_codec = 'h264'
logger.debug('Found video_codec %s', video.video_codec)
elif video_track.codec_id == 'V_MPEG4/ISO/SP':
video.video_codec = 'DivX'
logger.debug('Found video_codec %s', video.video_codec)
elif video_track.codec_id == 'V_MPEG4/ISO/ASP':
video.video_codec = 'XviD'
logger.debug('Found video_codec %s', video.video_codec)
else:
logger.warning('MKV has no video track')
# main audio track
if mkv.audio_tracks:
audio_track = mkv.audio_tracks[0]
# audio codec
if audio_track.codec_id == 'A_AC3':
video.audio_codec = 'AC3'
logger.debug('Found audio_codec %s', video.audio_codec)
elif audio_track.codec_id == 'A_DTS':
video.audio_codec = 'DTS'
logger.debug('Found audio_codec %s', video.audio_codec)
elif audio_track.codec_id == 'A_AAC':
video.audio_codec = 'AAC'
logger.debug('Found audio_codec %s', video.audio_codec)
else:
logger.warning('MKV has no audio track')
# subtitle tracks
if mkv.subtitle_tracks:
if embedded_subtitles:
embedded_subtitle_languages = set()
for st in mkv.subtitle_tracks:
if st.language:
try:
embedded_subtitle_languages.add(Language.fromalpha3b(st.language))
except BabelfishError:
logger.error('Embedded subtitle track language %r is not a valid language', st.language)
embedded_subtitle_languages.add(Language('und'))
elif st.name:
try:
embedded_subtitle_languages.add(Language.fromname(st.name))
except BabelfishError:
logger.debug('Embedded subtitle track name %r is not a valid language', st.name)
embedded_subtitle_languages.add(Language('und'))
else:
embedded_subtitle_languages.add(Language('und'))
logger.debug('Found embedded subtitle %r', embedded_subtitle_languages)
video.subtitle_languages |= embedded_subtitle_languages
else:
logger.debug('MKV has no subtitle track')
else:
logger.debug('Unsupported video extension %s', extension)
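# A hedged usage sketch for the refiner above (an editor's illustration); the
# release name is hypothetical and must point to an existing MKV file for
# anything to be refined.
if __name__ == '__main__':
    from subliminal.video import Video  # assumes the package imports as 'subliminal'

    video = Video.fromname('Show.S01E01.720p.HDTV.x264-GROUP.mkv')
    refine(video, embedded_subtitles=True)
    print(video.resolution, video.video_codec, video.audio_codec, video.subtitle_languages)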

View File

@ -0,0 +1,187 @@
# -*- coding: utf-8 -*-
import logging
import operator
import requests
from .. import __short_version__
from ..cache import REFINER_EXPIRATION_TIME, region
from ..video import Episode, Movie
from ..utils import sanitize
logger = logging.getLogger(__name__)
class OMDBClient(object):
base_url = 'http://www.omdbapi.com'
def __init__(self, version=1, session=None, headers=None, timeout=10):
#: Session for the requests
self.session = session or requests.Session()
self.session.timeout = timeout
self.session.headers.update(headers or {})
self.session.params['r'] = 'json'
self.session.params['v'] = version
def get(self, id=None, title=None, type=None, year=None, plot='short', tomatoes=False):
# build the params
params = {}
if id:
params['i'] = id
if title:
params['t'] = title
if not params:
raise ValueError('At least id or title is required')
params['type'] = type
params['y'] = year
params['plot'] = plot
params['tomatoes'] = tomatoes
# perform the request
r = self.session.get(self.base_url, params=params)
r.raise_for_status()
# get the response as json
j = r.json()
# check response status
if j['Response'] == 'False':
return None
return j
def search(self, title, type=None, year=None, page=1):
# build the params
params = {'s': title, 'type': type, 'y': year, 'page': page}
# perform the request
r = self.session.get(self.base_url, params=params)
r.raise_for_status()
# get the response as json
j = r.json()
# check response status
if j['Response'] == 'False':
return None
return j
omdb_client = OMDBClient(headers={'User-Agent': 'Subliminal/%s' % __short_version__})
@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def search(title, type, year):
results = omdb_client.search(title, type, year)
if not results:
return None
# fetch all paginated results
all_results = results['Search']
total_results = int(results['totalResults'])
page = 1
while total_results > page * 10:
page += 1
results = omdb_client.search(title, type, year, page=page)
all_results.extend(results['Search'])
return all_results
def refine(video, **kwargs):
"""Refine a video by searching `OMDb API <http://omdbapi.com/>`_.
Several :class:`~subliminal.video.Episode` attributes can be found:
* :attr:`~subliminal.video.Episode.series`
* :attr:`~subliminal.video.Episode.year`
* :attr:`~subliminal.video.Episode.series_imdb_id`
Similarly, for a :class:`~subliminal.video.Movie`:
* :attr:`~subliminal.video.Movie.title`
* :attr:`~subliminal.video.Movie.year`
* :attr:`~subliminal.video.Video.imdb_id`
"""
if isinstance(video, Episode):
# exit if the information is complete
if video.series_imdb_id:
logger.debug('No need to search')
return
# search the series
results = search(video.series, 'series', video.year)
if not results:
logger.warning('No results for series')
return
logger.debug('Found %d results', len(results))
# filter the results
results = [r for r in results if sanitize(r['Title']) == sanitize(video.series)]
if not results:
logger.warning('No matching series found')
return
# process the results
found = False
for result in sorted(results, key=operator.itemgetter('Year')):
if video.original_series and video.year is None:
logger.debug('Found result for original series without year')
found = True
break
if video.year == int(result['Year'].split(u'\u2013')[0]):
logger.debug('Found result with matching year')
found = True
break
if not found:
logger.warning('No matching series found')
return
# add series information
logger.debug('Found series %r', result)
video.series = result['Title']
video.year = int(result['Year'].split(u'\u2013')[0])
video.series_imdb_id = result['imdbID']
elif isinstance(video, Movie):
# exit if the information is complete
if video.imdb_id:
return
# search the movie
results = search(video.title, 'movie', video.year)
if not results:
logger.warning('No results')
return
logger.debug('Found %d results', len(results))
# filter the results
results = [r for r in results if sanitize(r['Title']) == sanitize(video.title)]
if not results:
logger.warning('No matching movie found')
return
# process the results
found = False
for result in results:
if video.year is None:
logger.debug('Found result for movie without year')
found = True
break
if video.year == int(result['Year']):
logger.debug('Found result with matching year')
found = True
break
if not found:
logger.warning('No matching movie found')
return
# add movie information
logger.debug('Found movie %r', result)
video.title = result['Title']
video.year = int(result['Year'].split(u'\u2013')[0])
video.imdb_id = result['imdbID']
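# A hedged usage sketch for the refiner above (an editor's illustration; the
# release name is hypothetical and a live call to the OMDb API is implied).
if __name__ == '__main__':
    # Movie is already imported at the top of this module
    video = Movie.fromname('Man.of.Steel.2013.720p.BluRay.x264-Felony.mkv')
    refine(video)  # fills title, year and imdb_id when a single match is found
    print(video.title, video.year, video.imdb_id)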

View File

@ -0,0 +1,351 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timedelta
from functools import wraps
import logging
import re
import _strptime  # imported up front because datetime.strptime is not thread-safe on first use (Python issue 7980)
import requests
from .. import __short_version__
from ..cache import REFINER_EXPIRATION_TIME, region
from ..utils import sanitize
from ..video import Episode
logger = logging.getLogger(__name__)
series_re = re.compile(r'^(?P<series>.*?)(?: \((?:(?P<year>\d{4})|(?P<country>[A-Z]{2}))\))?$')
def requires_auth(func):
"""Decorator for :class:`TVDBClient` methods that require authentication"""
@wraps(func)
def wrapper(self, *args, **kwargs):
if self.token is None or self.token_expired:
self.login()
elif self.token_needs_refresh:
self.refresh_token()
return func(self, *args, **kwargs)
return wrapper
class TVDBClient(object):
"""TVDB REST API Client
:param str apikey: API key to use.
:param str username: username to use.
:param str password: password to use.
:param str language: language of the responses.
:param session: session object to use.
:type session: :class:`requests.sessions.Session` or compatible.
:param dict headers: additional headers.
:param int timeout: timeout for the requests.
"""
#: Base URL of the API
base_url = 'https://api.thetvdb.com'
#: Token lifespan
token_lifespan = timedelta(hours=1)
#: Minimum token age before a :meth:`refresh_token` is triggered
refresh_token_every = timedelta(minutes=30)
def __init__(self, apikey=None, username=None, password=None, language='en', session=None, headers=None,
timeout=10):
#: API key
self.apikey = apikey
#: Username
self.username = username
#: Password
self.password = password
#: Last token acquisition date
self.token_date = datetime.utcnow() - self.token_lifespan
#: Session for the requests
self.session = session or requests.Session()
self.session.timeout = timeout
self.session.headers.update(headers or {})
self.session.headers['Content-Type'] = 'application/json'
self.session.headers['Accept-Language'] = language
@property
def language(self):
return self.session.headers['Accept-Language']
@language.setter
def language(self, value):
self.session.headers['Accept-Language'] = value
@property
def token(self):
if 'Authorization' not in self.session.headers:
return None
return self.session.headers['Authorization'][7:]
@property
def token_expired(self):
return datetime.utcnow() - self.token_date > self.token_lifespan
@property
def token_needs_refresh(self):
return datetime.utcnow() - self.token_date > self.refresh_token_every
def login(self):
"""Login"""
# perform the request
data = {'apikey': self.apikey, 'username': self.username, 'password': self.password}
r = self.session.post(self.base_url + '/login', json=data)
r.raise_for_status()
# set the Authorization header
self.session.headers['Authorization'] = 'Bearer ' + r.json()['token']
# update token_date
self.token_date = datetime.utcnow()
def refresh_token(self):
"""Refresh token"""
# perform the request
r = self.session.get(self.base_url + '/refresh_token')
r.raise_for_status()
# set the Authorization header
self.session.headers['Authorization'] = 'Bearer ' + r.json()['token']
# update token_date
self.token_date = datetime.utcnow()
@requires_auth
def search_series(self, name=None, imdb_id=None, zap2it_id=None):
"""Search series"""
# perform the request
params = {'name': name, 'imdbId': imdb_id, 'zap2itId': zap2it_id}
r = self.session.get(self.base_url + '/search/series', params=params)
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()['data']
@requires_auth
def get_series(self, id):
"""Get series"""
# perform the request
r = self.session.get(self.base_url + '/series/{}'.format(id))
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()['data']
@requires_auth
def get_series_actors(self, id):
"""Get series actors"""
# perform the request
r = self.session.get(self.base_url + '/series/{}/actors'.format(id))
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()['data']
@requires_auth
def get_series_episodes(self, id, page=1):
"""Get series episodes"""
# perform the request
params = {'page': page}
r = self.session.get(self.base_url + '/series/{}/episodes'.format(id), params=params)
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()
@requires_auth
def query_series_episodes(self, id, absolute_number=None, aired_season=None, aired_episode=None, dvd_season=None,
dvd_episode=None, imdb_id=None, page=1):
"""Query series episodes"""
# perform the request
params = {'absoluteNumber': absolute_number, 'airedSeason': aired_season, 'airedEpisode': aired_episode,
'dvdSeason': dvd_season, 'dvdEpisode': dvd_episode, 'imdbId': imdb_id, 'page': page}
r = self.session.get(self.base_url + '/series/{}/episodes/query'.format(id), params=params)
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()
@requires_auth
def get_episode(self, id):
"""Get episode"""
# perform the request
r = self.session.get(self.base_url + '/episodes/{}'.format(id))
if r.status_code == 404:
return None
r.raise_for_status()
return r.json()['data']
#: Configured instance of :class:`TVDBClient`
tvdb_client = TVDBClient('5EC930FB90DA1ADA', headers={'User-Agent': 'Subliminal/%s' % __short_version__})
@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def search_series(name):
"""Search series.
:param str name: name of the series.
:return: the search results.
:rtype: list
"""
return tvdb_client.search_series(name)
@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def get_series(id):
"""Get series.
:param int id: id of the series.
:return: the series data.
:rtype: dict
"""
return tvdb_client.get_series(id)
@region.cache_on_arguments(expiration_time=REFINER_EXPIRATION_TIME)
def get_series_episode(series_id, season, episode):
"""Get an episode of a series.
:param int series_id: id of the series.
:param int season: season number of the episode.
:param int episode: episode number of the episode.
:return: the episode data.
:rtype: dict
"""
result = tvdb_client.query_series_episodes(series_id, aired_season=season, aired_episode=episode)
if result:
return tvdb_client.get_episode(result['data'][0]['id'])
def refine(video, **kwargs):
"""Refine a video by searching `TheTVDB <http://thetvdb.com/>`_.
.. note::
This refiner only works for instances of :class:`~subliminal.video.Episode`.
Several attributes can be found:
* :attr:`~subliminal.video.Episode.series`
* :attr:`~subliminal.video.Episode.year`
* :attr:`~subliminal.video.Episode.series_imdb_id`
* :attr:`~subliminal.video.Episode.series_tvdb_id`
* :attr:`~subliminal.video.Episode.title`
* :attr:`~subliminal.video.Video.imdb_id`
* :attr:`~subliminal.video.Episode.tvdb_id`
"""
# only deal with Episode videos
if not isinstance(video, Episode):
logger.error('Cannot refine non-episode videos')
return
# exit if the information is complete
if video.series_tvdb_id and video.tvdb_id:
logger.debug('No need to search')
return
# search the series
logger.info('Searching series %r', video.series)
results = search_series(video.series.lower())
if not results:
logger.warning('No results for series')
return
logger.debug('Found %d results', len(results))
# search for exact matches
matching_results = []
for result in results:
matching_result = {}
# use seriesName and aliases
series_names = [result['seriesName']]
series_names.extend(result['aliases'])
# parse the original series as series + year or country
original_match = series_re.match(result['seriesName']).groupdict()
# parse series year
series_year = None
if result['firstAired']:
series_year = datetime.strptime(result['firstAired'], '%Y-%m-%d').year
# discard mismatches on year
if video.year and series_year and video.year != series_year:
logger.debug('Discarding series %r mismatch on year %d', result['seriesName'], series_year)
continue
# iterate over series names
for series_name in series_names:
# parse as series and year
series, year, country = series_re.match(series_name).groups()
if year:
year = int(year)
# discard mismatches on year
if year and (video.original_series or video.year != year):
logger.debug('Discarding series name %r mismatch on year %d', series, year)
continue
# match on sanitized series name
if sanitize(series) == sanitize(video.series):
logger.debug('Found exact match on series %r', series_name)
matching_result['match'] = {'series': original_match['series'], 'year': series_year,
'original_series': original_match['year'] is None}
break
# add the result on match
if matching_result:
matching_result['data'] = result
matching_results.append(matching_result)
# exit if we don't have exactly 1 matching result
if not matching_results:
logger.error('No matching series found')
return
if len(matching_results) > 1:
logger.error('Multiple matches found')
return
# get the series
matching_result = matching_results[0]
series = get_series(matching_result['data']['id'])
# add series information
logger.debug('Found series %r', series)
video.series = matching_result['match']['series']
video.alternative_series.extend(series['aliases'])
video.year = matching_result['match']['year']
video.original_series = matching_result['match']['original_series']
video.series_tvdb_id = series['id']
video.series_imdb_id = series['imdbId'] or None
# get the episode
logger.info('Getting series episode %dx%d', video.season, video.episode)
episode = get_series_episode(video.series_tvdb_id, video.season, video.episode)
if not episode:
logger.warning('No results for episode')
return
# add episode information
logger.debug('Found episode %r', episode)
video.tvdb_id = episode['id']
video.title = episode['episodeName'] or None
video.imdb_id = episode['imdbId'] or None
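# A hedged usage sketch for the refiner above (an editor's illustration; the
# release name is hypothetical and live calls to TheTVDB are implied).
if __name__ == '__main__':
    # Episode is already imported at the top of this module
    video = Episode.fromname('Game.of.Thrones.S03E10.720p.HDTV.x264-EVOLVE.mkv')
    refine(video)  # fills series/episode ids, year and episode title on a match
    print(video.series_tvdb_id, video.tvdb_id, video.title)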

libs/subliminal2.7/score.py Normal file
View File

@ -0,0 +1,234 @@
# -*- coding: utf-8 -*-
"""
This module provides the default implementation of the `compute_score` parameter in
:meth:`~subliminal.core.ProviderPool.download_best_subtitles` and :func:`~subliminal.core.download_best_subtitles`.
.. note::
To avoid an unnecessary dependency on `sympy <http://www.sympy.org/>`_ and to keep subliminal's import time low, the
resulting scores are hardcoded here and manually updated when the set of equations changes.
Available matches:
* hash
* title
* year
* series
* season
* episode
* release_group
* format
* audio_codec
* resolution
* hearing_impaired
* video_codec
* series_imdb_id
* imdb_id
* tvdb_id
"""
from __future__ import division, print_function
import logging
from .video import Episode, Movie
logger = logging.getLogger(__name__)
#: Scores for episodes
episode_scores = {'hash': 359, 'series': 180, 'year': 90, 'season': 30, 'episode': 30, 'release_group': 15,
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
#: Scores for movies
movie_scores = {'hash': 119, 'title': 60, 'year': 30, 'release_group': 15,
'format': 7, 'audio_codec': 3, 'resolution': 2, 'video_codec': 2, 'hearing_impaired': 1}
#: Equivalent release groups
equivalent_release_groups = ({'LOL', 'DIMENSION'}, {'ASAP', 'IMMERSE', 'FLEET'}, {'AVS', 'SVA'})
def get_equivalent_release_groups(release_group):
"""Get all the equivalents of the given release group.
:param str release_group: the release group to get the equivalents of.
:return: the equivalent release groups.
:rtype: set
"""
for equivalent_release_group in equivalent_release_groups:
if release_group in equivalent_release_group:
return equivalent_release_group
return {release_group}
def get_scores(video):
"""Get the scores dict for the given `video`.
This will return either :data:`episode_scores` or :data:`movie_scores` based on the type of the `video`.
:param video: the video to compute the score against.
:type video: :class:`~subliminal.video.Video`
:return: the scores dict.
:rtype: dict
"""
if isinstance(video, Episode):
return episode_scores
elif isinstance(video, Movie):
return movie_scores
raise ValueError('video must be an instance of Episode or Movie')
def compute_score(subtitle, video, hearing_impaired=None):
"""Compute the score of the `subtitle` against the `video` with `hearing_impaired` preference.
:func:`compute_score` uses the :meth:`Subtitle.get_matches <subliminal.subtitle.Subtitle.get_matches>` method and
applies the scores (either from :data:`episode_scores` or :data:`movie_scores`) after some processing.
:param subtitle: the subtitle to compute the score of.
:type subtitle: :class:`~subliminal.subtitle.Subtitle`
:param video: the video to compute the score against.
:type video: :class:`~subliminal.video.Video`
:param bool hearing_impaired: hearing impaired preference.
:return: score of the subtitle.
:rtype: int
"""
logger.info('Computing score of %r for video %r with %r', subtitle, video, dict(hearing_impaired=hearing_impaired))
# get the scores dict
scores = get_scores(video)
logger.debug('Using scores %r', scores)
# get the matches
matches = subtitle.get_matches(video)
logger.debug('Found matches %r', matches)
# on hash match, discard everything else
if 'hash' in matches:
logger.debug('Keeping only hash match')
matches &= {'hash'}
# handle equivalent matches
if isinstance(video, Episode):
if 'title' in matches:
logger.debug('Adding title match equivalent')
matches.add('episode')
if 'series_imdb_id' in matches:
logger.debug('Adding series_imdb_id match equivalent')
matches |= {'series', 'year'}
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
matches |= {'series', 'year', 'season', 'episode'}
if 'tvdb_id' in matches:
logger.debug('Adding tvdb_id match equivalents')
matches |= {'series', 'year', 'season', 'episode'}
if 'series_tvdb_id' in matches:
logger.debug('Adding series_tvdb_id match equivalents')
matches |= {'series', 'year'}
elif isinstance(video, Movie):
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
matches |= {'title', 'year'}
# handle hearing impaired
if hearing_impaired is not None and subtitle.hearing_impaired == hearing_impaired:
logger.debug('Matched hearing_impaired')
matches.add('hearing_impaired')
# compute the score
score = sum((scores.get(match, 0) for match in matches))
logger.info('Computed score %r with final matches %r', score, matches)
# ensure score is within valid bounds
assert 0 <= score <= scores['hash'] + scores['hearing_impaired']
return score
def solve_episode_equations():
from sympy import Eq, solve, symbols
hash, series, year, season, episode, release_group = symbols('hash series year season episode release_group')
format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec')
hearing_impaired = symbols('hearing_impaired')
equations = [
# hash is best
Eq(hash, series + year + season + episode + release_group + format + audio_codec + resolution + video_codec),
# series counts for the most part in the total score
Eq(series, year + season + episode + release_group + format + audio_codec + resolution + video_codec + 1),
# year is the second most important part
Eq(year, season + episode + release_group + format + audio_codec + resolution + video_codec + 1),
# season is important too
Eq(season, release_group + format + audio_codec + resolution + video_codec + 1),
# episode is equally important to season
Eq(episode, season),
# release group is the next most wanted match
Eq(release_group, format + audio_codec + resolution + video_codec + 1),
# format counts as much as audio_codec, resolution and video_codec
Eq(format, audio_codec + resolution + video_codec),
# audio_codec is more valuable than video_codec
Eq(audio_codec, video_codec + 1),
# resolution counts as much as video_codec
Eq(resolution, video_codec),
# video_codec is the least valuable match but still outweighs the sum of all score-increasing matches
Eq(video_codec, hearing_impaired + 1),
# hearing impaired is only used to increase the score, so set it to 1
Eq(hearing_impaired, 1),
]
return solve(equations, [hash, series, year, season, episode, release_group, format, audio_codec, resolution,
hearing_impaired, video_codec])
def solve_movie_equations():
from sympy import Eq, solve, symbols
hash, title, year, release_group = symbols('hash title year release_group')
format, audio_codec, resolution, video_codec = symbols('format audio_codec resolution video_codec')
hearing_impaired = symbols('hearing_impaired')
equations = [
# hash is best
Eq(hash, title + year + release_group + format + audio_codec + resolution + video_codec),
# title counts for the most part in the total score
Eq(title, year + release_group + format + audio_codec + resolution + video_codec + 1),
# year is the second most important part
Eq(year, release_group + format + audio_codec + resolution + video_codec + 1),
# release group is the next most wanted match
Eq(release_group, format + audio_codec + resolution + video_codec + 1),
# format counts as much as audio_codec, resolution and video_codec
Eq(format, audio_codec + resolution + video_codec),
# audio_codec is more valuable than video_codec
Eq(audio_codec, video_codec + 1),
# resolution counts as much as video_codec
Eq(resolution, video_codec),
# video_codec is the least valuable match but still outweighs the sum of all score-increasing matches
Eq(video_codec, hearing_impaired + 1),
# hearing impaired is only used to increase the score, so set it to 1
Eq(hearing_impaired, 1),
]
return solve(equations, [hash, title, year, release_group, format, audio_codec, resolution, hearing_impaired,
video_codec])
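# A small worked example of the hardcoded tables above (an editor's sketch;
# the match set is made up for illustration).
if __name__ == '__main__':
    matches = {'series', 'season', 'episode', 'release_group'}
    # 180 + 30 + 30 + 15 == 255, still below the episode hash score of 359
    print(sum(episode_scores.get(m, 0) for m in matches))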

View File

@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-
import codecs
import logging
import os
import chardet
import pysrt
from .score import get_equivalent_release_groups
from .video import Episode, Movie
from .utils import sanitize, sanitize_release_group
logger = logging.getLogger(__name__)
#: Subtitle extensions
SUBTITLE_EXTENSIONS = ('.srt', '.sub', '.smi', '.txt', '.ssa', '.ass', '.mpl')
class Subtitle(object):
"""Base class for subtitle.
:param language: language of the subtitle.
:type language: :class:`~babelfish.language.Language`
:param bool hearing_impaired: whether or not the subtitle is hearing impaired.
:param page_link: URL of the web page from which the subtitle can be downloaded.
:type page_link: str
:param encoding: Text encoding of the subtitle.
:type encoding: str
"""
#: Name of the provider that returns that class of subtitle
provider_name = ''
def __init__(self, language, hearing_impaired=False, page_link=None, encoding=None):
#: Language of the subtitle
self.language = language
#: Whether or not the subtitle is hearing impaired
self.hearing_impaired = hearing_impaired
#: URL of the web page from which the subtitle can be downloaded
self.page_link = page_link
#: Content as bytes
self.content = None
#: Encoding to decode with when accessing :attr:`text`
self.encoding = None
# validate the encoding
if encoding:
try:
self.encoding = codecs.lookup(encoding).name
except (TypeError, LookupError):
logger.debug('Unsupported encoding %s', encoding)
@property
def id(self):
"""Unique identifier of the subtitle"""
raise NotImplementedError
@property
def text(self):
"""Content as string
If :attr:`encoding` is None, the encoding is guessed with :meth:`guess_encoding`
"""
if not self.content:
return
if self.encoding:
return self.content.decode(self.encoding, errors='replace')
return self.content.decode(self.guess_encoding(), errors='replace')
def is_valid(self):
"""Check if a :attr:`text` is a valid SubRip format.
:return: whether or not the subtitle is valid.
:rtype: bool
"""
if not self.text:
return False
try:
pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
except pysrt.Error as e:
if e.args[0] < 80:
return False
return True
def guess_encoding(self):
"""Guess encoding using the language, falling back on chardet.
:return: the guessed encoding.
:rtype: str
"""
logger.info('Guessing encoding for language %s', self.language)
# always try utf-8 first
encodings = ['utf-8']
# add language-specific encodings
if self.language.alpha3 == 'zho':
encodings.extend(['gb18030', 'big5'])
elif self.language.alpha3 == 'jpn':
encodings.append('shift-jis')
elif self.language.alpha3 == 'ara':
encodings.append('windows-1256')
elif self.language.alpha3 == 'heb':
encodings.append('windows-1255')
elif self.language.alpha3 == 'tur':
encodings.extend(['iso-8859-9', 'windows-1254'])
elif self.language.alpha3 == 'pol':
# Eastern European Group 1
encodings.extend(['windows-1250'])
elif self.language.alpha3 == 'bul':
# Eastern European Group 2
encodings.extend(['windows-1251'])
else:
# Western European (windows-1252)
encodings.append('latin-1')
# try to decode
logger.debug('Trying encodings %r', encodings)
for encoding in encodings:
try:
self.content.decode(encoding)
except UnicodeDecodeError:
pass
else:
logger.info('Guessed encoding %s', encoding)
return encoding
logger.warning('Could not guess encoding from language')
# fallback on chardet
encoding = chardet.detect(self.content)['encoding']
logger.info('Chardet found encoding %s', encoding)
return encoding
def get_matches(self, video):
"""Get the matches against the `video`.
:param video: the video to get the matches with.
:type video: :class:`~subliminal.video.Video`
:return: matches of the subtitle.
:rtype: set
"""
raise NotImplementedError
def __hash__(self):
return hash(self.provider_name + '-' + self.id)
def __repr__(self):
return '<%s %r [%s]>' % (self.__class__.__name__, self.id, self.language)
def get_subtitle_path(video_path, language=None, extension='.srt'):
"""Get the subtitle path using the `video_path` and `language`.
:param str video_path: path to the video.
:param language: language of the subtitle to put in the path.
:type language: :class:`~babelfish.language.Language`
:param str extension: extension of the subtitle.
:return: path of the subtitle.
:rtype: str
"""
subtitle_root = os.path.splitext(video_path)[0]
if language:
subtitle_root += '.' + str(language)
return subtitle_root + extension
def guess_matches(video, guess, partial=False):
"""Get matches between a `video` and a `guess`.
If a guess is `partial`, the absence of information won't be counted as a match.
:param video: the video.
:type video: :class:`~subliminal.video.Video`
:param guess: the guess.
:type guess: dict
:param bool partial: whether or not the guess is partial.
:return: matches between the `video` and the `guess`.
:rtype: set
"""
matches = set()
if isinstance(video, Episode):
# series
if video.series and 'title' in guess and sanitize(guess['title']) == sanitize(video.series):
matches.add('series')
# title
if video.title and 'episode_title' in guess and sanitize(guess['episode_title']) == sanitize(video.title):
matches.add('title')
# season
if video.season and 'season' in guess and guess['season'] == video.season:
matches.add('season')
# episode
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
# Most providers only support single-ep, so make sure it contains only 1 episode
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
if video.episode and 'episode' in guess:
episode_guess = guess['episode']
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
if episode == video.episode:
matches.add('episode')
# year
if video.year and 'year' in guess and guess['year'] == video.year:
matches.add('year')
# count "no year" as an information
if not partial and video.original_series and 'year' not in guess:
matches.add('year')
elif isinstance(video, Movie):
# year
if video.year and 'year' in guess and guess['year'] == video.year:
matches.add('year')
# title
if video.title and 'title' in guess and sanitize(guess['title']) == sanitize(video.title):
matches.add('title')
# release_group
if (video.release_group and 'release_group' in guess and
sanitize_release_group(guess['release_group']) in
get_equivalent_release_groups(sanitize_release_group(video.release_group))):
matches.add('release_group')
# resolution
if video.resolution and 'screen_size' in guess and guess['screen_size'] == video.resolution:
matches.add('resolution')
# format
if video.format and 'format' in guess and guess['format'].lower() == video.format.lower():
matches.add('format')
# video_codec
if video.video_codec and 'video_codec' in guess and guess['video_codec'] == video.video_codec:
matches.add('video_codec')
# audio_codec
if video.audio_codec and 'audio_codec' in guess and guess['audio_codec'] == video.audio_codec:
matches.add('audio_codec')
return matches
def fix_line_ending(content):
"""Fix line ending of `content` by changing it to \n.
:param bytes content: content of the subtitle.
:return: the content with fixed line endings.
:rtype: bytes
"""
return content.replace(b'\r\n', b'\n')
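# Small self-contained examples for the helpers above (an editor's sketch;
# the video path is hypothetical).
if __name__ == '__main__':
    from babelfish import Language

    # '/data/movie.mkv' becomes '/data/movie.fr.srt'
    print(get_subtitle_path('/data/movie.mkv', Language('fra')))
    # CRLF line endings are normalized to LF
    assert fix_line_ending(b'1\r\nHello\r\n') == b'1\nHello\n'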

libs/subliminal2.7/utils.py Normal file
View File

@ -0,0 +1,152 @@
# -*- coding: utf-8 -*-
from datetime import datetime
import hashlib
import os
import re
import struct
def hash_opensubtitles(video_path):
"""Compute a hash using OpenSubtitles' algorithm.
:param str video_path: path of the video.
:return: the hash.
:rtype: str
"""
bytesize = struct.calcsize(b'<q')
with open(video_path, 'rb') as f:
filesize = os.path.getsize(video_path)
filehash = filesize
if filesize < 65536 * 2:
return
for _ in range(65536 // bytesize):
filebuffer = f.read(bytesize)
(l_value,) = struct.unpack(b'<q', filebuffer)
filehash += l_value
filehash &= 0xFFFFFFFFFFFFFFFF # to remain as 64bit number
f.seek(max(0, filesize - 65536), 0)
for _ in range(65536 // bytesize):
filebuffer = f.read(bytesize)
(l_value,) = struct.unpack(b'<q', filebuffer)
filehash += l_value
filehash &= 0xFFFFFFFFFFFFFFFF
returnedhash = '%016x' % filehash
return returnedhash
def hash_thesubdb(video_path):
"""Compute a hash using TheSubDB's algorithm.
:param str video_path: path of the video.
:return: the hash.
:rtype: str
"""
readsize = 64 * 1024
if os.path.getsize(video_path) < readsize:
return
with open(video_path, 'rb') as f:
data = f.read(readsize)
f.seek(-readsize, os.SEEK_END)
data += f.read(readsize)
return hashlib.md5(data).hexdigest()
def hash_napiprojekt(video_path):
"""Compute a hash using NapiProjekt's algorithm.
:param str video_path: path of the video.
:return: the hash.
:rtype: str
"""
readsize = 1024 * 1024 * 10
with open(video_path, 'rb') as f:
data = f.read(readsize)
return hashlib.md5(data).hexdigest()
def hash_shooter(video_path):
"""Compute a hash using Shooter's algorithm
:param string video_path: path of the video
:return: the hash
:rtype: string
"""
filesize = os.path.getsize(video_path)
readsize = 4096
if os.path.getsize(video_path) < readsize * 2:
return None
offsets = (readsize, filesize // 3 * 2, filesize // 3, filesize - readsize * 2)
filehash = []
with open(video_path, 'rb') as f:
for offset in offsets:
f.seek(offset)
filehash.append(hashlib.md5(f.read(readsize)).hexdigest())
return ';'.join(filehash)
def sanitize(string, ignore_characters=None):
"""Sanitize a string to strip special characters.
:param str string: the string to sanitize.
:param set ignore_characters: characters to ignore.
:return: the sanitized string.
:rtype: str
"""
# only deal with strings
if string is None:
return
ignore_characters = ignore_characters or set()
# replace some characters with one space
characters = {'-', ':', '(', ')', '.'} - ignore_characters
if characters:
string = re.sub(r'[%s]' % re.escape(''.join(characters)), ' ', string)
# remove some characters
characters = {'\''} - ignore_characters
if characters:
string = re.sub(r'[%s]' % re.escape(''.join(characters)), '', string)
# replace multiple spaces with one
string = re.sub(r'\s+', ' ', string)
# strip and lower case
return string.strip().lower()
def sanitize_release_group(string):
"""Sanitize a `release_group` string to remove content in square brackets.
:param str string: the release group to sanitize.
:return: the sanitized release group.
:rtype: str
"""
# only deal with strings
if string is None:
return
# remove content in square brackets
string = re.sub(r'\[\w+\]', '', string)
# strip and upper case
return string.strip().upper()
def timestamp(date):
"""Get the timestamp of the `date`, python2/3 compatible
:param datetime.datetime date: the utc date.
:return: the timestamp of the date.
:rtype: float
"""
return (date - datetime(1970, 1, 1)).total_seconds()
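# Small usage examples for the helpers above (an editor's sketch;
# 'video.mkv' is a hypothetical file that must exist and be large enough for
# each hash algorithm).
if __name__ == '__main__':
    print(sanitize('Dark Matter (2015)'))            # 'dark matter 2015'
    print(sanitize_release_group('KILLERS[rartv]'))  # 'KILLERS'
    print(hash_opensubtitles('video.mkv'))
    print(hash_thesubdb('video.mkv'))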

libs/subliminal2.7/video.py Normal file
View File

@ -0,0 +1,239 @@
# -*- coding: utf-8 -*-
from __future__ import division
from datetime import datetime, timedelta
import logging
import os
from guessit import guessit
logger = logging.getLogger(__name__)
#: Video extensions
VIDEO_EXTENSIONS = ('.3g2', '.3gp', '.3gp2', '.3gpp', '.60d', '.ajp', '.asf', '.asx', '.avchd', '.avi', '.bik',
'.bix', '.box', '.cam', '.dat', '.divx', '.dmf', '.dv', '.dvr-ms', '.evo', '.flc', '.fli',
'.flic', '.flv', '.flx', '.gvi', '.gvp', '.h264', '.m1v', '.m2p', '.m2ts', '.m2v', '.m4e',
'.m4v', '.mjp', '.mjpeg', '.mjpg', '.mkv', '.moov', '.mov', '.movhd', '.movie', '.movx', '.mp4',
'.mpe', '.mpeg', '.mpg', '.mpv', '.mpv2', '.mxf', '.nsv', '.nut', '.ogg', '.ogm', '.ogv', '.omf',
'.ps', '.qt', '.ram', '.rm', '.rmvb', '.swf', '.ts', '.vfw', '.vid', '.video', '.viv', '.vivo',
'.vob', '.vro', '.webm', '.wm', '.wmv', '.wmx', '.wrap', '.wvx', '.wx', '.x264', '.xvid')
class Video(object):
"""Base class for videos.
Represent a video, existing or not.
:param str name: name or path of the video.
:param str format: format of the video (HDTV, WEB-DL, BluRay, ...).
:param str release_group: release group of the video.
:param str resolution: resolution of the video stream (480p, 720p, 1080p or 1080i).
:param str video_codec: codec of the video stream.
:param str audio_codec: codec of the main audio stream.
:param str imdb_id: IMDb id of the video.
:param dict hashes: hashes of the video file by provider names.
:param int size: size of the video file in bytes.
:param set subtitle_languages: existing subtitle languages.
"""
def __init__(self, name, format=None, release_group=None, resolution=None, video_codec=None, audio_codec=None,
imdb_id=None, hashes=None, size=None, subtitle_languages=None):
#: Name or path of the video
self.name = name
#: Format of the video (HDTV, WEB-DL, BluRay, ...)
self.format = format
#: Release group of the video
self.release_group = release_group
#: Resolution of the video stream (480p, 720p, 1080p or 1080i)
self.resolution = resolution
#: Codec of the video stream
self.video_codec = video_codec
#: Codec of the main audio stream
self.audio_codec = audio_codec
#: IMDb id of the video
self.imdb_id = imdb_id
#: Hashes of the video file by provider names
self.hashes = hashes or {}
#: Size of the video file in bytes
self.size = size
#: Existing subtitle languages
self.subtitle_languages = subtitle_languages or set()
@property
def exists(self):
"""Test whether the video exists"""
return os.path.exists(self.name)
@property
def age(self):
"""Age of the video"""
if self.exists:
return datetime.utcnow() - datetime.utcfromtimestamp(os.path.getmtime(self.name))
return timedelta()
@classmethod
def fromguess(cls, name, guess):
"""Create an :class:`Episode` or a :class:`Movie` with the given `name` based on the `guess`.
:param str name: name of the video.
:param dict guess: guessed data.
:raise: :class:`ValueError` if the `type` of the `guess` is invalid
"""
if guess['type'] == 'episode':
return Episode.fromguess(name, guess)
if guess['type'] == 'movie':
return Movie.fromguess(name, guess)
raise ValueError('The guess must be an episode or a movie guess')
@classmethod
def fromname(cls, name):
"""Shortcut for :meth:`fromguess` with a `guess` guessed from the `name`.
:param str name: name of the video.
"""
return cls.fromguess(name, guessit(name))
def __repr__(self):
return '<%s [%r]>' % (self.__class__.__name__, self.name)
def __hash__(self):
return hash(self.name)
class Episode(Video):
"""Episode :class:`Video`.
:param str series: series of the episode.
:param int season: season number of the episode.
:param int episode: episode number of the episode.
:param str title: title of the episode.
:param int year: year of the series.
:param bool original_series: whether the series is the first with this name.
:param int tvdb_id: TVDB id of the episode.
:param list alternative_series: alternative names of the series
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
"""
def __init__(self, name, series, season, episode, title=None, year=None, original_series=True, tvdb_id=None,
series_tvdb_id=None, series_imdb_id=None, alternative_series=None, **kwargs):
super(Episode, self).__init__(name, **kwargs)
#: Series of the episode
self.series = series
#: Season number of the episode
self.season = season
#: Episode number of the episode
self.episode = episode
#: Title of the episode
self.title = title
#: Year of series
self.year = year
#: The series is the first with this name
self.original_series = original_series
#: TVDB id of the episode
self.tvdb_id = tvdb_id
#: TVDB id of the series
self.series_tvdb_id = series_tvdb_id
#: IMDb id of the series
self.series_imdb_id = series_imdb_id
#: Alternative names of the series
self.alternative_series = alternative_series or []
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'episode':
raise ValueError('The guess must be an episode guess')
if 'title' not in guess or 'episode' not in guess:
raise ValueError('Insufficient data to process the guess')
# Currently we only have single-ep support (guessit returns a multi-ep as a list with int values)
# Most providers only support single-ep, so make sure it contains only 1 episode
# In case of multi-ep, take the lowest episode (subtitles will normally be available on lowest episode number)
episode_guess = guess.get('episode')
episode = min(episode_guess) if episode_guess and isinstance(episode_guess, list) else episode_guess
return cls(name, guess['title'], guess.get('season', 1), episode, title=guess.get('episode_title'),
year=guess.get('year'), format=guess.get('format'), original_series='year' not in guess,
release_group=guess.get('release_group'), resolution=guess.get('screen_size'),
video_codec=guess.get('video_codec'), audio_codec=guess.get('audio_codec'))
@classmethod
def fromname(cls, name):
return cls.fromguess(name, guessit(name, {'type': 'episode'}))
def __repr__(self):
if self.year is None:
return '<%s [%r, %dx%d]>' % (self.__class__.__name__, self.series, self.season, self.episode)
return '<%s [%r, %d, %dx%d]>' % (self.__class__.__name__, self.series, self.year, self.season, self.episode)
class Movie(Video):
"""Movie :class:`Video`.
:param str title: title of the movie.
:param int year: year of the movie.
:param list alternative_titles: alternative titles of the movie
:param \*\*kwargs: additional parameters for the :class:`Video` constructor.
"""
def __init__(self, name, title, year=None, alternative_titles=None, **kwargs):
super(Movie, self).__init__(name, **kwargs)
#: Title of the movie
self.title = title
#: Year of the movie
self.year = year
#: Alternative titles of the movie
self.alternative_titles = alternative_titles or []
@classmethod
def fromguess(cls, name, guess):
if guess['type'] != 'movie':
raise ValueError('The guess must be a movie guess')
if 'title' not in guess:
raise ValueError('Insufficient data to process the guess')
alternative_titles = []
if 'alternative_title' in guess:
alternative_titles.append(u"%s %s" % (guess['title'], guess['alternative_title']))
return cls(name, guess['title'], format=guess.get('format'), release_group=guess.get('release_group'),
resolution=guess.get('screen_size'), video_codec=guess.get('video_codec'),
audio_codec=guess.get('audio_codec'), year=guess.get('year'), alternative_titles=alternative_titles)
@classmethod
def fromname(cls, name):
return cls.fromguess(name, guessit(name, {'type': 'movie'}))
def __repr__(self):
if self.year is None:
return '<%s [%r]>' % (self.__class__.__name__, self.title)
return '<%s [%r, %d]>' % (self.__class__.__name__, self.title, self.year)
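# A hedged sketch of building videos from release names (an editor's
# illustration; both names are hypothetical and the expected reprs assume
# guessit's usual parsing).
if __name__ == '__main__':
    print(repr(Video.fromname('The.Expanse.S01E01.720p.HDTV.x264-KILLERS.mkv')))
    # expected: <Episode ['The Expanse', 1x1]>
    print(repr(Video.fromname('Mad.Max.Fury.Road.2015.1080p.BluRay.x264-SPARKS.mkv')))
    # expected: <Movie ['Mad Max Fury Road', 2015]>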

libs/yaml/__init__.py Normal file
View File

@ -0,0 +1,402 @@
from .error import *
from .tokens import *
from .events import *
from .nodes import *
from .loader import *
from .dumper import *
__version__ = '5.1.2'
try:
from .cyaml import *
__with_libyaml__ = True
except ImportError:
__with_libyaml__ = False
import io
#------------------------------------------------------------------------------
# Warnings control
#------------------------------------------------------------------------------
# 'Global' warnings state:
_warnings_enabled = {
'YAMLLoadWarning': True,
}
# Get or set global warnings' state
def warnings(settings=None):
if settings is None:
return _warnings_enabled
if type(settings) is dict:
for key in settings:
if key in _warnings_enabled:
_warnings_enabled[key] = settings[key]
# Warn when load() is called without Loader=...
class YAMLLoadWarning(RuntimeWarning):
pass
def load_warning(method):
if _warnings_enabled['YAMLLoadWarning'] is False:
return
import warnings
message = (
"calling yaml.%s() without Loader=... is deprecated, as the "
"default Loader is unsafe. Please read "
"https://msg.pyyaml.org/load for full details."
) % method
warnings.warn(message, YAMLLoadWarning, stacklevel=3)
#------------------------------------------------------------------------------
def scan(stream, Loader=Loader):
"""
Scan a YAML stream and produce scanning tokens.
"""
loader = Loader(stream)
try:
while loader.check_token():
yield loader.get_token()
finally:
loader.dispose()
def parse(stream, Loader=Loader):
"""
Parse a YAML stream and produce parsing events.
"""
loader = Loader(stream)
try:
while loader.check_event():
yield loader.get_event()
finally:
loader.dispose()
def compose(stream, Loader=Loader):
"""
Parse the first YAML document in a stream
and produce the corresponding representation tree.
"""
loader = Loader(stream)
try:
return loader.get_single_node()
finally:
loader.dispose()
def compose_all(stream, Loader=Loader):
"""
Parse all YAML documents in a stream
and produce corresponding representation trees.
"""
loader = Loader(stream)
try:
while loader.check_node():
yield loader.get_node()
finally:
loader.dispose()
def load(stream, Loader=None):
"""
Parse the first YAML document in a stream
and produce the corresponding Python object.
"""
if Loader is None:
load_warning('load')
Loader = FullLoader
loader = Loader(stream)
try:
return loader.get_single_data()
finally:
loader.dispose()
def load_all(stream, Loader=None):
"""
Parse all YAML documents in a stream
and produce corresponding Python objects.
"""
if Loader is None:
load_warning('load_all')
Loader = FullLoader
loader = Loader(stream)
try:
while loader.check_data():
yield loader.get_data()
finally:
loader.dispose()
def full_load(stream):
"""
Parse the first YAML document in a stream
and produce the corresponding Python object.
Resolve all tags except those known to be
unsafe on untrusted input.
"""
return load(stream, FullLoader)
def full_load_all(stream):
"""
Parse all YAML documents in a stream
and produce corresponding Python objects.
Resolve all tags except those known to be
unsafe on untrusted input.
"""
return load_all(stream, FullLoader)
def safe_load(stream):
"""
Parse the first YAML document in a stream
and produce the corresponding Python object.
Resolve only basic YAML tags. This is known
to be safe for untrusted input.
"""
return load(stream, SafeLoader)
def safe_load_all(stream):
"""
Parse all YAML documents in a stream
and produce corresponding Python objects.
Resolve only basic YAML tags. This is known
to be safe for untrusted input.
"""
return load_all(stream, SafeLoader)
def unsafe_load(stream):
"""
Parse the first YAML document in a stream
and produce the corresponding Python object.
Resolve all tags, even those known to be
unsafe on untrusted input.
"""
return load(stream, UnsafeLoader)
def unsafe_load_all(stream):
"""
Parse all YAML documents in a stream
and produce corresponding Python objects.
Resolve all tags, even those known to be
unsafe on untrusted input.
"""
return load_all(stream, UnsafeLoader)
def emit(events, stream=None, Dumper=Dumper,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None):
"""
Emit YAML parsing events into a stream.
If stream is None, return the produced string instead.
"""
getvalue = None
if stream is None:
stream = io.StringIO()
getvalue = stream.getvalue
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break)
try:
for event in events:
dumper.emit(event)
finally:
dumper.dispose()
if getvalue:
return getvalue()
def serialize_all(nodes, stream=None, Dumper=Dumper,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None):
"""
Serialize a sequence of representation trees into a YAML stream.
If stream is None, return the produced string instead.
"""
getvalue = None
if stream is None:
if encoding is None:
stream = io.StringIO()
else:
stream = io.BytesIO()
getvalue = stream.getvalue
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break,
encoding=encoding, version=version, tags=tags,
explicit_start=explicit_start, explicit_end=explicit_end)
try:
dumper.open()
for node in nodes:
dumper.serialize(node)
dumper.close()
finally:
dumper.dispose()
if getvalue:
return getvalue()
def serialize(node, stream=None, Dumper=Dumper, **kwds):
"""
Serialize a representation tree into a YAML stream.
If stream is None, return the produced string instead.
"""
return serialize_all([node], stream, Dumper=Dumper, **kwds)
def dump_all(documents, stream=None, Dumper=Dumper,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
"""
Serialize a sequence of Python objects into a YAML stream.
If stream is None, return the produced string instead.
"""
getvalue = None
if stream is None:
if encoding is None:
stream = io.StringIO()
else:
stream = io.BytesIO()
getvalue = stream.getvalue
dumper = Dumper(stream, default_style=default_style,
default_flow_style=default_flow_style,
canonical=canonical, indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break,
encoding=encoding, version=version, tags=tags,
explicit_start=explicit_start, explicit_end=explicit_end, sort_keys=sort_keys)
try:
dumper.open()
for data in documents:
dumper.represent(data)
dumper.close()
finally:
dumper.dispose()
if getvalue:
return getvalue()
def dump(data, stream=None, Dumper=Dumper, **kwds):
"""
Serialize a Python object into a YAML stream.
If stream is None, return the produced string instead.
"""
return dump_all([data], stream, Dumper=Dumper, **kwds)
def safe_dump_all(documents, stream=None, **kwds):
"""
Serialize a sequence of Python objects into a YAML stream.
Produce only basic YAML tags.
If stream is None, return the produced string instead.
"""
return dump_all(documents, stream, Dumper=SafeDumper, **kwds)
def safe_dump(data, stream=None, **kwds):
"""
Serialize a Python object into a YAML stream.
Produce only basic YAML tags.
If stream is None, return the produced string instead.
"""
return dump_all([data], stream, Dumper=SafeDumper, **kwds)
def add_implicit_resolver(tag, regexp, first=None,
Loader=Loader, Dumper=Dumper):
"""
Add an implicit scalar detector.
If an implicit scalar value matches the given regexp,
the corresponding tag is assigned to the scalar.
first is a sequence of possible initial characters or None.
"""
Loader.add_implicit_resolver(tag, regexp, first)
Dumper.add_implicit_resolver(tag, regexp, first)
def add_path_resolver(tag, path, kind=None, Loader=Loader, Dumper=Dumper):
"""
Add a path based resolver for the given tag.
A path is a list of keys that forms a path
to a node in the representation tree.
Keys can be string values, integers, or None.
"""
Loader.add_path_resolver(tag, path, kind)
Dumper.add_path_resolver(tag, path, kind)
def add_constructor(tag, constructor, Loader=Loader):
"""
Add a constructor for the given tag.
Constructor is a function that accepts a Loader instance
and a node object and produces the corresponding Python object.
"""
Loader.add_constructor(tag, constructor)
def add_multi_constructor(tag_prefix, multi_constructor, Loader=Loader):
"""
Add a multi-constructor for the given tag prefix.
Multi-constructor is called for a node if its tag starts with tag_prefix.
Multi-constructor accepts a Loader instance, a tag suffix,
and a node object and produces the corresponding Python object.
"""
Loader.add_multi_constructor(tag_prefix, multi_constructor)
def add_representer(data_type, representer, Dumper=Dumper):
"""
Add a representer for the given type.
Representer is a function accepting a Dumper instance
and an instance of the given data type
and producing the corresponding representation node.
"""
Dumper.add_representer(data_type, representer)
def add_multi_representer(data_type, multi_representer, Dumper=Dumper):
"""
    Add a multi-representer for the given type.
Multi-representer is a function accepting a Dumper instance
and an instance of the given data type or subtype
and producing the corresponding representation node.
"""
Dumper.add_multi_representer(data_type, multi_representer)
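# For illustration only (not part of the module): the registration hooks above
# can be combined to teach both Loader and Dumper a custom tag. The '!semver'
# tag and its pattern are hypothetical.
#
#   import re, yaml
#   yaml.add_implicit_resolver('!semver', re.compile(r'^\d+\.\d+\.\d+$'),
#                              first=list('0123456789'))
#   yaml.add_constructor('!semver',
#                        lambda loader, node: loader.construct_scalar(node))
#   yaml.load('version: 1.2.3', Loader=yaml.Loader)  # {'version': '1.2.3'}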
class YAMLObjectMetaclass(type):
"""
The metaclass for YAMLObject.
"""
def __init__(cls, name, bases, kwds):
super(YAMLObjectMetaclass, cls).__init__(name, bases, kwds)
if 'yaml_tag' in kwds and kwds['yaml_tag'] is not None:
cls.yaml_loader.add_constructor(cls.yaml_tag, cls.from_yaml)
cls.yaml_dumper.add_representer(cls, cls.to_yaml)
class YAMLObject(metaclass=YAMLObjectMetaclass):
"""
An object that can dump itself to a YAML stream
and load itself from a YAML stream.
"""
__slots__ = () # no direct instantiation, so allow immutable subclasses
yaml_loader = Loader
yaml_dumper = Dumper
yaml_tag = None
yaml_flow_style = None
@classmethod
def from_yaml(cls, loader, node):
"""
Convert a representation node to a Python object.
"""
return loader.construct_yaml_object(node, cls)
@classmethod
def to_yaml(cls, dumper, data):
"""
Convert a Python object to a representation node.
"""
return dumper.represent_yaml_object(cls.yaml_tag, data, cls,
flow_style=cls.yaml_flow_style)
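
A quick sketch of the helpers above in use (assuming the package imports as
`yaml`): safe_dump() restricts output to the basic tag set, while a YAMLObject
subclass registers its own tag through the metaclass:

    import yaml

    print(yaml.safe_dump({'name': 'bazarr', 'port': 6767}))

    class Color(yaml.YAMLObject):
        yaml_tag = '!color'
        def __init__(self, r, g, b):
            self.r, self.g, self.b = r, g, b

    print(yaml.dump(Color(0, 128, 255)))  # emits an !color mapping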

libs/yaml/composer.py (new file)
@@ -0,0 +1,139 @@
__all__ = ['Composer', 'ComposerError']
from .error import MarkedYAMLError
from .events import *
from .nodes import *
class ComposerError(MarkedYAMLError):
pass
class Composer:
def __init__(self):
self.anchors = {}
def check_node(self):
# Drop the STREAM-START event.
if self.check_event(StreamStartEvent):
self.get_event()
        # Are there more documents available?
return not self.check_event(StreamEndEvent)
def get_node(self):
# Get the root node of the next document.
if not self.check_event(StreamEndEvent):
return self.compose_document()
def get_single_node(self):
# Drop the STREAM-START event.
self.get_event()
# Compose a document if the stream is not empty.
document = None
if not self.check_event(StreamEndEvent):
document = self.compose_document()
# Ensure that the stream contains no more documents.
if not self.check_event(StreamEndEvent):
event = self.get_event()
raise ComposerError("expected a single document in the stream",
document.start_mark, "but found another document",
event.start_mark)
# Drop the STREAM-END event.
self.get_event()
return document
def compose_document(self):
# Drop the DOCUMENT-START event.
self.get_event()
# Compose the root node.
node = self.compose_node(None, None)
# Drop the DOCUMENT-END event.
self.get_event()
self.anchors = {}
return node
def compose_node(self, parent, index):
if self.check_event(AliasEvent):
event = self.get_event()
anchor = event.anchor
if anchor not in self.anchors:
raise ComposerError(None, None, "found undefined alias %r"
% anchor, event.start_mark)
return self.anchors[anchor]
event = self.peek_event()
anchor = event.anchor
if anchor is not None:
if anchor in self.anchors:
raise ComposerError("found duplicate anchor %r; first occurrence"
% anchor, self.anchors[anchor].start_mark,
"second occurrence", event.start_mark)
self.descend_resolver(parent, index)
if self.check_event(ScalarEvent):
node = self.compose_scalar_node(anchor)
elif self.check_event(SequenceStartEvent):
node = self.compose_sequence_node(anchor)
elif self.check_event(MappingStartEvent):
node = self.compose_mapping_node(anchor)
self.ascend_resolver()
return node
def compose_scalar_node(self, anchor):
event = self.get_event()
tag = event.tag
if tag is None or tag == '!':
tag = self.resolve(ScalarNode, event.value, event.implicit)
node = ScalarNode(tag, event.value,
event.start_mark, event.end_mark, style=event.style)
if anchor is not None:
self.anchors[anchor] = node
return node
def compose_sequence_node(self, anchor):
start_event = self.get_event()
tag = start_event.tag
if tag is None or tag == '!':
tag = self.resolve(SequenceNode, None, start_event.implicit)
node = SequenceNode(tag, [],
start_event.start_mark, None,
flow_style=start_event.flow_style)
if anchor is not None:
self.anchors[anchor] = node
index = 0
while not self.check_event(SequenceEndEvent):
node.value.append(self.compose_node(node, index))
index += 1
end_event = self.get_event()
node.end_mark = end_event.end_mark
return node
def compose_mapping_node(self, anchor):
start_event = self.get_event()
tag = start_event.tag
if tag is None or tag == '!':
tag = self.resolve(MappingNode, None, start_event.implicit)
node = MappingNode(tag, [],
start_event.start_mark, None,
flow_style=start_event.flow_style)
if anchor is not None:
self.anchors[anchor] = node
while not self.check_event(MappingEndEvent):
#key_event = self.peek_event()
item_key = self.compose_node(node, None)
#if item_key in node.value:
# raise ComposerError("while composing a mapping", start_event.start_mark,
# "found duplicate key", key_event.start_mark)
item_value = self.compose_node(node, item_key)
#node.value[item_key] = item_value
node.value.append((item_key, item_value))
end_event = self.get_event()
node.end_mark = end_event.end_mark
return node
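
For illustration, the composer can be driven through the package's top-level
compose() helper (defined in __init__.py) to inspect the node tree, including
anchors resolved to shared node objects:

    import yaml

    node = yaml.compose('base: &a [1, 2]\ncopy: *a')
    (k1, v1), (k2, v2) = node.value
    assert v1 is v2            # the alias maps to the same SequenceNode
    print(node.tag)            # tag:yaml.org,2002:map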

libs/yaml/constructor.py (new file)
@@ -0,0 +1,720 @@
__all__ = [
'BaseConstructor',
'SafeConstructor',
'FullConstructor',
'UnsafeConstructor',
'Constructor',
'ConstructorError'
]
from .error import *
from .nodes import *
import collections.abc, datetime, base64, binascii, re, sys, types
class ConstructorError(MarkedYAMLError):
pass
class BaseConstructor:
yaml_constructors = {}
yaml_multi_constructors = {}
def __init__(self):
self.constructed_objects = {}
self.recursive_objects = {}
self.state_generators = []
self.deep_construct = False
def check_data(self):
        # Are there more documents available?
return self.check_node()
def get_data(self):
# Construct and return the next document.
if self.check_node():
return self.construct_document(self.get_node())
def get_single_data(self):
# Ensure that the stream contains a single document and construct it.
node = self.get_single_node()
if node is not None:
return self.construct_document(node)
return None
def construct_document(self, node):
data = self.construct_object(node)
while self.state_generators:
state_generators = self.state_generators
self.state_generators = []
for generator in state_generators:
for dummy in generator:
pass
self.constructed_objects = {}
self.recursive_objects = {}
self.deep_construct = False
return data
def construct_object(self, node, deep=False):
if node in self.constructed_objects:
return self.constructed_objects[node]
if deep:
old_deep = self.deep_construct
self.deep_construct = True
if node in self.recursive_objects:
raise ConstructorError(None, None,
"found unconstructable recursive node", node.start_mark)
self.recursive_objects[node] = None
constructor = None
tag_suffix = None
if node.tag in self.yaml_constructors:
constructor = self.yaml_constructors[node.tag]
else:
for tag_prefix in self.yaml_multi_constructors:
if node.tag.startswith(tag_prefix):
tag_suffix = node.tag[len(tag_prefix):]
constructor = self.yaml_multi_constructors[tag_prefix]
break
else:
if None in self.yaml_multi_constructors:
tag_suffix = node.tag
constructor = self.yaml_multi_constructors[None]
elif None in self.yaml_constructors:
constructor = self.yaml_constructors[None]
elif isinstance(node, ScalarNode):
constructor = self.__class__.construct_scalar
elif isinstance(node, SequenceNode):
constructor = self.__class__.construct_sequence
elif isinstance(node, MappingNode):
constructor = self.__class__.construct_mapping
if tag_suffix is None:
data = constructor(self, node)
else:
data = constructor(self, tag_suffix, node)
if isinstance(data, types.GeneratorType):
generator = data
data = next(generator)
if self.deep_construct:
for dummy in generator:
pass
else:
self.state_generators.append(generator)
self.constructed_objects[node] = data
del self.recursive_objects[node]
if deep:
self.deep_construct = old_deep
return data
def construct_scalar(self, node):
if not isinstance(node, ScalarNode):
raise ConstructorError(None, None,
"expected a scalar node, but found %s" % node.id,
node.start_mark)
return node.value
def construct_sequence(self, node, deep=False):
if not isinstance(node, SequenceNode):
raise ConstructorError(None, None,
"expected a sequence node, but found %s" % node.id,
node.start_mark)
return [self.construct_object(child, deep=deep)
for child in node.value]
def construct_mapping(self, node, deep=False):
if not isinstance(node, MappingNode):
raise ConstructorError(None, None,
"expected a mapping node, but found %s" % node.id,
node.start_mark)
mapping = {}
for key_node, value_node in node.value:
key = self.construct_object(key_node, deep=deep)
if not isinstance(key, collections.abc.Hashable):
raise ConstructorError("while constructing a mapping", node.start_mark,
"found unhashable key", key_node.start_mark)
value = self.construct_object(value_node, deep=deep)
mapping[key] = value
return mapping
def construct_pairs(self, node, deep=False):
if not isinstance(node, MappingNode):
raise ConstructorError(None, None,
"expected a mapping node, but found %s" % node.id,
node.start_mark)
pairs = []
for key_node, value_node in node.value:
key = self.construct_object(key_node, deep=deep)
value = self.construct_object(value_node, deep=deep)
pairs.append((key, value))
return pairs
@classmethod
def add_constructor(cls, tag, constructor):
        if 'yaml_constructors' not in cls.__dict__:
cls.yaml_constructors = cls.yaml_constructors.copy()
cls.yaml_constructors[tag] = constructor
@classmethod
def add_multi_constructor(cls, tag_prefix, multi_constructor):
        if 'yaml_multi_constructors' not in cls.__dict__:
cls.yaml_multi_constructors = cls.yaml_multi_constructors.copy()
cls.yaml_multi_constructors[tag_prefix] = multi_constructor
class SafeConstructor(BaseConstructor):
def construct_scalar(self, node):
if isinstance(node, MappingNode):
for key_node, value_node in node.value:
if key_node.tag == 'tag:yaml.org,2002:value':
return self.construct_scalar(value_node)
return super().construct_scalar(node)
def flatten_mapping(self, node):
merge = []
index = 0
while index < len(node.value):
key_node, value_node = node.value[index]
if key_node.tag == 'tag:yaml.org,2002:merge':
del node.value[index]
if isinstance(value_node, MappingNode):
self.flatten_mapping(value_node)
merge.extend(value_node.value)
elif isinstance(value_node, SequenceNode):
submerge = []
for subnode in value_node.value:
if not isinstance(subnode, MappingNode):
raise ConstructorError("while constructing a mapping",
node.start_mark,
"expected a mapping for merging, but found %s"
% subnode.id, subnode.start_mark)
self.flatten_mapping(subnode)
submerge.append(subnode.value)
submerge.reverse()
for value in submerge:
merge.extend(value)
else:
raise ConstructorError("while constructing a mapping", node.start_mark,
"expected a mapping or list of mappings for merging, but found %s"
% value_node.id, value_node.start_mark)
elif key_node.tag == 'tag:yaml.org,2002:value':
key_node.tag = 'tag:yaml.org,2002:str'
index += 1
else:
index += 1
if merge:
node.value = merge + node.value
def construct_mapping(self, node, deep=False):
if isinstance(node, MappingNode):
self.flatten_mapping(node)
return super().construct_mapping(node, deep=deep)
def construct_yaml_null(self, node):
self.construct_scalar(node)
return None
bool_values = {
'yes': True,
'no': False,
'true': True,
'false': False,
'on': True,
'off': False,
}
def construct_yaml_bool(self, node):
value = self.construct_scalar(node)
return self.bool_values[value.lower()]
def construct_yaml_int(self, node):
value = self.construct_scalar(node)
value = value.replace('_', '')
sign = +1
if value[0] == '-':
sign = -1
if value[0] in '+-':
value = value[1:]
if value == '0':
return 0
elif value.startswith('0b'):
return sign*int(value[2:], 2)
elif value.startswith('0x'):
return sign*int(value[2:], 16)
elif value[0] == '0':
return sign*int(value, 8)
elif ':' in value:
digits = [int(part) for part in value.split(':')]
digits.reverse()
base = 1
value = 0
for digit in digits:
value += digit*base
base *= 60
return sign*value
else:
return sign*int(value)
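    # For illustration only (not part of the module): the ':' branch above
    # parses base-60 (sexagesimal) integers, e.g.
    # safe_load('190:20:30') == 685230.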
inf_value = 1e300
while inf_value != inf_value*inf_value:
inf_value *= inf_value
nan_value = -inf_value/inf_value # Trying to make a quiet NaN (like C99).
def construct_yaml_float(self, node):
value = self.construct_scalar(node)
value = value.replace('_', '').lower()
sign = +1
if value[0] == '-':
sign = -1
if value[0] in '+-':
value = value[1:]
if value == '.inf':
return sign*self.inf_value
elif value == '.nan':
return self.nan_value
elif ':' in value:
digits = [float(part) for part in value.split(':')]
digits.reverse()
base = 1
value = 0.0
for digit in digits:
value += digit*base
base *= 60
return sign*value
else:
return sign*float(value)
def construct_yaml_binary(self, node):
try:
value = self.construct_scalar(node).encode('ascii')
except UnicodeEncodeError as exc:
raise ConstructorError(None, None,
"failed to convert base64 data into ascii: %s" % exc,
node.start_mark)
try:
if hasattr(base64, 'decodebytes'):
return base64.decodebytes(value)
else:
return base64.decodestring(value)
except binascii.Error as exc:
raise ConstructorError(None, None,
"failed to decode base64 data: %s" % exc, node.start_mark)
timestamp_regexp = re.compile(
r'''^(?P<year>[0-9][0-9][0-9][0-9])
-(?P<month>[0-9][0-9]?)
-(?P<day>[0-9][0-9]?)
(?:(?:[Tt]|[ \t]+)
(?P<hour>[0-9][0-9]?)
:(?P<minute>[0-9][0-9])
:(?P<second>[0-9][0-9])
(?:\.(?P<fraction>[0-9]*))?
(?:[ \t]*(?P<tz>Z|(?P<tz_sign>[-+])(?P<tz_hour>[0-9][0-9]?)
(?::(?P<tz_minute>[0-9][0-9]))?))?)?$''', re.X)
def construct_yaml_timestamp(self, node):
        value = self.construct_scalar(node)  # validates that node is a scalar
        match = self.timestamp_regexp.match(value)
values = match.groupdict()
year = int(values['year'])
month = int(values['month'])
day = int(values['day'])
if not values['hour']:
return datetime.date(year, month, day)
hour = int(values['hour'])
minute = int(values['minute'])
second = int(values['second'])
fraction = 0
if values['fraction']:
fraction = values['fraction'][:6]
while len(fraction) < 6:
fraction += '0'
fraction = int(fraction)
delta = None
if values['tz_sign']:
tz_hour = int(values['tz_hour'])
tz_minute = int(values['tz_minute'] or 0)
delta = datetime.timedelta(hours=tz_hour, minutes=tz_minute)
if values['tz_sign'] == '-':
delta = -delta
data = datetime.datetime(year, month, day, hour, minute, second, fraction)
if delta:
data -= delta
return data
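    # For illustration only (not part of the module): a timezone offset is
    # folded into a naive UTC datetime by the code above, e.g.
    #   safe_load('2019-09-13 15:14:31-04:00')
    #   -> datetime.datetime(2019, 9, 13, 19, 14, 31)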
def construct_yaml_omap(self, node):
# Note: we do not check for duplicate keys, because it's too
# CPU-expensive.
omap = []
yield omap
if not isinstance(node, SequenceNode):
raise ConstructorError("while constructing an ordered map", node.start_mark,
"expected a sequence, but found %s" % node.id, node.start_mark)
for subnode in node.value:
if not isinstance(subnode, MappingNode):
raise ConstructorError("while constructing an ordered map", node.start_mark,
"expected a mapping of length 1, but found %s" % subnode.id,
subnode.start_mark)
if len(subnode.value) != 1:
raise ConstructorError("while constructing an ordered map", node.start_mark,
"expected a single mapping item, but found %d items" % len(subnode.value),
subnode.start_mark)
key_node, value_node = subnode.value[0]
key = self.construct_object(key_node)
value = self.construct_object(value_node)
omap.append((key, value))
def construct_yaml_pairs(self, node):
# Note: the same code as `construct_yaml_omap`.
pairs = []
yield pairs
if not isinstance(node, SequenceNode):
raise ConstructorError("while constructing pairs", node.start_mark,
"expected a sequence, but found %s" % node.id, node.start_mark)
for subnode in node.value:
if not isinstance(subnode, MappingNode):
raise ConstructorError("while constructing pairs", node.start_mark,
"expected a mapping of length 1, but found %s" % subnode.id,
subnode.start_mark)
if len(subnode.value) != 1:
raise ConstructorError("while constructing pairs", node.start_mark,
"expected a single mapping item, but found %d items" % len(subnode.value),
subnode.start_mark)
key_node, value_node = subnode.value[0]
key = self.construct_object(key_node)
value = self.construct_object(value_node)
pairs.append((key, value))
def construct_yaml_set(self, node):
data = set()
yield data
value = self.construct_mapping(node)
data.update(value)
def construct_yaml_str(self, node):
return self.construct_scalar(node)
def construct_yaml_seq(self, node):
data = []
yield data
data.extend(self.construct_sequence(node))
def construct_yaml_map(self, node):
data = {}
yield data
value = self.construct_mapping(node)
data.update(value)
def construct_yaml_object(self, node, cls):
data = cls.__new__(cls)
yield data
if hasattr(data, '__setstate__'):
state = self.construct_mapping(node, deep=True)
data.__setstate__(state)
else:
state = self.construct_mapping(node)
data.__dict__.update(state)
def construct_undefined(self, node):
raise ConstructorError(None, None,
"could not determine a constructor for the tag %r" % node.tag,
node.start_mark)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:null',
SafeConstructor.construct_yaml_null)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:bool',
SafeConstructor.construct_yaml_bool)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:int',
SafeConstructor.construct_yaml_int)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:float',
SafeConstructor.construct_yaml_float)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:binary',
SafeConstructor.construct_yaml_binary)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:timestamp',
SafeConstructor.construct_yaml_timestamp)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:omap',
SafeConstructor.construct_yaml_omap)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:pairs',
SafeConstructor.construct_yaml_pairs)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:set',
SafeConstructor.construct_yaml_set)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:str',
SafeConstructor.construct_yaml_str)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:seq',
SafeConstructor.construct_yaml_seq)
SafeConstructor.add_constructor(
'tag:yaml.org,2002:map',
SafeConstructor.construct_yaml_map)
SafeConstructor.add_constructor(None,
SafeConstructor.construct_undefined)
class FullConstructor(SafeConstructor):
def construct_python_str(self, node):
return self.construct_scalar(node)
def construct_python_unicode(self, node):
return self.construct_scalar(node)
def construct_python_bytes(self, node):
try:
value = self.construct_scalar(node).encode('ascii')
except UnicodeEncodeError as exc:
raise ConstructorError(None, None,
"failed to convert base64 data into ascii: %s" % exc,
node.start_mark)
try:
if hasattr(base64, 'decodebytes'):
return base64.decodebytes(value)
else:
return base64.decodestring(value)
except binascii.Error as exc:
raise ConstructorError(None, None,
"failed to decode base64 data: %s" % exc, node.start_mark)
def construct_python_long(self, node):
return self.construct_yaml_int(node)
def construct_python_complex(self, node):
return complex(self.construct_scalar(node))
def construct_python_tuple(self, node):
return tuple(self.construct_sequence(node))
def find_python_module(self, name, mark, unsafe=False):
if not name:
raise ConstructorError("while constructing a Python module", mark,
"expected non-empty name appended to the tag", mark)
if unsafe:
try:
__import__(name)
except ImportError as exc:
raise ConstructorError("while constructing a Python module", mark,
"cannot find module %r (%s)" % (name, exc), mark)
        if name not in sys.modules:
raise ConstructorError("while constructing a Python module", mark,
"module %r is not imported" % name, mark)
return sys.modules[name]
def find_python_name(self, name, mark, unsafe=False):
if not name:
raise ConstructorError("while constructing a Python object", mark,
"expected non-empty name appended to the tag", mark)
if '.' in name:
module_name, object_name = name.rsplit('.', 1)
else:
module_name = 'builtins'
object_name = name
if unsafe:
try:
__import__(module_name)
except ImportError as exc:
raise ConstructorError("while constructing a Python object", mark,
"cannot find module %r (%s)" % (module_name, exc), mark)
        if module_name not in sys.modules:
raise ConstructorError("while constructing a Python object", mark,
"module %r is not imported" % module_name, mark)
module = sys.modules[module_name]
if not hasattr(module, object_name):
raise ConstructorError("while constructing a Python object", mark,
"cannot find %r in the module %r"
% (object_name, module.__name__), mark)
return getattr(module, object_name)
def construct_python_name(self, suffix, node):
value = self.construct_scalar(node)
if value:
raise ConstructorError("while constructing a Python name", node.start_mark,
"expected the empty value, but found %r" % value, node.start_mark)
return self.find_python_name(suffix, node.start_mark)
def construct_python_module(self, suffix, node):
value = self.construct_scalar(node)
if value:
raise ConstructorError("while constructing a Python module", node.start_mark,
"expected the empty value, but found %r" % value, node.start_mark)
return self.find_python_module(suffix, node.start_mark)
def make_python_instance(self, suffix, node,
args=None, kwds=None, newobj=False, unsafe=False):
if not args:
args = []
if not kwds:
kwds = {}
cls = self.find_python_name(suffix, node.start_mark)
if not (unsafe or isinstance(cls, type)):
raise ConstructorError("while constructing a Python instance", node.start_mark,
"expected a class, but found %r" % type(cls),
node.start_mark)
if newobj and isinstance(cls, type):
return cls.__new__(cls, *args, **kwds)
else:
return cls(*args, **kwds)
def set_python_instance_state(self, instance, state):
if hasattr(instance, '__setstate__'):
instance.__setstate__(state)
else:
slotstate = {}
if isinstance(state, tuple) and len(state) == 2:
state, slotstate = state
if hasattr(instance, '__dict__'):
instance.__dict__.update(state)
elif state:
slotstate.update(state)
for key, value in slotstate.items():
                setattr(instance, key, value)
def construct_python_object(self, suffix, node):
# Format:
# !!python/object:module.name { ... state ... }
instance = self.make_python_instance(suffix, node, newobj=True)
yield instance
deep = hasattr(instance, '__setstate__')
state = self.construct_mapping(node, deep=deep)
self.set_python_instance_state(instance, state)
def construct_python_object_apply(self, suffix, node, newobj=False):
# Format:
# !!python/object/apply # (or !!python/object/new)
# args: [ ... arguments ... ]
# kwds: { ... keywords ... }
# state: ... state ...
# listitems: [ ... listitems ... ]
# dictitems: { ... dictitems ... }
# or short format:
# !!python/object/apply [ ... arguments ... ]
# The difference between !!python/object/apply and !!python/object/new
# is how an object is created, check make_python_instance for details.
if isinstance(node, SequenceNode):
args = self.construct_sequence(node, deep=True)
kwds = {}
state = {}
listitems = []
dictitems = {}
else:
value = self.construct_mapping(node, deep=True)
args = value.get('args', [])
kwds = value.get('kwds', {})
state = value.get('state', {})
listitems = value.get('listitems', [])
dictitems = value.get('dictitems', {})
instance = self.make_python_instance(suffix, node, args, kwds, newobj)
if state:
self.set_python_instance_state(instance, state)
if listitems:
instance.extend(listitems)
if dictitems:
for key in dictitems:
instance[key] = dictitems[key]
return instance
def construct_python_object_new(self, suffix, node):
return self.construct_python_object_apply(suffix, node, newobj=True)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/none',
FullConstructor.construct_yaml_null)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/bool',
FullConstructor.construct_yaml_bool)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/str',
FullConstructor.construct_python_str)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/unicode',
FullConstructor.construct_python_unicode)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/bytes',
FullConstructor.construct_python_bytes)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/int',
FullConstructor.construct_yaml_int)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/long',
FullConstructor.construct_python_long)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/float',
FullConstructor.construct_yaml_float)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/complex',
FullConstructor.construct_python_complex)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/list',
FullConstructor.construct_yaml_seq)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/tuple',
FullConstructor.construct_python_tuple)
FullConstructor.add_constructor(
'tag:yaml.org,2002:python/dict',
FullConstructor.construct_yaml_map)
FullConstructor.add_multi_constructor(
'tag:yaml.org,2002:python/name:',
FullConstructor.construct_python_name)
FullConstructor.add_multi_constructor(
'tag:yaml.org,2002:python/module:',
FullConstructor.construct_python_module)
FullConstructor.add_multi_constructor(
'tag:yaml.org,2002:python/object:',
FullConstructor.construct_python_object)
FullConstructor.add_multi_constructor(
'tag:yaml.org,2002:python/object/apply:',
FullConstructor.construct_python_object_apply)
FullConstructor.add_multi_constructor(
'tag:yaml.org,2002:python/object/new:',
FullConstructor.construct_python_object_new)
class UnsafeConstructor(FullConstructor):
def find_python_module(self, name, mark):
return super(UnsafeConstructor, self).find_python_module(name, mark, unsafe=True)
def find_python_name(self, name, mark):
return super(UnsafeConstructor, self).find_python_name(name, mark, unsafe=True)
def make_python_instance(self, suffix, node, args=None, kwds=None, newobj=False):
return super(UnsafeConstructor, self).make_python_instance(
suffix, node, args, kwds, newobj, unsafe=True)
# Constructor is the same as UnsafeConstructor. This alias is kept in place
# in case people have extended it directly.
class Constructor(UnsafeConstructor):
pass
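
A minimal sketch of extending SafeConstructor through a Loader (the '!point'
tag is hypothetical):

    import yaml

    def construct_point(loader, node):
        # Build a tuple from a '!point [x, y]' sequence node.
        return tuple(loader.construct_sequence(node))

    yaml.add_constructor('!point', construct_point, Loader=yaml.SafeLoader)
    print(yaml.safe_load('!point [1.0, 2.0]'))  # (1.0, 2.0)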

libs/yaml/cyaml.py (new file)
@@ -0,0 +1,101 @@
__all__ = [
'CBaseLoader', 'CSafeLoader', 'CFullLoader', 'CUnsafeLoader', 'CLoader',
'CBaseDumper', 'CSafeDumper', 'CDumper'
]
from _yaml import CParser, CEmitter
from .constructor import *
from .serializer import *
from .representer import *
from .resolver import *
class CBaseLoader(CParser, BaseConstructor, BaseResolver):
def __init__(self, stream):
CParser.__init__(self, stream)
BaseConstructor.__init__(self)
BaseResolver.__init__(self)
class CSafeLoader(CParser, SafeConstructor, Resolver):
def __init__(self, stream):
CParser.__init__(self, stream)
SafeConstructor.__init__(self)
Resolver.__init__(self)
class CFullLoader(CParser, FullConstructor, Resolver):
def __init__(self, stream):
CParser.__init__(self, stream)
FullConstructor.__init__(self)
Resolver.__init__(self)
class CUnsafeLoader(CParser, UnsafeConstructor, Resolver):
def __init__(self, stream):
CParser.__init__(self, stream)
UnsafeConstructor.__init__(self)
Resolver.__init__(self)
class CLoader(CParser, Constructor, Resolver):
def __init__(self, stream):
CParser.__init__(self, stream)
Constructor.__init__(self)
Resolver.__init__(self)
class CBaseDumper(CEmitter, BaseRepresenter, BaseResolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
CEmitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width, encoding=encoding,
allow_unicode=allow_unicode, line_break=line_break,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
Representer.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
class CSafeDumper(CEmitter, SafeRepresenter, Resolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
CEmitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width, encoding=encoding,
allow_unicode=allow_unicode, line_break=line_break,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
SafeRepresenter.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
class CDumper(CEmitter, Serializer, Representer, Resolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
CEmitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width, encoding=encoding,
allow_unicode=allow_unicode, line_break=line_break,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
Representer.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
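
These classes are drop-in replacements for the pure-Python ones and are the
reason for the usual import idiom (a sketch; falls back when the libyaml
bindings are not built):

    import yaml

    try:
        from yaml import CSafeLoader as SafeLoader
    except ImportError:
        from yaml import SafeLoader

    print(yaml.load('a: 1', Loader=SafeLoader))  # {'a': 1}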

libs/yaml/dumper.py (new file)
@@ -0,0 +1,62 @@
__all__ = ['BaseDumper', 'SafeDumper', 'Dumper']
from .emitter import *
from .serializer import *
from .representer import *
from .resolver import *
class BaseDumper(Emitter, Serializer, BaseRepresenter, BaseResolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
Emitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break)
Serializer.__init__(self, encoding=encoding,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
Representer.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
class SafeDumper(Emitter, Serializer, SafeRepresenter, Resolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
Emitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break)
Serializer.__init__(self, encoding=encoding,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
SafeRepresenter.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
class Dumper(Emitter, Serializer, Representer, Resolver):
def __init__(self, stream,
default_style=None, default_flow_style=False,
canonical=None, indent=None, width=None,
allow_unicode=None, line_break=None,
encoding=None, explicit_start=None, explicit_end=None,
version=None, tags=None, sort_keys=True):
Emitter.__init__(self, stream, canonical=canonical,
indent=indent, width=width,
allow_unicode=allow_unicode, line_break=line_break)
Serializer.__init__(self, encoding=encoding,
explicit_start=explicit_start, explicit_end=explicit_end,
version=version, tags=tags)
Representer.__init__(self, default_style=default_style,
default_flow_style=default_flow_style, sort_keys=sort_keys)
Resolver.__init__(self)
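
The sort_keys flag threaded through each constructor above controls mapping
key order on output; a small sketch:

    import yaml

    data = {'b': 2, 'a': 1}
    print(yaml.dump(data, Dumper=yaml.SafeDumper))                  # a first
    print(yaml.dump(data, Dumper=yaml.SafeDumper, sort_keys=False)) # b first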

libs/yaml/emitter.py (new file, 1137 lines)
(File diff suppressed because it is too large.)

libs/yaml/error.py (new file)
@@ -0,0 +1,75 @@
__all__ = ['Mark', 'YAMLError', 'MarkedYAMLError']
class Mark:
def __init__(self, name, index, line, column, buffer, pointer):
self.name = name
self.index = index
self.line = line
self.column = column
self.buffer = buffer
self.pointer = pointer
def get_snippet(self, indent=4, max_length=75):
if self.buffer is None:
return None
head = ''
start = self.pointer
while start > 0 and self.buffer[start-1] not in '\0\r\n\x85\u2028\u2029':
start -= 1
if self.pointer-start > max_length/2-1:
head = ' ... '
start += 5
break
tail = ''
end = self.pointer
while end < len(self.buffer) and self.buffer[end] not in '\0\r\n\x85\u2028\u2029':
end += 1
if end-self.pointer > max_length/2-1:
tail = ' ... '
end -= 5
break
snippet = self.buffer[start:end]
return ' '*indent + head + snippet + tail + '\n' \
+ ' '*(indent+self.pointer-start+len(head)) + '^'
def __str__(self):
snippet = self.get_snippet()
where = " in \"%s\", line %d, column %d" \
% (self.name, self.line+1, self.column+1)
if snippet is not None:
where += ":\n"+snippet
return where
class YAMLError(Exception):
pass
class MarkedYAMLError(YAMLError):
def __init__(self, context=None, context_mark=None,
problem=None, problem_mark=None, note=None):
self.context = context
self.context_mark = context_mark
self.problem = problem
self.problem_mark = problem_mark
self.note = note
def __str__(self):
lines = []
if self.context is not None:
lines.append(self.context)
if self.context_mark is not None \
and (self.problem is None or self.problem_mark is None
or self.context_mark.name != self.problem_mark.name
or self.context_mark.line != self.problem_mark.line
or self.context_mark.column != self.problem_mark.column):
lines.append(str(self.context_mark))
if self.problem is not None:
lines.append(self.problem)
if self.problem_mark is not None:
lines.append(str(self.problem_mark))
if self.note is not None:
lines.append(self.note)
return '\n'.join(lines)
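
MarkedYAMLError is the base of the parser, composer, and constructor errors,
so the marks can be used to report positions; a sketch:

    import yaml

    try:
        yaml.safe_load('a: [1, 2')  # unterminated flow sequence
    except yaml.MarkedYAMLError as exc:
        print(exc.problem)
        if exc.problem_mark is not None:
            mark = exc.problem_mark
            print('line %d, column %d' % (mark.line + 1, mark.column + 1))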

libs/yaml/events.py (new file)
@@ -0,0 +1,86 @@
# Abstract classes.
class Event(object):
def __init__(self, start_mark=None, end_mark=None):
self.start_mark = start_mark
self.end_mark = end_mark
def __repr__(self):
attributes = [key for key in ['anchor', 'tag', 'implicit', 'value']
if hasattr(self, key)]
arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
for key in attributes])
return '%s(%s)' % (self.__class__.__name__, arguments)
class NodeEvent(Event):
def __init__(self, anchor, start_mark=None, end_mark=None):
self.anchor = anchor
self.start_mark = start_mark
self.end_mark = end_mark
class CollectionStartEvent(NodeEvent):
def __init__(self, anchor, tag, implicit, start_mark=None, end_mark=None,
flow_style=None):
self.anchor = anchor
self.tag = tag
self.implicit = implicit
self.start_mark = start_mark
self.end_mark = end_mark
self.flow_style = flow_style
class CollectionEndEvent(Event):
pass
# Implementations.
class StreamStartEvent(Event):
def __init__(self, start_mark=None, end_mark=None, encoding=None):
self.start_mark = start_mark
self.end_mark = end_mark
self.encoding = encoding
class StreamEndEvent(Event):
pass
class DocumentStartEvent(Event):
def __init__(self, start_mark=None, end_mark=None,
explicit=None, version=None, tags=None):
self.start_mark = start_mark
self.end_mark = end_mark
self.explicit = explicit
self.version = version
self.tags = tags
class DocumentEndEvent(Event):
def __init__(self, start_mark=None, end_mark=None,
explicit=None):
self.start_mark = start_mark
self.end_mark = end_mark
self.explicit = explicit
class AliasEvent(NodeEvent):
pass
class ScalarEvent(NodeEvent):
def __init__(self, anchor, tag, implicit, value,
start_mark=None, end_mark=None, style=None):
self.anchor = anchor
self.tag = tag
self.implicit = implicit
self.value = value
self.start_mark = start_mark
self.end_mark = end_mark
self.style = style
class SequenceStartEvent(CollectionStartEvent):
pass
class SequenceEndEvent(CollectionEndEvent):
pass
class MappingStartEvent(CollectionStartEvent):
pass
class MappingEndEvent(CollectionEndEvent):
pass
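
Events are what the parser emits and the emitter consumes; the top-level
parse() helper exposes them directly (a sketch):

    import yaml

    for event in yaml.parse('a: 1'):
        print(type(event).__name__)
    # StreamStartEvent, DocumentStartEvent, MappingStartEvent,
    # ScalarEvent, ScalarEvent, MappingEndEvent,
    # DocumentEndEvent, StreamEndEvent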

Some files were not shown because too many files have changed in this diff.