diff --git a/Mylar.py b/Mylar.py
index f49d1ffb..ab6e7f15 100644
--- a/Mylar.py
+++ b/Mylar.py
@@ -21,16 +21,14 @@ import time
import threading
import signal
-from lib.configobj import ConfigObj
+sys.path.insert(1, os.path.join(os.path.dirname(__file__), 'lib'))
import mylar
from mylar import webstart, logger, filechecker, versioncheck
-try:
- import argparse
-except ImportError:
- import lib.argparse as argparse
+import argparse
+
if ( sys.platform == 'win32' and sys.executable.split( '\\' )[-1] == 'pythonw.exe'):
sys.stdout = open(os.devnull, "w")
@@ -198,6 +196,7 @@ def main():
i += 1
+ from configobj import ConfigObj
mylar.CFG = ConfigObj(mylar.CONFIG_FILE, encoding='utf-8')
# Rename the main thread
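The `sys.path.insert` line added above is what lets the deferred `from configobj import ConfigObj` (and every other vendored package) import by its bare name. A minimal sketch of the pattern, assuming only that a `lib/` directory of bundled packages sits beside the entry script:

```python
# Sketch of the vendored-lib import pattern this hunk adopts.
import os
import sys

# Put the bundled directory near the front of sys.path so 'import configobj'
# resolves to lib/configobj without the 'lib.' prefix at every import site.
sys.path.insert(1, os.path.join(os.path.dirname(os.path.abspath(__file__)), 'lib'))

from configobj import ConfigObj  # previously: from lib.configobj import ConfigObj
```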
diff --git a/bs4/__init__.py b/bs4/__init__.py
deleted file mode 100644
index d0fd1ff1..00000000
--- a/bs4/__init__.py
+++ /dev/null
@@ -1,355 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a pluggable XML or HTML parser to parse a
-(possibly invalid) document into a tree representation. Beautiful Soup
-provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
-
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
-and/or html5lib is installed.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-"""
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.1"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
-__license__ = "MIT"
-
-__all__ = ['BeautifulSoup']
-
-import re
-import warnings
-
-from .builder import builder_registry
-from .dammit import UnicodeDammit
-from .element import (
- CData,
- Comment,
- DEFAULT_OUTPUT_ENCODING,
- Declaration,
- Doctype,
- NavigableString,
- PageElement,
- ProcessingInstruction,
- ResultSet,
- SoupStrainer,
- Tag,
- )
-
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
-class BeautifulSoup(Tag):
- """
- This class defines the basic interface called by the tree builders.
-
- These methods will be called by the parser:
- reset()
- feed(markup)
-
- The tree builder may call these methods from its feed() implementation:
- handle_starttag(name, attrs) # See note about return value
- handle_endtag(name)
- handle_data(data) # Appends to the current data node
- endData(containerClass=NavigableString) # Ends the current data node
-
- No matter how complicated the underlying parser is, you should be
- able to build a tree using 'start tag' events, 'end tag' events,
- 'data' events, and "done with data" events.
-
- If you encounter an empty-element tag (aka a self-closing tag,
- like HTML's <br> tag), call handle_starttag and then
- handle_endtag.
- """
- ROOT_TAG_NAME = u'[document]'
-
- # If the end-user gives no indication which tree builder they
- # want, look for one with these features.
- DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
- # Used when determining whether a text node is all whitespace and
- # can be replaced with a single space. A text node that contains
- # fancy Unicode spaces (usually non-breaking) should be left
- # alone.
- STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
-
- def __init__(self, markup="", features=None, builder=None,
- parse_only=None, from_encoding=None, **kwargs):
- """The Soup object is initialized as the 'root tag', and the
- provided markup (which can be a string or a file-like object)
- is fed into the underlying parser."""
-
- if 'convertEntities' in kwargs:
- warnings.warn(
- "BS4 does not respect the convertEntities argument to the "
- "BeautifulSoup constructor. Entities are always converted "
- "to Unicode characters.")
-
- if 'markupMassage' in kwargs:
- del kwargs['markupMassage']
- warnings.warn(
- "BS4 does not respect the markupMassage argument to the "
- "BeautifulSoup constructor. The tree builder is responsible "
- "for any necessary markup massage.")
-
- if 'smartQuotesTo' in kwargs:
- del kwargs['smartQuotesTo']
- warnings.warn(
- "BS4 does not respect the smartQuotesTo argument to the "
- "BeautifulSoup constructor. Smart quotes are always converted "
- "to Unicode characters.")
-
- if 'selfClosingTags' in kwargs:
- del kwargs['selfClosingTags']
- warnings.warn(
- "BS4 does not respect the selfClosingTags argument to the "
- "BeautifulSoup constructor. The tree builder is responsible "
- "for understanding self-closing tags.")
-
- if 'isHTML' in kwargs:
- del kwargs['isHTML']
- warnings.warn(
- "BS4 does not respect the isHTML argument to the "
- "BeautifulSoup constructor. You can pass in features='html' "
- "or features='xml' to get a builder capable of handling "
- "one or the other.")
-
- def deprecated_argument(old_name, new_name):
- if old_name in kwargs:
- warnings.warn(
- 'The "%s" argument to the BeautifulSoup constructor '
- 'has been renamed to "%s."' % (old_name, new_name))
- value = kwargs[old_name]
- del kwargs[old_name]
- return value
- return None
-
- parse_only = parse_only or deprecated_argument(
- "parseOnlyThese", "parse_only")
-
- from_encoding = from_encoding or deprecated_argument(
- "fromEncoding", "from_encoding")
-
- if len(kwargs) > 0:
- arg = kwargs.keys().pop()
- raise TypeError(
- "__init__() got an unexpected keyword argument '%s'" % arg)
-
- if builder is None:
- if isinstance(features, basestring):
- features = [features]
- if features is None or len(features) == 0:
- features = self.DEFAULT_BUILDER_FEATURES
- builder_class = builder_registry.lookup(*features)
- if builder_class is None:
- raise ValueError(
- "Couldn't find a tree builder with the features you "
- "requested: %s. Do you need to install a parser library?"
- % ",".join(features))
- builder = builder_class()
- self.builder = builder
- self.is_xml = builder.is_xml
- self.builder.soup = self
-
- self.parse_only = parse_only
-
- self.reset()
-
- if hasattr(markup, 'read'): # It's a file-type object.
- markup = markup.read()
- (self.markup, self.original_encoding, self.declared_html_encoding,
- self.contains_replacement_characters) = (
- self.builder.prepare_markup(markup, from_encoding))
-
- try:
- self._feed()
- except StopParsing:
- pass
-
- # Clear out the markup and remove the builder's circular
- # reference to this object.
- self.markup = None
- self.builder.soup = None
-
- def _feed(self):
- # Convert the document to Unicode.
- self.builder.reset()
-
- self.builder.feed(self.markup)
- # Close out any unfinished strings and close all the open tags.
- self.endData()
- while self.currentTag.name != self.ROOT_TAG_NAME:
- self.popTag()
-
- def reset(self):
- Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
- self.hidden = 1
- self.builder.reset()
- self.currentData = []
- self.currentTag = None
- self.tagStack = []
- self.pushTag(self)
-
- def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
- """Create a new tag associated with this soup."""
- return Tag(None, self.builder, name, namespace, nsprefix, attrs)
-
- def new_string(self, s):
- """Create a new NavigableString associated with this soup."""
- navigable = NavigableString(s)
- navigable.setup()
- return navigable
-
- def insert_before(self, successor):
- raise ValueError("BeautifulSoup objects don't support insert_before().")
-
- def insert_after(self, successor):
- raise ValueError("BeautifulSoup objects don't support insert_after().")
-
- def popTag(self):
- tag = self.tagStack.pop()
- #print "Pop", tag.name
- if self.tagStack:
- self.currentTag = self.tagStack[-1]
- return self.currentTag
-
- def pushTag(self, tag):
- #print "Push", tag.name
- if self.currentTag:
- self.currentTag.contents.append(tag)
- self.tagStack.append(tag)
- self.currentTag = self.tagStack[-1]
-
- def endData(self, containerClass=NavigableString):
- if self.currentData:
- currentData = u''.join(self.currentData)
- if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
- not set([tag.name for tag in self.tagStack]).intersection(
- self.builder.preserve_whitespace_tags)):
- if '\n' in currentData:
- currentData = '\n'
- else:
- currentData = ' '
- self.currentData = []
- if self.parse_only and len(self.tagStack) <= 1 and \
- (not self.parse_only.text or \
- not self.parse_only.search(currentData)):
- return
- o = containerClass(currentData)
- self.object_was_parsed(o)
-
- def object_was_parsed(self, o):
- """Add an object to the parse tree."""
- o.setup(self.currentTag, self.previous_element)
- if self.previous_element:
- self.previous_element.next_element = o
- self.previous_element = o
- self.currentTag.contents.append(o)
-
- def _popToTag(self, name, nsprefix=None, inclusivePop=True):
- """Pops the tag stack up to and including the most recent
- instance of the given tag. If inclusivePop is false, pops the tag
- stack up to but *not* including the most recent instance of
- the given tag."""
- #print "Popping to %s" % name
- if name == self.ROOT_TAG_NAME:
- return
-
- numPops = 0
- mostRecentTag = None
-
- for i in range(len(self.tagStack) - 1, 0, -1):
- if (name == self.tagStack[i].name
- and nsprefix == self.tagStack[i].prefix):
- numPops = len(self.tagStack) - i
- break
- if not inclusivePop:
- numPops = numPops - 1
-
- for i in range(0, numPops):
- mostRecentTag = self.popTag()
- return mostRecentTag
-
- def handle_starttag(self, name, namespace, nsprefix, attrs):
- """Push a start tag on to the stack.
-
- If this method returns None, the tag was rejected by the
- SoupStrainer. You should proceed as if the tag had not occurred
- in the document. For instance, if this was a self-closing tag,
- don't call handle_endtag.
- """
-
- # print "Start tag %s: %s" % (name, attrs)
- self.endData()
-
- if (self.parse_only and len(self.tagStack) <= 1
- and (self.parse_only.text
- or not self.parse_only.search_tag(name, attrs))):
- return None
-
- tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
- self.currentTag, self.previous_element)
- if tag is None:
- return tag
- if self.previous_element:
- self.previous_element.next_element = tag
- self.previous_element = tag
- self.pushTag(tag)
- return tag
-
- def handle_endtag(self, name, nsprefix=None):
- #print "End tag: " + name
- self.endData()
- self._popToTag(name, nsprefix)
-
- def handle_data(self, data):
- self.currentData.append(data)
-
- def decode(self, pretty_print=False,
- eventual_encoding=DEFAULT_OUTPUT_ENCODING,
- formatter="minimal"):
- """Returns a string or Unicode representation of this document.
- To get Unicode, pass None for encoding."""
-
- if self.is_xml:
- # Print the XML declaration
- encoding_part = ''
- if eventual_encoding != None:
- encoding_part = ' encoding="%s"' % eventual_encoding
- prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
- else:
- prefix = u''
- if not pretty_print:
- indent_level = None
- else:
- indent_level = 0
- return prefix + super(BeautifulSoup, self).decode(
- indent_level, eventual_encoding, formatter)
-
-class BeautifulStoneSoup(BeautifulSoup):
- """Deprecated interface to an XML parser."""
-
- def __init__(self, *args, **kwargs):
- kwargs['features'] = 'xml'
- warnings.warn(
- 'The BeautifulStoneSoup class is deprecated. Instead of using '
- 'it, pass features="xml" into the BeautifulSoup constructor.')
- super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
-
-
-class StopParsing(Exception):
- pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
- import sys
- soup = BeautifulSoup(sys.stdin)
- print soup.prettify()
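For orientation, the event interface this (now deleted) docstring describes can be exercised by hand; a hypothetical illustration using the `handle_*` signatures shown above (real tree builders issue these calls from their `feed()` implementations):

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("", "html.parser")    # empty root document
soup.handle_starttag("p", None, None, {})  # 'start tag' event
soup.handle_data("hi")                     # 'data' event (appends to data node)
soup.handle_endtag("p")                    # 'end tag' event; closes the data node
print(soup.p.string)                       # hi
```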
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
deleted file mode 100644
index 6001e386..00000000
--- a/bs4/builder/_html5lib.py
+++ /dev/null
@@ -1,222 +0,0 @@
-__all__ = [
- 'HTML5TreeBuilder',
- ]
-
-import warnings
-from bs4.builder import (
- PERMISSIVE,
- HTML,
- HTML_5,
- HTMLTreeBuilder,
- )
-from bs4.element import NamespacedAttribute
-import html5lib
-from html5lib.constants import namespaces
-from bs4.element import (
- Comment,
- Doctype,
- NavigableString,
- Tag,
- )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
- """Use html5lib to build a tree."""
-
- features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
- def prepare_markup(self, markup, user_specified_encoding):
- # Store the user-specified encoding for use later on.
- self.user_specified_encoding = user_specified_encoding
- return markup, None, None, False
-
- # These methods are defined by Beautiful Soup.
- def feed(self, markup):
- if self.soup.parse_only is not None:
- warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
- parser = html5lib.HTMLParser(tree=self.create_treebuilder)
- doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
- # Set the character encoding detected by the tokenizer.
- if isinstance(markup, unicode):
- # We need to special-case this because html5lib sets
- # charEncoding to UTF-8 if it gets Unicode input.
- doc.original_encoding = None
- else:
- doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
- def create_treebuilder(self, namespaceHTMLElements):
- self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
- return self.underlying_builder
-
- def test_fragment_to_document(self, fragment):
- """See `TreeBuilder`."""
- return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
- super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
- def documentClass(self):
- self.soup.reset()
- return Element(self.soup, self.soup, None)
-
- def insertDoctype(self, token):
- name = token["name"]
- publicId = token["publicId"]
- systemId = token["systemId"]
-
- doctype = Doctype.for_name_and_ids(name, publicId, systemId)
- self.soup.object_was_parsed(doctype)
-
- def elementClass(self, name, namespace):
- tag = self.soup.new_tag(name, namespace)
- return Element(tag, self.soup, namespace)
-
- def commentClass(self, data):
- return TextNode(Comment(data), self.soup)
-
- def fragmentClass(self):
- self.soup = BeautifulSoup("")
- self.soup.name = "[document_fragment]"
- return Element(self.soup, self.soup, None)
-
- def appendChild(self, node):
- # XXX This code is not covered by the BS4 tests.
- self.soup.append(node.element)
-
- def getDocument(self):
- return self.soup
-
- def getFragment(self):
- return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
- def __init__(self, element):
- self.element = element
- self.attrs = dict(self.element.attrs)
- def __iter__(self):
- return list(self.attrs.items()).__iter__()
- def __setitem__(self, name, value):
- "set attr", name, value
- self.element[name] = value
- def items(self):
- return list(self.attrs.items())
- def keys(self):
- return list(self.attrs.keys())
- def __len__(self):
- return len(self.attrs)
- def __getitem__(self, name):
- return self.attrs[name]
- def __contains__(self, name):
- return name in list(self.attrs.keys())
-
-
-class Element(html5lib.treebuilders._base.Node):
- def __init__(self, element, soup, namespace):
- html5lib.treebuilders._base.Node.__init__(self, element.name)
- self.element = element
- self.soup = soup
- self.namespace = namespace
-
- def appendChild(self, node):
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[-1].__class__ == NavigableString):
- # Concatenate new text onto old text node
- # XXX This has O(n^2) performance, for input like
- # "aaa..."
- old_element = self.element.contents[-1]
- new_element = self.soup.new_string(old_element + node.element)
- old_element.replace_with(new_element)
- else:
- self.element.append(node.element)
- node.parent = self
-
- def getAttributes(self):
- return AttrList(self.element)
-
- def setAttributes(self, attributes):
- if attributes is not None and len(attributes) > 0:
-
- converted_attributes = []
- for name, value in list(attributes.items()):
- if isinstance(name, tuple):
- new_name = NamespacedAttribute(*name)
- del attributes[name]
- attributes[new_name] = value
-
- self.soup.builder._replace_cdata_list_attribute_values(
- self.name, attributes)
- for name, value in attributes.items():
- self.element[name] = value
-
- # The attributes may contain variables that need substitution.
- # Call set_up_substitutions manually.
- #
- # The Tag constructor called this method when the Tag was created,
- # but we just set/changed the attributes, so call it again.
- self.soup.builder.set_up_substitutions(self.element)
- attributes = property(getAttributes, setAttributes)
-
- def insertText(self, data, insertBefore=None):
- text = TextNode(self.soup.new_string(data), self.soup)
- if insertBefore:
- self.insertBefore(text, insertBefore)
- else:
- self.appendChild(text)
-
- def insertBefore(self, node, refNode):
- index = self.element.index(refNode.element)
- if (node.element.__class__ == NavigableString and self.element.contents
- and self.element.contents[index-1].__class__ == NavigableString):
- # (See comments in appendChild)
- old_node = self.element.contents[index-1]
- new_str = self.soup.new_string(old_node + node.element)
- old_node.replace_with(new_str)
- else:
- self.element.insert(index, node.element)
- node.parent = self
-
- def removeChild(self, node):
- node.element.extract()
-
- def reparentChildren(self, newParent):
- while self.element.contents:
- child = self.element.contents[0]
- child.extract()
- if isinstance(child, Tag):
- newParent.appendChild(
- Element(child, self.soup, namespaces["html"]))
- else:
- newParent.appendChild(
- TextNode(child, self.soup))
-
- def cloneNode(self):
- tag = self.soup.new_tag(self.element.name, self.namespace)
- node = Element(tag, self.soup, self.namespace)
- for key,value in self.attributes:
- node.attributes[key] = value
- return node
-
- def hasContent(self):
- return self.element.contents
-
- def getNameTuple(self):
- if self.namespace == None:
- return namespaces["html"], self.name
- else:
- return self.namespace, self.name
-
- nameTuple = property(getNameTuple)
-
-class TextNode(Element):
- def __init__(self, element, soup):
- html5lib.treebuilders._base.Node.__init__(self, None)
- self.element = element
- self.soup = soup
-
- def cloneNode(self):
- raise NotImplementedError
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
deleted file mode 100644
index f195f7d0..00000000
--- a/bs4/tests/test_html5lib.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""Tests to ensure that the html5lib tree builder generates good trees."""
-
-import warnings
-
-try:
- from bs4.builder import HTML5TreeBuilder
- HTML5LIB_PRESENT = True
-except ImportError, e:
- HTML5LIB_PRESENT = False
-from bs4.element import SoupStrainer
-from bs4.testing import (
- HTML5TreeBuilderSmokeTest,
- SoupTest,
- skipIf,
-)
-
-@skipIf(
- not HTML5LIB_PRESENT,
- "html5lib seems not to be present, not testing its tree builder.")
-class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
- """See ``HTML5TreeBuilderSmokeTest``."""
-
- @property
- def default_builder(self):
- return HTML5TreeBuilder()
-
- def test_soupstrainer(self):
- # The html5lib tree builder does not support SoupStrainers.
- strainer = SoupStrainer("b")
- markup = "A bold statement.
"
- with warnings.catch_warnings(record=True) as w:
- soup = self.soup(markup, parse_only=strainer)
- self.assertEqual(
- soup.decode(), self.document_for(markup))
-
- self.assertTrue(
- "the html5lib tree builder doesn't support parse_only" in
- str(w[0].message))
-
- def test_correctly_nested_tables(self):
- """html5lib inserts tags where other parsers don't."""
- markup = (''
- ''
- "Here's another table:"
- ' | ')
-
- self.assertSoupEquals(
- markup,
- 'Here\'s another table:'
- ''
- ' |
')
-
- self.assertSoupEquals(
- "")
diff --git a/comictagger.py b/comictagger.py
index 1932b46f..35250929 100755
--- a/comictagger.py
+++ b/comictagger.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
-from lib.comictaggerlib.main import ctmain
+from comictaggerlib.main import ctmain
if __name__ == '__main__':
ctmain()
diff --git a/lib/apscheduler/__init__.py b/lib/apscheduler/__init__.py
old mode 100755
new mode 100644
index 6b502147..11e93a1d
--- a/lib/apscheduler/__init__.py
+++ b/lib/apscheduler/__init__.py
@@ -1,3 +1,3 @@
-version_info = (2, 0, 0, 'rc', 2)
+version_info = (2, 0, 0)
version = '.'.join(str(n) for n in version_info[:3])
release = version + ''.join(str(n) for n in version_info[3:])
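Dropping `('rc', 2)` from `version_info` changes only the derived `release` string; `version` already ignored the suffix. A quick worked check of both tuples:

```python
# Worked check of the derived strings for the old and new tuples.
def derive(version_info):
    version = '.'.join(str(n) for n in version_info[:3])
    release = version + ''.join(str(n) for n in version_info[3:])
    return version, release

print(derive((2, 0, 0, 'rc', 2)))  # ('2.0.0', '2.0.0rc2')  old tuple
print(derive((2, 0, 0)))           # ('2.0.0', '2.0.0')     new tuple
```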
diff --git a/lib/apscheduler/events.py b/lib/apscheduler/events.py
old mode 100755
new mode 100644
diff --git a/lib/apscheduler/job.py b/lib/apscheduler/job.py
old mode 100755
new mode 100644
index c863bc3b..868e7234
--- a/lib/apscheduler/job.py
+++ b/lib/apscheduler/job.py
@@ -5,7 +5,7 @@ Jobs represent scheduled tasks.
from threading import Lock
from datetime import timedelta
-from lib.apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\
+from apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\
obj_to_ref
diff --git a/lib/apscheduler/jobstores/__init__.py b/lib/apscheduler/jobstores/__init__.py
old mode 100755
new mode 100644
diff --git a/lib/apscheduler/jobstores/base.py b/lib/apscheduler/jobstores/base.py
old mode 100755
new mode 100644
diff --git a/lib/apscheduler/jobstores/mongodb_store.py b/lib/apscheduler/jobstores/mongodb_store.py
old mode 100755
new mode 100644
index d1093860..3f522c25
--- a/lib/apscheduler/jobstores/mongodb_store.py
+++ b/lib/apscheduler/jobstores/mongodb_store.py
@@ -3,8 +3,8 @@ Stores jobs in a MongoDB database.
"""
import logging
-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job
try:
import cPickle as pickle
diff --git a/lib/apscheduler/jobstores/ram_store.py b/lib/apscheduler/jobstores/ram_store.py
old mode 100755
new mode 100644
index 1c3c667e..85091fe8
--- a/lib/apscheduler/jobstores/ram_store.py
+++ b/lib/apscheduler/jobstores/ram_store.py
@@ -2,7 +2,7 @@
Stores jobs in an array in RAM. Provides no persistence support.
"""
-from lib.apscheduler.jobstores.base import JobStore
+from apscheduler.jobstores.base import JobStore
class RAMJobStore(JobStore):
diff --git a/lib/apscheduler/jobstores/shelve_store.py b/lib/apscheduler/jobstores/shelve_store.py
old mode 100755
new mode 100644
index f29d53cb..87c95f8f
--- a/lib/apscheduler/jobstores/shelve_store.py
+++ b/lib/apscheduler/jobstores/shelve_store.py
@@ -7,9 +7,9 @@ import pickle
import random
import logging
-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
-from lib.apscheduler.util import itervalues
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job
+from apscheduler.util import itervalues
logger = logging.getLogger(__name__)
diff --git a/lib/apscheduler/jobstores/sqlalchemy_store.py b/lib/apscheduler/jobstores/sqlalchemy_store.py
old mode 100755
new mode 100644
index c0fee127..8ece7e24
--- a/lib/apscheduler/jobstores/sqlalchemy_store.py
+++ b/lib/apscheduler/jobstores/sqlalchemy_store.py
@@ -4,8 +4,8 @@ Stores jobs in a database table using SQLAlchemy.
import pickle
import logging
-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job
try:
from sqlalchemy import *
diff --git a/lib/apscheduler/scheduler.py b/lib/apscheduler/scheduler.py
old mode 100755
new mode 100644
index 461cfea4..ee08ad8b
--- a/lib/apscheduler/scheduler.py
+++ b/lib/apscheduler/scheduler.py
@@ -9,12 +9,12 @@ from logging import getLogger
import os
import sys
-from lib.apscheduler.util import *
-from lib.apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
-from lib.apscheduler.jobstores.ram_store import RAMJobStore
-from lib.apscheduler.job import Job, MaxInstancesReachedError
-from lib.apscheduler.events import *
-from lib.apscheduler.threadpool import ThreadPool
+from apscheduler.util import *
+from apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
+from apscheduler.jobstores.ram_store import RAMJobStore
+from apscheduler.job import Job, MaxInstancesReachedError
+from apscheduler.events import *
+from apscheduler.threadpool import ThreadPool
logger = getLogger(__name__)
diff --git a/lib/apscheduler/threadpool.py b/lib/apscheduler/threadpool.py
old mode 100755
new mode 100644
diff --git a/lib/apscheduler/triggers/__init__.py b/lib/apscheduler/triggers/__init__.py
old mode 100755
new mode 100644
index a40ece20..74a97884
--- a/lib/apscheduler/triggers/__init__.py
+++ b/lib/apscheduler/triggers/__init__.py
@@ -1,3 +1,3 @@
-from lib.apscheduler.triggers.cron import CronTrigger
-from lib.apscheduler.triggers.interval import IntervalTrigger
-from lib.apscheduler.triggers.simple import SimpleTrigger
+from apscheduler.triggers.cron import CronTrigger
+from apscheduler.triggers.interval import IntervalTrigger
+from apscheduler.triggers.simple import SimpleTrigger
diff --git a/lib/apscheduler/triggers/cron/__init__.py b/lib/apscheduler/triggers/cron/__init__.py
old mode 100755
new mode 100644
index 665d2dae..3f8d9a8f
--- a/lib/apscheduler/triggers/cron/__init__.py
+++ b/lib/apscheduler/triggers/cron/__init__.py
@@ -1,7 +1,7 @@
from datetime import date, datetime
-from lib.apscheduler.triggers.cron.fields import *
-from lib.apscheduler.util import datetime_ceil, convert_to_datetime
+from apscheduler.triggers.cron.fields import *
+from apscheduler.util import datetime_ceil, convert_to_datetime
class CronTrigger(object):
diff --git a/lib/apscheduler/triggers/cron/expressions.py b/lib/apscheduler/triggers/cron/expressions.py
old mode 100755
new mode 100644
index 646d6f3c..018c7a30
--- a/lib/apscheduler/triggers/cron/expressions.py
+++ b/lib/apscheduler/triggers/cron/expressions.py
@@ -5,7 +5,7 @@ This module contains the expressions applicable for CronTrigger's fields.
from calendar import monthrange
import re
-from lib.apscheduler.util import asint
+from apscheduler.util import asint
__all__ = ('AllExpression', 'RangeExpression', 'WeekdayRangeExpression',
'WeekdayPositionExpression')
diff --git a/lib/apscheduler/triggers/cron/fields.py b/lib/apscheduler/triggers/cron/fields.py
old mode 100755
new mode 100644
index 24cb1b31..ef970cc9
--- a/lib/apscheduler/triggers/cron/fields.py
+++ b/lib/apscheduler/triggers/cron/fields.py
@@ -5,7 +5,7 @@ fields.
from calendar import monthrange
-from lib.apscheduler.triggers.cron.expressions import *
+from apscheduler.triggers.cron.expressions import *
__all__ = ('MIN_VALUES', 'MAX_VALUES', 'DEFAULT_VALUES', 'BaseField',
'WeekField', 'DayOfMonthField', 'DayOfWeekField')
diff --git a/lib/apscheduler/triggers/interval.py b/lib/apscheduler/triggers/interval.py
old mode 100755
new mode 100644
index a7c1ee11..dd16d777
--- a/lib/apscheduler/triggers/interval.py
+++ b/lib/apscheduler/triggers/interval.py
@@ -1,7 +1,7 @@
from datetime import datetime, timedelta
from math import ceil
-from lib.apscheduler.util import convert_to_datetime, timedelta_seconds
+from apscheduler.util import convert_to_datetime, timedelta_seconds
class IntervalTrigger(object):
diff --git a/lib/apscheduler/triggers/simple.py b/lib/apscheduler/triggers/simple.py
old mode 100755
new mode 100644
index 702ed78b..ea61b3f1
--- a/lib/apscheduler/triggers/simple.py
+++ b/lib/apscheduler/triggers/simple.py
@@ -1,4 +1,4 @@
-from lib.apscheduler.util import convert_to_datetime
+from apscheduler.util import convert_to_datetime
class SimpleTrigger(object):
diff --git a/lib/apscheduler/util.py b/lib/apscheduler/util.py
old mode 100755
new mode 100644
index af28ae49..a49aaed8
--- a/lib/apscheduler/util.py
+++ b/lib/apscheduler/util.py
@@ -6,6 +6,7 @@ from datetime import date, datetime, timedelta
from time import mktime
import re
import sys
+from types import MethodType
__all__ = ('asint', 'asbool', 'convert_to_datetime', 'timedelta_seconds',
'time_difference', 'datetime_ceil', 'combine_opts',
@@ -108,7 +109,7 @@ def datetime_ceil(dateval):
"""
if dateval.microsecond > 0:
return dateval + timedelta(seconds=1,
- microseconds=-dateval.microsecond)
+ microseconds= -dateval.microsecond)
return dateval
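The hunk above only reflows whitespace; `datetime_ceil` still rounds a datetime up to the next whole second by adding one second and subtracting the current microseconds. A self-contained check mirroring the function:

```python
# Self-contained check of datetime_ceil's rounding behavior.
from datetime import datetime, timedelta

def datetime_ceil(dateval):
    if dateval.microsecond > 0:
        return dateval + timedelta(seconds=1, microseconds=-dateval.microsecond)
    return dateval

print(datetime_ceil(datetime(2012, 1, 1, 0, 0, 0, 500000)))  # 2012-01-01 00:00:01
print(datetime_ceil(datetime(2012, 1, 1, 0, 0, 1)))          # unchanged
```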
@@ -137,41 +138,64 @@ def get_callable_name(func):
"""
Returns the best available display name for the given function/callable.
"""
- name = func.__module__
- if hasattr(func, '__self__') and func.__self__:
- name += '.' + func.__self__.__name__
- elif hasattr(func, 'im_self') and func.im_self: # py2.4, 2.5
- name += '.' + func.im_self.__name__
- if hasattr(func, '__name__'):
- name += '.' + func.__name__
- return name
+ f_self = getattr(func, '__self__', None) or getattr(func, 'im_self', None)
+
+ if f_self and hasattr(func, '__name__'):
+ if isinstance(f_self, type):
+ # class method
+ return '%s.%s' % (f_self.__name__, func.__name__)
+ # bound method
+ return '%s.%s' % (f_self.__class__.__name__, func.__name__)
+
+ if hasattr(func, '__call__'):
+ if hasattr(func, '__name__'):
+ # function, unbound method or a class with a __call__ method
+ return func.__name__
+ # instance of a class with a __call__ method
+ return func.__class__.__name__
+
+ raise TypeError('Unable to determine a name for %s -- '
+ 'maybe it is not a callable?' % repr(func))
def obj_to_ref(obj):
"""
Returns the path to the given object.
"""
- ref = '%s:%s' % (obj.__module__, obj.__name__)
+ ref = '%s:%s' % (obj.__module__, get_callable_name(obj))
try:
obj2 = ref_to_obj(ref)
- except AttributeError:
- pass
- else:
- if obj2 == obj:
- return ref
-
- raise ValueError('Only module level objects are supported')
+ if obj != obj2:
+ raise ValueError
+ except Exception:
+ raise ValueError('Cannot determine the reference to %s' % repr(obj))
+
+ return ref
def ref_to_obj(ref):
"""
Returns the object pointed to by ``ref``.
"""
+ if not isinstance(ref, basestring):
+ raise TypeError('References must be strings')
+ if not ':' in ref:
+ raise ValueError('Invalid reference')
+
modulename, rest = ref.split(':', 1)
- obj = __import__(modulename)
- for name in modulename.split('.')[1:] + rest.split('.'):
- obj = getattr(obj, name)
- return obj
+ try:
+ obj = __import__(modulename)
+ except ImportError:
+ raise LookupError('Error resolving reference %s: '
+ 'could not import module' % ref)
+
+ try:
+ for name in modulename.split('.')[1:] + rest.split('.'):
+ obj = getattr(obj, name)
+ return obj
+ except Exception:
+ raise LookupError('Error resolving reference %s: '
+ 'error looking up object' % ref)
def maybe_ref(ref):
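The reworked `obj_to_ref`/`ref_to_obj` now verify the round trip and raise descriptive errors on bad input. A sketch of the behavior, assuming `lib/` is on `sys.path` so `apscheduler` imports by its bare name (as the other hunks in this diff arrange); the module named in the reference depends on the platform:

```python
from apscheduler.util import obj_to_ref, ref_to_obj
import os.path

ref = obj_to_ref(os.path.join)           # e.g. 'posixpath:join' on POSIX
print(ref_to_obj(ref) is os.path.join)   # True: the reference resolves back

# Malformed references now fail with a clear error instead of an
# AttributeError from deep inside getattr():
try:
    ref_to_obj('no-colon-here')
except ValueError as e:
    print(e)                             # Invalid reference
```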
@@ -191,14 +215,16 @@ def to_unicode(string, encoding='ascii'):
"""
if hasattr(string, 'decode'):
return string.decode(encoding, 'ignore')
- return string
+ return string # pragma: nocover
if sys.version_info < (3, 0): # pragma: nocover
iteritems = lambda d: d.iteritems()
itervalues = lambda d: d.itervalues()
xrange = xrange
+ basestring = basestring
else: # pragma: nocover
iteritems = lambda d: d.items()
itervalues = lambda d: d.values()
xrange = range
+ basestring = str
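The new `basestring` export rounds out the 2/3 compatibility shim so callers (such as the `isinstance` check in `ref_to_obj`) can use one name on either interpreter. A small usage sketch, again assuming `apscheduler` is importable:

```python
from apscheduler.util import iteritems, basestring

d = {'a': 1, 'b': 2}
print(sorted(iteritems(d)))               # [('a', 1), ('b', 2)] on 2.x and 3.x
print(isinstance('job:ref', basestring))  # True under either interpreter
```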
diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
new file mode 100644
index 00000000..aa818ae4
--- /dev/null
+++ b/lib/bs4/__init__.py
@@ -0,0 +1,529 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup uses a pluggable XML or HTML parser to parse a
+(possibly invalid) document into a tree representation. Beautiful Soup
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
+
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
+and/or html5lib is installed.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
+"""
+
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "4.5.1"
+__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
+__license__ = "MIT"
+
+__all__ = ['BeautifulSoup']
+
+import os
+import re
+import traceback
+import warnings
+
+from .builder import builder_registry, ParserRejectedMarkup
+from .dammit import UnicodeDammit
+from .element import (
+ CData,
+ Comment,
+ DEFAULT_OUTPUT_ENCODING,
+ Declaration,
+ Doctype,
+ NavigableString,
+ PageElement,
+ ProcessingInstruction,
+ ResultSet,
+ SoupStrainer,
+ Tag,
+ )
+
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 3 without converting it.
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+
+class BeautifulSoup(Tag):
+ """
+ This class defines the basic interface called by the tree builders.
+
+ These methods will be called by the parser:
+ reset()
+ feed(markup)
+
+ The tree builder may call these methods from its feed() implementation:
+ handle_starttag(name, attrs) # See note about return value
+ handle_endtag(name)
+ handle_data(data) # Appends to the current data node
+ endData(containerClass=NavigableString) # Ends the current data node
+
+ No matter how complicated the underlying parser is, you should be
+ able to build a tree using 'start tag' events, 'end tag' events,
+ 'data' events, and "done with data" events.
+
+ If you encounter an empty-element tag (aka a self-closing tag,
+ like HTML's <br> tag), call handle_starttag and then
+ handle_endtag.
+ """
+ ROOT_TAG_NAME = u'[document]'
+
+ # If the end-user gives no indication which tree builder they
+ # want, look for one with these features.
+ DEFAULT_BUILDER_FEATURES = ['html', 'fast']
+
+ ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+
+ def __init__(self, markup="", features=None, builder=None,
+ parse_only=None, from_encoding=None, exclude_encodings=None,
+ **kwargs):
+ """The Soup object is initialized as the 'root tag', and the
+ provided markup (which can be a string or a file-like object)
+ is fed into the underlying parser."""
+
+ if 'convertEntities' in kwargs:
+ warnings.warn(
+ "BS4 does not respect the convertEntities argument to the "
+ "BeautifulSoup constructor. Entities are always converted "
+ "to Unicode characters.")
+
+ if 'markupMassage' in kwargs:
+ del kwargs['markupMassage']
+ warnings.warn(
+ "BS4 does not respect the markupMassage argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for any necessary markup massage.")
+
+ if 'smartQuotesTo' in kwargs:
+ del kwargs['smartQuotesTo']
+ warnings.warn(
+ "BS4 does not respect the smartQuotesTo argument to the "
+ "BeautifulSoup constructor. Smart quotes are always converted "
+ "to Unicode characters.")
+
+ if 'selfClosingTags' in kwargs:
+ del kwargs['selfClosingTags']
+ warnings.warn(
+ "BS4 does not respect the selfClosingTags argument to the "
+ "BeautifulSoup constructor. The tree builder is responsible "
+ "for understanding self-closing tags.")
+
+ if 'isHTML' in kwargs:
+ del kwargs['isHTML']
+ warnings.warn(
+ "BS4 does not respect the isHTML argument to the "
+ "BeautifulSoup constructor. Suggest you use "
+ "features='lxml' for HTML and features='lxml-xml' for "
+ "XML.")
+
+ def deprecated_argument(old_name, new_name):
+ if old_name in kwargs:
+ warnings.warn(
+ 'The "%s" argument to the BeautifulSoup constructor '
+ 'has been renamed to "%s."' % (old_name, new_name))
+ value = kwargs[old_name]
+ del kwargs[old_name]
+ return value
+ return None
+
+ parse_only = parse_only or deprecated_argument(
+ "parseOnlyThese", "parse_only")
+
+ from_encoding = from_encoding or deprecated_argument(
+ "fromEncoding", "from_encoding")
+
+ if from_encoding and isinstance(markup, unicode):
+ warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+ from_encoding = None
+
+ if len(kwargs) > 0:
+ arg = kwargs.keys().pop()
+ raise TypeError(
+ "__init__() got an unexpected keyword argument '%s'" % arg)
+
+ if builder is None:
+ original_features = features
+ if isinstance(features, basestring):
+ features = [features]
+ if features is None or len(features) == 0:
+ features = self.DEFAULT_BUILDER_FEATURES
+ builder_class = builder_registry.lookup(*features)
+ if builder_class is None:
+ raise FeatureNotFound(
+ "Couldn't find a tree builder with the features you "
+ "requested: %s. Do you need to install a parser library?"
+ % ",".join(features))
+ builder = builder_class()
+ if not (original_features == builder.NAME or
+ original_features in builder.ALTERNATE_NAMES):
+ if builder.is_xml:
+ markup_type = "XML"
+ else:
+ markup_type = "HTML"
+
+ caller = traceback.extract_stack()[0]
+ filename = caller[0]
+ line_number = caller[1]
+ warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+ filename=filename,
+ line_number=line_number,
+ parser=builder.NAME,
+ markup_type=markup_type))
+
+ self.builder = builder
+ self.is_xml = builder.is_xml
+ self.known_xml = self.is_xml
+ self.builder.soup = self
+
+ self.parse_only = parse_only
+
+ if hasattr(markup, 'read'): # It's a file-type object.
+ markup = markup.read()
+ elif len(markup) <= 256 and (
+ (isinstance(markup, bytes) and not b'<' in markup)
+ or (isinstance(markup, unicode) and not u'<' in markup)
+ ):
+ # Print out warnings for a couple beginner problems
+ # involving passing non-markup to Beautiful Soup.
+ # Beautiful Soup will still parse the input as markup,
+ # just in case that's what the user really wants.
+ if (isinstance(markup, unicode)
+ and not os.path.supports_unicode_filenames):
+ possible_filename = markup.encode("utf8")
+ else:
+ possible_filename = markup
+ is_file = False
+ try:
+ is_file = os.path.exists(possible_filename)
+ except Exception, e:
+ # This is almost certainly a problem involving
+ # characters not valid in filenames on this
+ # system. Just let it go.
+ pass
+ if is_file:
+ if isinstance(markup, unicode):
+ markup = markup.encode("utf8")
+ warnings.warn(
+ '"%s" looks like a filename, not markup. You should'
+ 'probably open this file and pass the filehandle into'
+ 'Beautiful Soup.' % markup)
+ self._check_markup_is_url(markup)
+
+ for (self.markup, self.original_encoding, self.declared_html_encoding,
+ self.contains_replacement_characters) in (
+ self.builder.prepare_markup(
+ markup, from_encoding, exclude_encodings=exclude_encodings)):
+ self.reset()
+ try:
+ self._feed()
+ break
+ except ParserRejectedMarkup:
+ pass
+
+ # Clear out the markup and remove the builder's circular
+ # reference to this object.
+ self.markup = None
+ self.builder.soup = None
+
+ def __copy__(self):
+ copy = type(self)(
+ self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+ )
+
+ # Although we encoded the tree to UTF-8, that may not have
+ # been the encoding of the original markup. Set the copy's
+ # .original_encoding to reflect the original object's
+ # .original_encoding.
+ copy.original_encoding = self.original_encoding
+ return copy
+
+ def __getstate__(self):
+ # Frequently a tree builder can't be pickled.
+ d = dict(self.__dict__)
+ if 'builder' in d and not self.builder.picklable:
+ d['builder'] = None
+ return d
+
+ @staticmethod
+ def _check_markup_is_url(markup):
+ """
+ Check if markup looks like it's actually a url and raise a warning
+ if so. Markup can be unicode or str (py2) / bytes (py3).
+ """
+ if isinstance(markup, bytes):
+ space = b' '
+ cant_start_with = (b"http:", b"https:")
+ elif isinstance(markup, unicode):
+ space = u' '
+ cant_start_with = (u"http:", u"https:")
+ else:
+ return
+
+ if any(markup.startswith(prefix) for prefix in cant_start_with):
+ if not space in markup:
+ if isinstance(markup, bytes):
+ decoded_markup = markup.decode('utf-8', 'replace')
+ else:
+ decoded_markup = markup
+ warnings.warn(
+ '"%s" looks like a URL. Beautiful Soup is not an'
+ ' HTTP client. You should probably use an HTTP client like'
+ ' requests to get the document behind the URL, and feed'
+ ' that document to Beautiful Soup.' % decoded_markup
+ )
+
+ def _feed(self):
+ # Convert the document to Unicode.
+ self.builder.reset()
+
+ self.builder.feed(self.markup)
+ # Close out any unfinished strings and close all the open tags.
+ self.endData()
+ while self.currentTag.name != self.ROOT_TAG_NAME:
+ self.popTag()
+
+ def reset(self):
+ Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+ self.hidden = 1
+ self.builder.reset()
+ self.current_data = []
+ self.currentTag = None
+ self.tagStack = []
+ self.preserve_whitespace_tag_stack = []
+ self.pushTag(self)
+
+ def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+ """Create a new tag associated with this soup."""
+ return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+
+ def new_string(self, s, subclass=NavigableString):
+ """Create a new NavigableString associated with this soup."""
+ return subclass(s)
+
+ def insert_before(self, successor):
+ raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
+
+ def insert_after(self, successor):
+ raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
+
+ def popTag(self):
+ tag = self.tagStack.pop()
+ if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
+ self.preserve_whitespace_tag_stack.pop()
+ #print "Pop", tag.name
+ if self.tagStack:
+ self.currentTag = self.tagStack[-1]
+ return self.currentTag
+
+ def pushTag(self, tag):
+ #print "Push", tag.name
+ if self.currentTag:
+ self.currentTag.contents.append(tag)
+ self.tagStack.append(tag)
+ self.currentTag = self.tagStack[-1]
+ if tag.name in self.builder.preserve_whitespace_tags:
+ self.preserve_whitespace_tag_stack.append(tag)
+
+ def endData(self, containerClass=NavigableString):
+ if self.current_data:
+ current_data = u''.join(self.current_data)
+ # If whitespace is not preserved, and this string contains
+ # nothing but ASCII spaces, replace it with a single space
+ # or newline.
+ if not self.preserve_whitespace_tag_stack:
+ strippable = True
+ for i in current_data:
+ if i not in self.ASCII_SPACES:
+ strippable = False
+ break
+ if strippable:
+ if '\n' in current_data:
+ current_data = '\n'
+ else:
+ current_data = ' '
+
+ # Reset the data collector.
+ self.current_data = []
+
+ # Should we add this string to the tree at all?
+ if self.parse_only and len(self.tagStack) <= 1 and \
+ (not self.parse_only.text or \
+ not self.parse_only.search(current_data)):
+ return
+
+ o = containerClass(current_data)
+ self.object_was_parsed(o)
+
+ def object_was_parsed(self, o, parent=None, most_recent_element=None):
+ """Add an object to the parse tree."""
+ parent = parent or self.currentTag
+ previous_element = most_recent_element or self._most_recent_element
+
+ next_element = previous_sibling = next_sibling = None
+ if isinstance(o, Tag):
+ next_element = o.next_element
+ next_sibling = o.next_sibling
+ previous_sibling = o.previous_sibling
+ if not previous_element:
+ previous_element = o.previous_element
+
+ o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
+
+ self._most_recent_element = o
+ parent.contents.append(o)
+
+ if parent.next_sibling:
+ # This node is being inserted into an element that has
+ # already been parsed. Deal with any dangling references.
+ index = len(parent.contents)-1
+ while index >= 0:
+ if parent.contents[index] is o:
+ break
+ index -= 1
+ else:
+ raise ValueError(
+ "Error building tree: supposedly %r was inserted "
+ "into %r after the fact, but I don't see it!" % (
+ o, parent
+ )
+ )
+ if index == 0:
+ previous_element = parent
+ previous_sibling = None
+ else:
+ previous_element = previous_sibling = parent.contents[index-1]
+ if index == len(parent.contents)-1:
+ next_element = parent.next_sibling
+ next_sibling = None
+ else:
+ next_element = next_sibling = parent.contents[index+1]
+
+ o.previous_element = previous_element
+ if previous_element:
+ previous_element.next_element = o
+ o.next_element = next_element
+ if next_element:
+ next_element.previous_element = o
+ o.next_sibling = next_sibling
+ if next_sibling:
+ next_sibling.previous_sibling = o
+ o.previous_sibling = previous_sibling
+ if previous_sibling:
+ previous_sibling.next_sibling = o
+
+ def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+ """Pops the tag stack up to and including the most recent
+ instance of the given tag. If inclusivePop is false, pops the tag
+ stack up to but *not* including the most recent instance of
+ the given tag."""
+ #print "Popping to %s" % name
+ if name == self.ROOT_TAG_NAME:
+ # The BeautifulSoup object itself can never be popped.
+ return
+
+ most_recently_popped = None
+
+ stack_size = len(self.tagStack)
+ for i in range(stack_size - 1, 0, -1):
+ t = self.tagStack[i]
+ if (name == t.name and nsprefix == t.prefix):
+ if inclusivePop:
+ most_recently_popped = self.popTag()
+ break
+ most_recently_popped = self.popTag()
+
+ return most_recently_popped
+
+ def handle_starttag(self, name, namespace, nsprefix, attrs):
+ """Push a start tag on to the stack.
+
+ If this method returns None, the tag was rejected by the
+ SoupStrainer. You should proceed as if the tag had not occurred
+ in the document. For instance, if this was a self-closing tag,
+ don't call handle_endtag.
+ """
+
+ # print "Start tag %s: %s" % (name, attrs)
+ self.endData()
+
+ if (self.parse_only and len(self.tagStack) <= 1
+ and (self.parse_only.text
+ or not self.parse_only.search_tag(name, attrs))):
+ return None
+
+ tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+ self.currentTag, self._most_recent_element)
+ if tag is None:
+ return tag
+ if self._most_recent_element:
+ self._most_recent_element.next_element = tag
+ self._most_recent_element = tag
+ self.pushTag(tag)
+ return tag
+
+ def handle_endtag(self, name, nsprefix=None):
+ #print "End tag: " + name
+ self.endData()
+ self._popToTag(name, nsprefix)
+
+ def handle_data(self, data):
+ self.current_data.append(data)
+
+ def decode(self, pretty_print=False,
+ eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+ formatter="minimal"):
+ """Returns a string or Unicode representation of this document.
+ To get Unicode, pass None for encoding."""
+
+ if self.is_xml:
+ # Print the XML declaration
+ encoding_part = ''
+ if eventual_encoding != None:
+ encoding_part = ' encoding="%s"' % eventual_encoding
+ prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+ else:
+ prefix = u''
+ if not pretty_print:
+ indent_level = None
+ else:
+ indent_level = 0
+ return prefix + super(BeautifulSoup, self).decode(
+ indent_level, eventual_encoding, formatter)
+
+# Alias to make it easier to type import: 'from bs4 import _soup'
+_s = BeautifulSoup
+_soup = BeautifulSoup
+
+class BeautifulStoneSoup(BeautifulSoup):
+ """Deprecated interface to an XML parser."""
+
+ def __init__(self, *args, **kwargs):
+ kwargs['features'] = 'xml'
+ warnings.warn(
+ 'The BeautifulStoneSoup class is deprecated. Instead of using '
+ 'it, pass features="xml" into the BeautifulSoup constructor.')
+ super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
+
+
+class StopParsing(Exception):
+ pass
+
+class FeatureNotFound(ValueError):
+ pass
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+ import sys
+ soup = BeautifulSoup(sys.stdin)
+ print soup.prettify()
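A behavioral note on the new `NO_PARSER_SPECIFIED_WARNING`: any call that leaves `features` at the default `['html', 'fast']` lookup now warns, because the builder chosen can differ between systems. A hedged sketch of triggering and then silencing it by naming the stock parser:

```python
import warnings
from bs4 import BeautifulSoup

markup = "<p>Hello</p>"

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    BeautifulSoup(markup)        # no parser named: emits the new warning
    print(len(caught) >= 1)      # True

soup = BeautifulSoup(markup, "html.parser")  # explicit parser: no warning
print(soup.p.string)                         # Hello
```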
diff --git a/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
similarity index 92%
rename from bs4/builder/__init__.py
rename to lib/bs4/builder/__init__.py
index dc7deb93..601979bf 100644
--- a/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
+ HTMLAwareEntitySubstitution,
whitespace_re
)
@@ -80,9 +84,12 @@ builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
+ NAME = "[Unknown tree builder]"
+ ALTERNATE_NAMES = []
features = []
is_xml = False
+ picklable = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
@@ -147,16 +154,18 @@ class TreeBuilder(object):
Modifies its input in place.
"""
+ if not attrs:
+ return attrs
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
- tag_name.lower(), [])
- for cdata_list_attr in itertools.chain(universal, tag_specific):
- if cdata_list_attr in dict(attrs):
- # Basically, we have a "class" attribute whose
- # value is a whitespace-separated list of CSS
- # classes. Split it into a list.
- value = attrs[cdata_list_attr]
+ tag_name.lower(), None)
+ for attr in attrs.keys():
+ if attr in universal or (tag_specific and attr in tag_specific):
+ # We have a "class"-type attribute whose string
+ # value is a whitespace-separated list of
+ # values. Split it into a list.
+ value = attrs[attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
@@ -167,7 +176,7 @@ class TreeBuilder(object):
# leave the value alone rather than trying to
# split it again.
values = value
- attrs[cdata_list_attr] = values
+ attrs[attr] = values
return attrs
class SAXTreeBuilder(TreeBuilder):
@@ -222,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
Such as which tags are empty-element tags.
"""
- preserve_whitespace_tags = set(['pre', 'textarea'])
+ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
@@ -296,6 +305,9 @@ def register_treebuilders_from(module):
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
+class ParserRejectedMarkup(Exception):
+ pass
+
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
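The rewritten `_replace_cdata_list_attribute_values` splits only attributes registered as whitespace-separated lists, and now short-circuits on empty `attrs`. An illustrative check, assuming the stock `html.parser` builder:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup('<p class="menu main" id="nav one">x</p>', 'html.parser')
print(soup.p['class'])  # ['menu', 'main']  'class' is a cdata-list attribute
print(soup.p['id'])     # nav one           'id' is not, so it stays a string
```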
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
new file mode 100644
index 00000000..c46f8823
--- /dev/null
+++ b/lib/bs4/builder/_html5lib.py
@@ -0,0 +1,356 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+__all__ = [
+ 'HTML5TreeBuilder',
+ ]
+
+import warnings
+from bs4.builder import (
+ PERMISSIVE,
+ HTML,
+ HTML_5,
+ HTMLTreeBuilder,
+ )
+from bs4.element import (
+ NamespacedAttribute,
+ whitespace_re,
+)
+import html5lib
+from html5lib.constants import namespaces
+from bs4.element import (
+ Comment,
+ Doctype,
+ NavigableString,
+ Tag,
+ )
+
+try:
+ # Pre-0.99999999
+ from html5lib.treebuilders import _base as treebuilder_base
+ new_html5lib = False
+except ImportError, e:
+ # 0.99999999 and up
+ from html5lib.treebuilders import base as treebuilder_base
+ new_html5lib = True
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+ """Use html5lib to build a tree."""
+
+ NAME = "html5lib"
+
+ features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+ def prepare_markup(self, markup, user_specified_encoding,
+ document_declared_encoding=None, exclude_encodings=None):
+ # Store the user-specified encoding for use later on.
+ self.user_specified_encoding = user_specified_encoding
+
+ # document_declared_encoding and exclude_encodings aren't used
+ # ATM because the html5lib TreeBuilder doesn't use
+ # UnicodeDammit.
+ if exclude_encodings:
+ warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+ yield (markup, None, None, False)
+
+ # These methods are defined by Beautiful Soup.
+ def feed(self, markup):
+ if self.soup.parse_only is not None:
+ warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+ parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+
+ extra_kwargs = dict()
+ if not isinstance(markup, unicode):
+ if new_html5lib:
+ extra_kwargs['override_encoding'] = self.user_specified_encoding
+ else:
+ extra_kwargs['encoding'] = self.user_specified_encoding
+ doc = parser.parse(markup, **extra_kwargs)
+
+ # Set the character encoding detected by the tokenizer.
+ if isinstance(markup, unicode):
+ # We need to special-case this because html5lib sets
+ # charEncoding to UTF-8 if it gets Unicode input.
+ doc.original_encoding = None
+ else:
+ original_encoding = parser.tokenizer.stream.charEncoding[0]
+ if not isinstance(original_encoding, basestring):
+ # In 0.99999999 and up, the encoding is an html5lib
+ # Encoding object. We want to use a string for compatibility
+ # with other tree builders.
+ original_encoding = original_encoding.name
+ doc.original_encoding = original_encoding
+
+ def create_treebuilder(self, namespaceHTMLElements):
+ self.underlying_builder = TreeBuilderForHtml5lib(
+ self.soup, namespaceHTMLElements)
+ return self.underlying_builder
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><head></head><body>%s</body></html>' % fragment
+
+
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+
+ def __init__(self, soup, namespaceHTMLElements):
+ self.soup = soup
+ super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+ def documentClass(self):
+ self.soup.reset()
+ return Element(self.soup, self.soup, None)
+
+ def insertDoctype(self, token):
+ name = token["name"]
+ publicId = token["publicId"]
+ systemId = token["systemId"]
+
+ doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+ self.soup.object_was_parsed(doctype)
+
+ def elementClass(self, name, namespace):
+ tag = self.soup.new_tag(name, namespace)
+ return Element(tag, self.soup, namespace)
+
+ def commentClass(self, data):
+ return TextNode(Comment(data), self.soup)
+
+ def fragmentClass(self):
+ self.soup = BeautifulSoup("")
+ self.soup.name = "[document_fragment]"
+ return Element(self.soup, self.soup, None)
+
+ def appendChild(self, node):
+ # XXX This code is not covered by the BS4 tests.
+ self.soup.append(node.element)
+
+ def getDocument(self):
+ return self.soup
+
+ def getFragment(self):
+ return treebuilder_base.TreeBuilder.getFragment(self).element
+
+class AttrList(object):
+ def __init__(self, element):
+ self.element = element
+ self.attrs = dict(self.element.attrs)
+ def __iter__(self):
+ return list(self.attrs.items()).__iter__()
+ def __setitem__(self, name, value):
+ # If this attribute is a multi-valued attribute for this element,
+ # turn its value into a list.
+ list_attr = HTML5TreeBuilder.cdata_list_attributes
+ if (name in list_attr['*']
+ or (self.element.name in list_attr
+ and name in list_attr[self.element.name])):
+ # A node that is being cloned may have already undergone
+ # this procedure.
+ if not isinstance(value, list):
+ value = whitespace_re.split(value)
+ self.element[name] = value
+ def items(self):
+ return list(self.attrs.items())
+ def keys(self):
+ return list(self.attrs.keys())
+ def __len__(self):
+ return len(self.attrs)
+ def __getitem__(self, name):
+ return self.attrs[name]
+ def __contains__(self, name):
+ return name in list(self.attrs.keys())
+
+
+class Element(treebuilder_base.Node):
+ def __init__(self, element, soup, namespace):
+ treebuilder_base.Node.__init__(self, element.name)
+ self.element = element
+ self.soup = soup
+ self.namespace = namespace
+
+ def appendChild(self, node):
+ string_child = child = None
+ if isinstance(node, basestring):
+ # Some other piece of code decided to pass in a string
+ # instead of creating a TextElement object to contain the
+ # string.
+ string_child = child = node
+ elif isinstance(node, Tag):
+ # Some other piece of code decided to pass in a Tag
+ # instead of creating an Element object to contain the
+ # Tag.
+ child = node
+ elif node.element.__class__ == NavigableString:
+ string_child = child = node.element
+ else:
+ child = node.element
+
+ if not isinstance(child, basestring) and child.parent is not None:
+ node.element.extract()
+
+ if (string_child and self.element.contents
+ and self.element.contents[-1].__class__ == NavigableString):
+ # We are appending a string onto another string.
+ # TODO This has O(n^2) performance, for input like
+ # "aaa..."
+ old_element = self.element.contents[-1]
+ new_element = self.soup.new_string(old_element + string_child)
+ old_element.replace_with(new_element)
+ self.soup._most_recent_element = new_element
+ else:
+ if isinstance(node, basestring):
+ # Create a brand new NavigableString from this string.
+ child = self.soup.new_string(node)
+
+ # Tell Beautiful Soup to act as if it parsed this element
+ # immediately after the parent's last descendant. (Or
+ # immediately after the parent, if it has no children.)
+ if self.element.contents:
+ most_recent_element = self.element._last_descendant(False)
+ elif self.element.next_element is not None:
+ # Something from further ahead in the parse tree is
+ # being inserted into this earlier element. This is
+ # very annoying because it means an expensive search
+ # for the last element in the tree.
+ most_recent_element = self.soup._last_descendant()
+ else:
+ most_recent_element = self.element
+
+ self.soup.object_was_parsed(
+ child, parent=self.element,
+ most_recent_element=most_recent_element)
+
+ def getAttributes(self):
+ return AttrList(self.element)
+
+ def setAttributes(self, attributes):
+ if attributes is not None and len(attributes) > 0:
+ for name, value in list(attributes.items()):
+ if isinstance(name, tuple):
+ new_name = NamespacedAttribute(*name)
+ del attributes[name]
+ attributes[new_name] = value
+
+ self.soup.builder._replace_cdata_list_attribute_values(
+ self.name, attributes)
+ for name, value in attributes.items():
+ self.element[name] = value
+
+ # The attributes may contain variables that need substitution.
+ # Call set_up_substitutions manually.
+ #
+ # The Tag constructor called this method when the Tag was created,
+ # but we just set/changed the attributes, so call it again.
+ self.soup.builder.set_up_substitutions(self.element)
+ attributes = property(getAttributes, setAttributes)
+
+ def insertText(self, data, insertBefore=None):
+ text = TextNode(self.soup.new_string(data), self.soup)
+ if insertBefore:
+ # Pass the wrapped TextNode, not the raw string, so that
+ # insertBefore() can reach node.element.
+ self.insertBefore(text, insertBefore)
+ else:
+ self.appendChild(text)
+
+ def insertBefore(self, node, refNode):
+ index = self.element.index(refNode.element)
+ if (node.element.__class__ == NavigableString and self.element.contents
+ and self.element.contents[index-1].__class__ == NavigableString):
+ # (See comments in appendChild)
+ old_node = self.element.contents[index-1]
+ new_str = self.soup.new_string(old_node + node.element)
+ old_node.replace_with(new_str)
+ else:
+ self.element.insert(index, node.element)
+ node.parent = self
+
+ def removeChild(self, node):
+ node.element.extract()
+
+ def reparentChildren(self, new_parent):
+ """Move all of this tag's children into another tag."""
+ # print "MOVE", self.element.contents
+ # print "FROM", self.element
+ # print "TO", new_parent.element
+ element = self.element
+ new_parent_element = new_parent.element
+ # Determine what this tag's next_element will be once all the children
+ # are removed.
+ final_next_element = element.next_sibling
+
+ new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+ if len(new_parent_element.contents) > 0:
+ # The new parent already contains children. We will be
+ # appending this tag's children to the end.
+ new_parents_last_child = new_parent_element.contents[-1]
+ new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
+ else:
+ # The new parent contains no children.
+ new_parents_last_child = None
+ new_parents_last_descendant_next_element = new_parent_element.next_element
+
+ to_append = element.contents
+ append_after = new_parent_element.contents
+ if len(to_append) > 0:
+ # Set the first child's previous_element and previous_sibling
+ # to elements within the new parent
+ first_child = to_append[0]
+ if new_parents_last_descendant:
+ first_child.previous_element = new_parents_last_descendant
+ else:
+ first_child.previous_element = new_parent_element
+ first_child.previous_sibling = new_parents_last_child
+ if new_parents_last_descendant:
+ new_parents_last_descendant.next_element = first_child
+ else:
+ new_parent_element.next_element = first_child
+ if new_parents_last_child:
+ new_parents_last_child.next_sibling = first_child
+
+ # Fix the last child's next_element and next_sibling
+ last_child = to_append[-1]
+ last_child.next_element = new_parents_last_descendant_next_element
+ if new_parents_last_descendant_next_element:
+ new_parents_last_descendant_next_element.previous_element = last_child
+ last_child.next_sibling = None
+
+ for child in to_append:
+ child.parent = new_parent_element
+ new_parent_element.contents.append(child)
+
+ # Now that this element has no children, change its .next_element.
+ element.contents = []
+ element.next_element = final_next_element
+
+ # print "DONE WITH MOVE"
+ # print "FROM", self.element
+ # print "TO", new_parent_element
+
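+ # In pointer terms, reparentChildren() splices this element's child list
+ # onto the end of new_parent_element, rewiring previous/next element and
+ # sibling links at both seams. E.g. (a hypothetical case) moving the
+ # children of <p><b></b><i></i></p> into <div>foo</div> leaves <p></p>
+ # and <div>foo<b></b><i></i></div>.
+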
+ def cloneNode(self):
+ tag = self.soup.new_tag(self.element.name, self.namespace)
+ node = Element(tag, self.soup, self.namespace)
+ for key, value in self.attributes:
+ node.attributes[key] = value
+ return node
+
+ def hasContent(self):
+ return self.element.contents
+
+ def getNameTuple(self):
+ if self.namespace is None:
+ return namespaces["html"], self.name
+ else:
+ return self.namespace, self.name
+
+ nameTuple = property(getNameTuple)
+
+class TextNode(Element):
+ def __init__(self, element, soup):
+ treebuilder_base.Node.__init__(self, None)
+ self.element = element
+ self.soup = soup
+
+ def cloneNode(self):
+ raise NotImplementedError
diff --git a/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
similarity index 79%
rename from bs4/builder/_htmlparser.py
rename to lib/bs4/builder/_htmlparser.py
index ede5cecb..823ca15a 100644
--- a/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -1,13 +1,22 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import (
- HTMLParser,
- HTMLParseError,
- )
+from HTMLParser import HTMLParser
+
+try:
+ from HTMLParser import HTMLParseError
+except ImportError, e:
+ # HTMLParseError is removed in Python 3.5. Since it can never be
+ # thrown in 3.5, we can just define our own class as a placeholder.
+ class HTMLParseError(Exception):
+ pass
+
import sys
import warnings
@@ -19,10 +28,10 @@ import warnings
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
- major > 3
- or (major == 3 and minor > 2)
- or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+
from bs4.element import (
CData,
@@ -45,7 +54,15 @@ HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
- self.soup.handle_starttag(name, None, None, dict(attrs))
+ attr_dict = {}
+ for key, value in attrs:
+ # Change None attribute values to the empty string
+ # for consistency with the other tree builders.
+ if value is None:
+ value = ''
+ attr_dict[key] = value
+ attrvalue = '""'
+ self.soup.handle_starttag(name, None, None, attr_dict)
def handle_endtag(self, name):
self.soup.handle_endtag(name)
@@ -55,9 +72,12 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
- # it's fixed.
+ # it's fixed in all supported versions.
+ # http://bugs.python.org/issue13633
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
+ elif name.startswith('X'):
+ real_name = int(name.lstrip('X'), 16)
else:
real_name = int(name)
@@ -85,6 +105,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
+ elif data == 'DOCTYPE':
+ # i.e. ""
+ data = ''
self.soup.handle_data(data)
self.soup.endData(Doctype)
@@ -100,14 +123,6 @@ class BeautifulSoupHTMLParser(HTMLParser):
def handle_pi(self, data):
self.soup.endData()
- if data.endswith("?") and data.lower().startswith("xml"):
- # "An XHTML processing instruction using the trailing '?'
- # will cause the '?' to be included in data." - HTMLParser
- # docs.
- #
- # Strip the question mark so we don't end up with two
- # question marks.
- data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
@@ -115,28 +130,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
- features = [HTML, STRICT, HTMLPARSER]
+ picklable = True
+ NAME = HTMLPARSER
+ features = [NAME, HTML, STRICT]
def __init__(self, *args, **kwargs):
- if CONSTRUCTOR_TAKES_STRICT:
+ if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
kwargs['strict'] = False
+ if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+ kwargs['convert_charrefs'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
- document_declared_encoding=None):
+ document_declared_encoding=None, exclude_encodings=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
- return markup, None, None, False
+ yield (markup, None, None, False)
+ return
try_encodings = [user_specified_encoding, document_declared_encoding]
- dammit = UnicodeDammit(markup, try_encodings, is_html=True)
- return (dammit.markup, dammit.original_encoding,
- dammit.declared_html_encoding,
- dammit.contains_replacement_characters)
+ dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+ exclude_encodings=exclude_encodings)
+ yield (dammit.markup, dammit.original_encoding,
+ dammit.declared_html_encoding,
+ dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py
new file mode 100644
index 00000000..d2ca2872
--- /dev/null
+++ b/lib/bs4/builder/_lxml.py
@@ -0,0 +1,258 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__all__ = [
+ 'LXMLTreeBuilderForXML',
+ 'LXMLTreeBuilder',
+ ]
+
+from io import BytesIO
+from StringIO import StringIO
+import collections
+from lxml import etree
+from bs4.element import (
+ Comment,
+ Doctype,
+ NamespacedAttribute,
+ ProcessingInstruction,
+ XMLProcessingInstruction,
+)
+from bs4.builder import (
+ FAST,
+ HTML,
+ HTMLTreeBuilder,
+ PERMISSIVE,
+ ParserRejectedMarkup,
+ TreeBuilder,
+ XML)
+from bs4.dammit import EncodingDetector
+
+LXML = 'lxml'
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+ DEFAULT_PARSER_CLASS = etree.XMLParser
+
+ is_xml = True
+ processing_instruction_class = XMLProcessingInstruction
+
+ NAME = "lxml-xml"
+ ALTERNATE_NAMES = ["xml"]
+
+ # Well, it's permissive by XML parser standards.
+ features = [NAME, LXML, XML, FAST, PERMISSIVE]
+
+ CHUNK_SIZE = 512
+
+ # This namespace mapping is specified in the XML Namespace
+ # standard.
+ DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+
+ def default_parser(self, encoding):
+ # This can either return a parser object or a class, which
+ # will be instantiated with default arguments.
+ if self._default_parser is not None:
+ return self._default_parser
+ return etree.XMLParser(
+ target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+ def parser_for(self, encoding):
+ # Use the default parser.
+ parser = self.default_parser(encoding)
+
+ if isinstance(parser, collections.Callable):
+ # Instantiate the parser with default arguments
+ parser = parser(target=self, strip_cdata=False, encoding=encoding)
+ return parser
+
+ def __init__(self, parser=None, empty_element_tags=None):
+ # TODO: Issue a warning if parser is present but not a
+ # callable, since that means there's no way to create new
+ # parsers for different encodings.
+ self._default_parser = parser
+ if empty_element_tags is not None:
+ self.empty_element_tags = set(empty_element_tags)
+ self.soup = None
+ self.nsmaps = [self.DEFAULT_NSMAPS]
+
+ def _getNsTag(self, tag):
+ # Split the namespace URL out of a fully-qualified lxml tag
+ # name. Copied from lxml's src/lxml/sax.py.
+ if tag[0] == '{':
+ return tuple(tag[1:].split('}', 1))
+ else:
+ return (None, tag)
+
+ def prepare_markup(self, markup, user_specified_encoding=None,
+ exclude_encodings=None,
+ document_declared_encoding=None):
+ """
+ :yield: A series of 4-tuples.
+ (markup, encoding, declared encoding,
+ has undergone character replacement)
+
+ Each 4-tuple represents a strategy for parsing the document.
+ """
+ # Instead of using UnicodeDammit to convert the bytestring to
+ # Unicode using different encodings, use EncodingDetector to
+ # iterate over the encodings, and tell lxml to try to parse
+ # the document as each one in turn.
+ is_html = not self.is_xml
+ if is_html:
+ self.processing_instruction_class = ProcessingInstruction
+ else:
+ self.processing_instruction_class = XMLProcessingInstruction
+
+ if isinstance(markup, unicode):
+ # We were given Unicode. Maybe lxml can parse Unicode on
+ # this system?
+ yield markup, None, document_declared_encoding, False
+
+ if isinstance(markup, unicode):
+ # No, apparently not. Convert the Unicode to UTF-8 and
+ # tell lxml to parse it as UTF-8.
+ yield (markup.encode("utf8"), "utf8",
+ document_declared_encoding, False)
+
+ try_encodings = [user_specified_encoding, document_declared_encoding]
+ detector = EncodingDetector(
+ markup, try_encodings, is_html, exclude_encodings)
+ for encoding in detector.encodings:
+ yield (detector.markup, encoding, document_declared_encoding, False)
+
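+ # Rough sketch of how this generator is consumed (the real caller is
+ # BeautifulSoup.__init__, not shown in this hunk): each yielded 4-tuple
+ # is one parsing strategy, tried in order until one succeeds, e.g.
+ #
+ # for (markup, encoding, declared, replaced) in builder.prepare_markup(data):
+ # try:
+ # builder.feed(markup)
+ # break
+ # except ParserRejectedMarkup:
+ # pass
+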
+ def feed(self, markup):
+ if isinstance(markup, bytes):
+ markup = BytesIO(markup)
+ elif isinstance(markup, unicode):
+ markup = StringIO(markup)
+
+ # Call feed() at least once, even if the markup is empty,
+ # or the parser won't be initialized.
+ data = markup.read(self.CHUNK_SIZE)
+ try:
+ self.parser = self.parser_for(self.soup.original_encoding)
+ self.parser.feed(data)
+ while len(data) != 0:
+ # Now call feed() on the rest of the data, chunk by chunk.
+ data = markup.read(self.CHUNK_SIZE)
+ if len(data) != 0:
+ self.parser.feed(data)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ raise ParserRejectedMarkup(str(e))
+
+ def close(self):
+ self.nsmaps = [self.DEFAULT_NSMAPS]
+
+ def start(self, name, attrs, nsmap={}):
+ # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
+ attrs = dict(attrs)
+ nsprefix = None
+ # Invert each namespace map as it comes in.
+ if len(nsmap) == 0 and len(self.nsmaps) > 1:
+ # There are no new namespaces for this tag, but
+ # non-default namespaces are in play, so we need a
+ # separate tag stack to know when they end.
+ self.nsmaps.append(None)
+ elif len(nsmap) > 0:
+ # A new namespace mapping has come into play.
+ inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+ self.nsmaps.append(inverted_nsmap)
+ # Also treat the namespace mapping as a set of attributes on the
+ # tag, so we can recreate it later.
+ attrs = attrs.copy()
+ for prefix, namespace in nsmap.items():
+ attribute = NamespacedAttribute(
+ "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+ attrs[attribute] = namespace
+
+ # Namespaces are in play. Find any attributes that came in
+ # from lxml with namespaces attached to their names, and
+ # turn them into NamespacedAttribute objects.
+ new_attrs = {}
+ for attr, value in attrs.items():
+ namespace, attr = self._getNsTag(attr)
+ if namespace is None:
+ new_attrs[attr] = value
+ else:
+ nsprefix = self._prefix_for_namespace(namespace)
+ attr = NamespacedAttribute(nsprefix, attr, namespace)
+ new_attrs[attr] = value
+ attrs = new_attrs
+
+ namespace, name = self._getNsTag(name)
+ nsprefix = self._prefix_for_namespace(namespace)
+ self.soup.handle_starttag(name, namespace, nsprefix, attrs)
+
+ def _prefix_for_namespace(self, namespace):
+ """Find the currently active prefix for the given namespace."""
+ if namespace is None:
+ return None
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ return inverted_nsmap[namespace]
+ return None
+
+ def end(self, name):
+ self.soup.endData()
+ completed_tag = self.soup.tagStack[-1]
+ namespace, name = self._getNsTag(name)
+ nsprefix = None
+ if namespace is not None:
+ for inverted_nsmap in reversed(self.nsmaps):
+ if inverted_nsmap is not None and namespace in inverted_nsmap:
+ nsprefix = inverted_nsmap[namespace]
+ break
+ self.soup.handle_endtag(name, nsprefix)
+ if len(self.nsmaps) > 1:
+ # This tag, or one of its parents, introduced a namespace
+ # mapping, so pop it off the stack.
+ self.nsmaps.pop()
+
+ def pi(self, target, data):
+ self.soup.endData()
+ self.soup.handle_data(target + ' ' + data)
+ self.soup.endData(self.processing_instruction_class)
+
+ def data(self, content):
+ self.soup.handle_data(content)
+
+ def doctype(self, name, pubid, system):
+ self.soup.endData()
+ doctype = Doctype.for_name_and_ids(name, pubid, system)
+ self.soup.object_was_parsed(doctype)
+
+ def comment(self, content):
+ "Handle comments as Comment objects."
+ self.soup.endData()
+ self.soup.handle_data(content)
+ self.soup.endData(Comment)
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+
+ NAME = LXML
+ ALTERNATE_NAMES = ["lxml-html"]
+
+ features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
+ is_xml = False
+ processing_instruction_class = ProcessingInstruction
+
+ def default_parser(self, encoding):
+ return etree.HTMLParser
+
+ def feed(self, markup):
+ encoding = self.soup.original_encoding
+ try:
+ self.parser = self.parser_for(encoding)
+ self.parser.feed(markup)
+ self.parser.close()
+ except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+ raise ParserRejectedMarkup(str(e))
+
+
+ def test_fragment_to_document(self, fragment):
+ """See `TreeBuilder`."""
+ return u'<html><body>%s</body></html>' % fragment
diff --git a/bs4/dammit.py b/lib/bs4/dammit.py
similarity index 72%
rename from bs4/dammit.py
rename to lib/bs4/dammit.py
index 58cad9ba..2bf67f7f 100644
--- a/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@@ -1,27 +1,43 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
-This class forces XML data into a standard format (usually to UTF-8 or
-Unicode). It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's the tree builder's job.
+This library converts a bytestream to Unicode through any means
+necessary. It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
+XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"
import codecs
from htmlentitydefs import codepoint2name
import re
-import warnings
+import logging
+import string
-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-# or 'apt-get install python-chardet'
-# or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
try:
- import chardet
- #import chardet.constants
- #chardet.constants._debug = 1
+ # First try the fast C implementation.
+ # PyPI package: cchardet
+ import cchardet
+ def chardet_dammit(s):
+ return cchardet.detect(s)['encoding']
except ImportError:
- chardet = None
+ try:
+ # Fall back to the pure Python implementation
+ # Debian package: python-chardet
+ # PyPI package: chardet
+ import chardet
+ def chardet_dammit(s):
+ return chardet.detect(s)['encoding']
+ #import chardet.constants
+ #chardet.constants._debug = 1
+ except ImportError:
+ # No chardet available.
+ def chardet_dammit(s):
+ return None
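+
+# Whichever branch was taken above, chardet_dammit() has a single contract:
+# given a bytestring, return a best-guess encoding name such as 'utf-8',
+# or None when no detection library is installed.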
# Available from http://cjkpython.i18n.org/.
try:
@@ -69,6 +85,8 @@ class EntitySubstitution(object):
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
+ AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@@ -122,6 +140,28 @@ class EntitySubstitution(object):
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
+ :param value: A string to be substituted. The less-than sign
+ will become &lt;, the greater-than sign will become &gt;,
+ and any ampersands will become &amp;. If you want ampersands
+ that appear to be part of an entity definition to be left
+ alone, use substitute_xml_containing_entities() instead.
+
+ :param make_quoted_attribute: If True, then the string will be
+ quoted, as befits an attribute value.
+ """
+ # Escape angle brackets and ampersands.
+ value = cls.AMPERSAND_OR_BRACKET.sub(
+ cls._substitute_xml_entity, value)
+
+ if make_quoted_attribute:
+ value = cls.quoted_attribute_value(value)
+ return value
+
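+ # For example (illustrative, not from the upstream test suite):
+ #
+ # EntitySubstitution.substitute_xml(u'a < b & c') == u'a &lt; b &amp; c'
+ # EntitySubstitution.substitute_xml(u'a < b', True) == u'"a &lt; b"'
+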
+ @classmethod
+ def substitute_xml_containing_entities(
+ cls, value, make_quoted_attribute=False):
+ """Substitute XML entities for special XML characters.
+
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
@@ -155,6 +195,133 @@ class EntitySubstitution(object):
cls._substitute_html_entity, s)
+class EncodingDetector:
+ """Suggests a number of possible encodings for a bytestring.
+
+ Order of precedence:
+
+ 1. Encodings you specifically tell EncodingDetector to try first
+ (the override_encodings argument to the constructor).
+
+ 2. An encoding declared within the bytestring itself, either in an
+ XML declaration (if the bytestring is to be interpreted as an XML
+ document), or in a <meta> tag (if the bytestring is to be
+ interpreted as an HTML document.)
+
+ 3. An encoding detected through textual analysis by chardet,
+ cchardet, or a similar external library.
+
+ 4. UTF-8.
+
+ 5. Windows-1252.
+ """
+ def __init__(self, markup, override_encodings=None, is_html=False,
+ exclude_encodings=None):
+ self.override_encodings = override_encodings or []
+ exclude_encodings = exclude_encodings or []
+ self.exclude_encodings = set([x.lower() for x in exclude_encodings])
+ self.chardet_encoding = None
+ self.is_html = is_html
+ self.declared_encoding = None
+
+ # First order of business: strip a byte-order mark.
+ self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+ def _usable(self, encoding, tried):
+ if encoding is not None:
+ encoding = encoding.lower()
+ if encoding in self.exclude_encodings:
+ return False
+ if encoding not in tried:
+ tried.add(encoding)
+ return True
+ return False
+
+ @property
+ def encodings(self):
+ """Yield a number of encodings that might work for this markup."""
+ tried = set()
+ for e in self.override_encodings:
+ if self._usable(e, tried):
+ yield e
+
+ # Did the document originally start with a byte-order mark
+ # that indicated its encoding?
+ if self._usable(self.sniffed_encoding, tried):
+ yield self.sniffed_encoding
+
+ # Look within the document for an XML or HTML encoding
+ # declaration.
+ if self.declared_encoding is None:
+ self.declared_encoding = self.find_declared_encoding(
+ self.markup, self.is_html)
+ if self._usable(self.declared_encoding, tried):
+ yield self.declared_encoding
+
+ # Use third-party character set detection to guess at the
+ # encoding.
+ if self.chardet_encoding is None:
+ self.chardet_encoding = chardet_dammit(self.markup)
+ if self._usable(self.chardet_encoding, tried):
+ yield self.chardet_encoding
+
+ # As a last-ditch effort, try utf-8 and windows-1252.
+ for e in ('utf-8', 'windows-1252'):
+ if self._usable(e, tried):
+ yield e
+
+ @classmethod
+ def strip_byte_order_mark(cls, data):
+ """If a byte-order mark is present, strip it and return the encoding it implies."""
+ encoding = None
+ if isinstance(data, unicode):
+ # Unicode data cannot have a byte-order mark.
+ return data, encoding
+ if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+ and (data[2:4] != b'\x00\x00'):
+ encoding = 'utf-16be'
+ data = data[2:]
+ elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+ and (data[2:4] != b'\x00\x00'):
+ encoding = 'utf-16le'
+ data = data[2:]
+ elif data[:3] == b'\xef\xbb\xbf':
+ encoding = 'utf-8'
+ data = data[3:]
+ elif data[:4] == b'\x00\x00\xfe\xff':
+ encoding = 'utf-32be'
+ data = data[4:]
+ elif data[:4] == b'\xff\xfe\x00\x00':
+ encoding = 'utf-32le'
+ data = data[4:]
+ return data, encoding
+
+ @classmethod
+ def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+ """Given a document, tries to find its declared encoding.
+
+ An XML encoding is declared at the beginning of the document.
+
+ An HTML encoding is declared in a <meta> tag, hopefully near the
+ beginning of the document.
+ """
+ if search_entire_document:
+ xml_endpos = html_endpos = len(markup)
+ else:
+ xml_endpos = 1024
+ html_endpos = max(2048, int(len(markup) * 0.05))
+
+ declared_encoding = None
+ declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+ if not declared_encoding_match and is_html:
+ declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+ if declared_encoding_match is not None:
+ declared_encoding = declared_encoding_match.groups()[0].decode(
+ 'ascii', 'replace')
+ if declared_encoding:
+ return declared_encoding.lower()
+ return None
+
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
@@ -175,66 +342,51 @@ class UnicodeDammit:
]
def __init__(self, markup, override_encodings=[],
- smart_quotes_to=None, is_html=False):
- self.declared_html_encoding = None
+ smart_quotes_to=None, is_html=False, exclude_encodings=[]):
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
+ self.is_html = is_html
+ self.log = logging.getLogger(__name__)
+ self.detector = EncodingDetector(
+ markup, override_encodings, is_html, exclude_encodings)
- if markup == '' or isinstance(markup, unicode):
+ # Short-circuit if the data is in Unicode to begin with.
+ if isinstance(markup, unicode) or markup == '':
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
- new_markup, document_encoding, sniffed_encoding = \
- self._detectEncoding(markup, is_html)
- self.markup = new_markup
+ # The encoding detector may have stripped a byte-order mark.
+ # Use the stripped markup from this point on.
+ self.markup = self.detector.markup
u = None
- if new_markup != markup:
- # _detectEncoding modified the markup, then converted it to
- # Unicode and then to UTF-8. So convert it from UTF-8.
- u = self._convert_from("utf8")
- self.original_encoding = sniffed_encoding
+ for encoding in self.detector.encodings:
+ markup = self.detector.markup
+ u = self._convert_from(encoding)
+ if u is not None:
+ break
if not u:
- for proposed_encoding in (
- override_encodings + [document_encoding, sniffed_encoding]):
- if proposed_encoding is not None:
- u = self._convert_from(proposed_encoding)
- if u:
- break
+ # None of the encodings worked. As an absolute last resort,
+ # try them again with character replacement.
- # If no luck and we have auto-detection library, try that:
- if not u and chardet and not isinstance(self.markup, unicode):
- u = self._convert_from(chardet.detect(self.markup)['encoding'])
-
- # As a last resort, try utf-8 and windows-1252:
- if not u:
- for proposed_encoding in ("utf-8", "windows-1252"):
- u = self._convert_from(proposed_encoding)
- if u:
- break
-
- # As an absolute last resort, try the encodings again with
- # character replacement.
- if not u:
- for proposed_encoding in (
- override_encodings + [
- document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
- if proposed_encoding != "ascii":
- u = self._convert_from(proposed_encoding, "replace")
+ for encoding in self.detector.encodings:
+ if encoding != "ascii":
+ u = self._convert_from(encoding, "replace")
if u is not None:
- warnings.warn(
- UnicodeWarning(
+ self.log.warning(
"Some characters could not be decoded, and were "
- "replaced with REPLACEMENT CHARACTER."))
+ "replaced with REPLACEMENT CHARACTER."
+ )
self.contains_replacement_characters = True
break
- # We could at this point force it to ASCII, but that would
- # destroy so much data that I think giving up is better
+ # If none of that worked, we could at this point force it to
+ # ASCII, but that would destroy so much data that I think
+ # giving up is better.
self.unicode_markup = u
if not u:
self.original_encoding = None
@@ -262,11 +414,10 @@ class UnicodeDammit:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
-
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
- and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+ and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
@@ -287,99 +438,24 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
+ return unicode(data, encoding, errors)
- # strip Byte Order Mark (if present)
- if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16be'
- data = data[2:]
- elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
- and (data[2:4] != '\x00\x00'):
- encoding = 'utf-16le'
- data = data[2:]
- elif data[:3] == '\xef\xbb\xbf':
- encoding = 'utf-8'
- data = data[3:]
- elif data[:4] == '\x00\x00\xfe\xff':
- encoding = 'utf-32be'
- data = data[4:]
- elif data[:4] == '\xff\xfe\x00\x00':
- encoding = 'utf-32le'
- data = data[4:]
- newdata = unicode(data, encoding, errors)
- return newdata
-
- def _detectEncoding(self, xml_data, is_html=False):
- """Given a document, tries to detect its XML encoding."""
- xml_encoding = sniffed_xml_encoding = None
- try:
- if xml_data[:4] == b'\x4c\x6f\xa7\x94':
- # EBCDIC
- xml_data = self._ebcdic_to_ascii(xml_data)
- elif xml_data[:4] == b'\x00\x3c\x00\x3f':
- # UTF-16BE
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
- and (xml_data[2:4] != b'\x00\x00'):
- # UTF-16BE with BOM
- sniffed_xml_encoding = 'utf-16be'
- xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
- elif xml_data[:4] == b'\x3c\x00\x3f\x00':
- # UTF-16LE
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
- elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
- (xml_data[2:4] != b'\x00\x00'):
- # UTF-16LE with BOM
- sniffed_xml_encoding = 'utf-16le'
- xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
- elif xml_data[:4] == b'\x00\x00\x00\x3c':
- # UTF-32BE
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
- elif xml_data[:4] == b'\x3c\x00\x00\x00':
- # UTF-32LE
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
- elif xml_data[:4] == b'\x00\x00\xfe\xff':
- # UTF-32BE with BOM
- sniffed_xml_encoding = 'utf-32be'
- xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
- elif xml_data[:4] == b'\xff\xfe\x00\x00':
- # UTF-32LE with BOM
- sniffed_xml_encoding = 'utf-32le'
- xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
- elif xml_data[:3] == b'\xef\xbb\xbf':
- # UTF-8 with BOM
- sniffed_xml_encoding = 'utf-8'
- xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
- else:
- sniffed_xml_encoding = 'ascii'
- pass
- except:
- xml_encoding_match = None
- xml_encoding_match = xml_encoding_re.match(xml_data)
- if not xml_encoding_match and is_html:
- xml_encoding_match = html_meta_re.search(xml_data)
- if xml_encoding_match is not None:
- xml_encoding = xml_encoding_match.groups()[0].decode(
- 'ascii').lower()
- if is_html:
- self.declared_html_encoding = xml_encoding
- if sniffed_xml_encoding and \
- (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
- 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
- 'utf-16', 'utf-32', 'utf_16', 'utf_32',
- 'utf16', 'u16')):
- xml_encoding = sniffed_xml_encoding
- return xml_data, xml_encoding, sniffed_xml_encoding
+ @property
+ def declared_html_encoding(self):
+ if not self.is_html:
+ return None
+ return self.detector.declared_encoding
def find_codec(self, charset):
- return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
- or (charset and self._codec(charset.replace("-", ""))) \
- or (charset and self._codec(charset.replace("-", "_"))) \
+ value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+ or (charset and self._codec(charset.replace("-", "")))
+ or (charset and self._codec(charset.replace("-", "_")))
+ or (charset and charset.lower())
or charset
+ )
+ if value:
+ return value.lower()
+ return None
def _codec(self, charset):
if not charset:
@@ -392,32 +468,6 @@ class UnicodeDammit:
pass
return codec
- EBCDIC_TO_ASCII_MAP = None
-
- def _ebcdic_to_ascii(self, s):
- c = self.__class__
- if not c.EBCDIC_TO_ASCII_MAP:
- emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
- 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
- 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
- 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
- 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
- 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
- 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
- 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
- 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
- 201,202,106,107,108,109,110,111,112,113,114,203,204,205,
- 206,207,208,209,126,115,116,117,118,119,120,121,122,210,
- 211,212,213,214,215,216,217,218,219,220,221,222,223,224,
- 225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
- 73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
- 82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
- 90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
- 250,251,252,253,254,255)
- import string
- c.EBCDIC_TO_ASCII_MAP = string.maketrans(
- ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
- return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
new file mode 100644
index 00000000..8768332f
--- /dev/null
+++ b/lib/bs4/diagnose.py
@@ -0,0 +1,219 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"
+
+import cProfile
+from StringIO import StringIO
+from HTMLParser import HTMLParser
+import bs4
+from bs4 import BeautifulSoup, __version__
+from bs4.builder import builder_registry
+
+import os
+import pstats
+import random
+import tempfile
+import time
+import traceback
+import sys
+
+def diagnose(data):
+ """Diagnostic suite for isolating common problems."""
+ print "Diagnostic running on Beautiful Soup %s" % __version__
+ print "Python version %s" % sys.version
+
+ basic_parsers = ["html.parser", "html5lib", "lxml"]
+ # Iterate over a copy, since the loop may remove entries below.
+ for name in list(basic_parsers):
+ for builder in builder_registry.builders:
+ if name in builder.features:
+ break
+ else:
+ basic_parsers.remove(name)
+ print (
+ "I noticed that %s is not installed. Installing it may help." %
+ name)
+
+ if 'lxml' in basic_parsers:
+ basic_parsers.append(["lxml", "xml"])
+ try:
+ from lxml import etree
+ print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+ except ImportError, e:
+ print (
+ "lxml is not installed or couldn't be imported.")
+
+
+ if 'html5lib' in basic_parsers:
+ try:
+ import html5lib
+ print "Found html5lib version %s" % html5lib.__version__
+ except ImportError, e:
+ print (
+ "html5lib is not installed or couldn't be imported.")
+
+ if hasattr(data, 'read'):
+ data = data.read()
+ elif os.path.exists(data):
+ print '"%s" looks like a filename. Reading data from the file.' % data
+ with open(data) as fp:
+ data = fp.read()
+ elif data.startswith("http:") or data.startswith("https:"):
+ print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
+ print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+ return
+ print
+
+ for parser in basic_parsers:
+ print "Trying to parse your markup with %s" % parser
+ success = False
+ try:
+ soup = BeautifulSoup(data, parser)
+ success = True
+ except Exception, e:
+ print "%s could not parse the markup." % parser
+ traceback.print_exc()
+ if success:
+ print "Here's what %s did with the markup:" % parser
+ print soup.prettify()
+
+ print "-" * 80
+
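+# Typical invocation (illustrative; "page.html" is a hypothetical file):
+#
+# from bs4.diagnose import diagnose
+# diagnose(open("page.html").read())
+#
+# This prints which parsers are installed and what each made of the markup.
+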
+def lxml_trace(data, html=True, **kwargs):
+ """Print out the lxml events that occur during parsing.
+
+ This lets you see how lxml parses a document when no Beautiful
+ Soup code is running.
+ """
+ from lxml import etree
+ for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+ print("%s, %4s, %s" % (event, element.tag, element.text))
+
+class AnnouncingParser(HTMLParser):
+ """Announces HTMLParser parse events, without doing anything else."""
+
+ def _p(self, s):
+ print(s)
+
+ def handle_starttag(self, name, attrs):
+ self._p("%s START" % name)
+
+ def handle_endtag(self, name):
+ self._p("%s END" % name)
+
+ def handle_data(self, data):
+ self._p("%s DATA" % data)
+
+ def handle_charref(self, name):
+ self._p("%s CHARREF" % name)
+
+ def handle_entityref(self, name):
+ self._p("%s ENTITYREF" % name)
+
+ def handle_comment(self, data):
+ self._p("%s COMMENT" % data)
+
+ def handle_decl(self, data):
+ self._p("%s DECL" % data)
+
+ def unknown_decl(self, data):
+ self._p("%s UNKNOWN-DECL" % data)
+
+ def handle_pi(self, data):
+ self._p("%s PI" % data)
+
+def htmlparser_trace(data):
+ """Print out the HTMLParser events that occur during parsing.
+
+ This lets you see how HTMLParser parses a document when no
+ Beautiful Soup code is running.
+ """
+ parser = AnnouncingParser()
+ parser.feed(data)
+
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+ "Generate a random word-like string."
+ s = ''
+ for i in range(length):
+ if i % 2 == 0:
+ t = _consonants
+ else:
+ t = _vowels
+ s += random.choice(t)
+ return s
+
+def rsentence(length=4):
+ "Generate a random sentence-like string."
+ return " ".join(rword(random.randint(4,9)) for i in range(length))
+
+def rdoc(num_elements=1000):
+ """Randomly generate an invalid HTML document."""
+ tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+ elements = []
+ for i in range(num_elements):
+ choice = random.randint(0,3)
+ if choice == 0:
+ # New tag.
+ tag_name = random.choice(tag_names)
+ elements.append("<%s>" % tag_name)
+ elif choice == 1:
+ elements.append(rsentence(random.randint(1,4)))
+ elif choice == 2:
+ # Close a tag.
+ tag_name = random.choice(tag_names)
+ elements.append("%s>" % tag_name)
+ return "" + "\n".join(elements) + ""
+
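+# rdoc() deliberately produces mismatched tags and stray text; that broken
+# markup is what exercises each parser's error recovery when it is fed to
+# BeautifulSoup in benchmark_parsers() below.
+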
+def benchmark_parsers(num_elements=100000):
+ """Very basic head-to-head performance benchmark."""
+ print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+ data = rdoc(num_elements)
+ print "Generated a large invalid HTML document (%d bytes)." % len(data)
+
+ for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+ success = False
+ try:
+ a = time.time()
+ soup = BeautifulSoup(data, parser)
+ b = time.time()
+ success = True
+ except Exception, e:
+ print "%s could not parse the markup." % parser
+ traceback.print_exc()
+ if success:
+ print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+
+ from lxml import etree
+ a = time.time()
+ etree.HTML(data)
+ b = time.time()
+ print "Raw lxml parsed the markup in %.2fs." % (b-a)
+
+ import html5lib
+ parser = html5lib.HTMLParser()
+ a = time.time()
+ parser.parse(data)
+ b = time.time()
+ print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
+def profile(num_elements=100000, parser="lxml"):
+
+ filehandle = tempfile.NamedTemporaryFile()
+ filename = filehandle.name
+
+ data = rdoc(num_elements)
+ vars = dict(bs4=bs4, data=data, parser=parser)
+ cProfile.runctx('bs4.BeautifulSoup(data, parser)', vars, vars, filename)
+
+ stats = pstats.Stats(filename)
+ # stats.strip_dirs()
+ stats.sort_stats("cumulative")
+ stats.print_stats('_html5lib|bs4', 50)
+
+if __name__ == '__main__':
+ diagnose(sys.stdin.read())
diff --git a/bs4/element.py b/lib/bs4/element.py
similarity index 64%
rename from bs4/element.py
rename to lib/bs4/element.py
index 4a4d3ed3..b100d18b 100644
--- a/bs4/element.py
+++ b/lib/bs4/element.py
@@ -1,5 +1,10 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"
+
import collections
import re
+import shlex
import sys
import warnings
from bs4.dammit import EntitySubstitution
@@ -26,6 +31,9 @@ class NamespacedAttribute(unicode):
def __new__(cls, prefix, name, namespace=None):
if name is None:
obj = unicode.__new__(cls, prefix)
+ elif prefix is None:
+ # Not really namespaced.
+ obj = unicode.__new__(cls, name)
else:
obj = unicode.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
@@ -78,6 +86,42 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
return match.group(1) + encoding
return self.CHARSET_RE.sub(rewrite, self.original_value)
+class HTMLAwareEntitySubstitution(EntitySubstitution):
+
+ """Entity substitution rules that are aware of some HTML quirks.
+
+ Specifically, the contents of <script> and <style> tags should not
+ undergo entity substitution.
+