Put libs/ in sys.path

Update bs4 to latest version to fix issues Get clean modules using `pip install --upgrade --target=lib` Move cherrypy, mako, pystun, bs4 into lib directory
2016-08-23 00:01:41 -06:00 · 2016-08-23 00:01:41 -06:00 · 69eeb9b49d
parent a850e386d4
commit 69eeb9b49d
311 changed files with 8861 additions and 6577 deletions
--- a/Mylar.py
+++ b/Mylar.py
@ -21,16 +21,14 @@ import time
 import threading
 import signal

-from lib.configobj import ConfigObj
+sys.path.insert(1, os.path.join(os.path.dirname(__file__), 'lib'))

 import mylar

 from mylar import webstart, logger, filechecker, versioncheck

-try:
-    import argparse
-except ImportError:
-    import lib.argparse as argparse
+import argparse
+

 if ( sys.platform == 'win32' and sys.executable.split( '\\' )[-1] == 'pythonw.exe'):
    sys.stdout = open(os.devnull, "w")
@ -198,6 +196,7 @@ def main():

            i += 1

+    from configobj import ConfigObj
    mylar.CFG = ConfigObj(mylar.CONFIG_FILE, encoding='utf-8')

    # Rename the main thread
--- a/bs4/init.py
+++ b/bs4/init.py
@ -1,355 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a pluggable XML or HTML parser to parse a
-(possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
-
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
-and/or html5lib is installed.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-"""
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.1"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
-__license__ = "MIT"
-
-__all__ = ['BeautifulSoup']
-
-import re
-import warnings
-
-from .builder import builder_registry
-from .dammit import UnicodeDammit
-from .element import (
-    CData,
-    Comment,
-    DEFAULT_OUTPUT_ENCODING,
-    Declaration,
-    Doctype,
-    NavigableString,
-    PageElement,
-    ProcessingInstruction,
-    ResultSet,
-    SoupStrainer,
-    Tag,
-    )
-
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
-class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
-
-    These methods will be called by the parser:
-      reset()
-      feed(markup)
-
-    The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
-
-    No matter how complicated the underlying parser is, you should be
-    able to build a tree using 'start tag' events, 'end tag' events,
-    'data' events, and "done with data" events.
-
-    If you encounter an empty-element tag (aka a self-closing tag,
-    like HTML's <br> tag), call handle_starttag and then
-    handle_endtag.
-    """
-    ROOT_TAG_NAME = u'[document]'
-
-    # If the end-user gives no indication which tree builder they
-    # want, look for one with these features.
-    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
-    # Used when determining whether a text node is all whitespace and
-    # can be replaced with a single space. A text node that contains
-    # fancy Unicode spaces (usually non-breaking) should be left
-    # alone.
-    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
-
-    def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
-
-        if 'convertEntities' in kwargs:
-            warnings.warn(
-                "BS4 does not respect the convertEntities argument to the "
-                "BeautifulSoup constructor. Entities are always converted "
-                "to Unicode characters.")
-
-        if 'markupMassage' in kwargs:
-            del kwargs['markupMassage']
-            warnings.warn(
-                "BS4 does not respect the markupMassage argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for any necessary markup massage.")
-
-        if 'smartQuotesTo' in kwargs:
-            del kwargs['smartQuotesTo']
-            warnings.warn(
-                "BS4 does not respect the smartQuotesTo argument to the "
-                "BeautifulSoup constructor. Smart quotes are always converted "
-                "to Unicode characters.")
-
-        if 'selfClosingTags' in kwargs:
-            del kwargs['selfClosingTags']
-            warnings.warn(
-                "BS4 does not respect the selfClosingTags argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for understanding self-closing tags.")
-
-        if 'isHTML' in kwargs:
-            del kwargs['isHTML']
-            warnings.warn(
-                "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
-
-        def deprecated_argument(old_name, new_name):
-            if old_name in kwargs:
-                warnings.warn(
-                    'The "%s" argument to the BeautifulSoup constructor '
-                    'has been renamed to "%s."' % (old_name, new_name))
-                value = kwargs[old_name]
-                del kwargs[old_name]
-                return value
-            return None
-
-        parse_only = parse_only or deprecated_argument(
-            "parseOnlyThese", "parse_only")
-
-        from_encoding = from_encoding or deprecated_argument(
-            "fromEncoding", "from_encoding")
-
-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-
-        if builder is None:
-            if isinstance(features, basestring):
-                features = [features]
-            if features is None or len(features) == 0:
-                features = self.DEFAULT_BUILDER_FEATURES
-            builder_class = builder_registry.lookup(*features)
-            if builder_class is None:
-                raise ValueError(
-                    "Couldn't find a tree builder with the features you "
-                    "requested: %s. Do you need to install a parser library?"
-                    % ",".join(features))
-            builder = builder_class()
-        self.builder = builder
-        self.is_xml = builder.is_xml
-        self.builder.soup = self
-
-        self.parse_only = parse_only
-
-        self.reset()
-
-        if hasattr(markup, 'read'):        # It's a file-type object.
-            markup = markup.read()
-        (self.markup, self.original_encoding, self.declared_html_encoding,
-         self.contains_replacement_characters) = (
-            self.builder.prepare_markup(markup, from_encoding))
-
-        try:
-            self._feed()
-        except StopParsing:
-            pass
-
-        # Clear out the markup and remove the builder's circular
-        # reference to this object.
-        self.markup = None
-        self.builder.soup = None
-
-    def _feed(self):
-        # Convert the document to Unicode.
-        self.builder.reset()
-
-        self.builder.feed(self.markup)
-        # Close out any unfinished strings and close all the open tags.
-        self.endData()
-        while self.currentTag.name != self.ROOT_TAG_NAME:
-            self.popTag()
-
-    def reset(self):
-        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
-        self.hidden = 1
-        self.builder.reset()
-        self.currentData = []
-        self.currentTag = None
-        self.tagStack = []
-        self.pushTag(self)
-
-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
-        """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
-
-    def new_string(self, s):
-        """Create a new NavigableString associated with this soup."""
-        navigable = NavigableString(s)
-        navigable.setup()
-        return navigable
-
-    def insert_before(self, successor):
-        raise ValueError("BeautifulSoup objects don't support insert_before().")
-
-    def insert_after(self, successor):
-        raise ValueError("BeautifulSoup objects don't support insert_after().")
-
-    def popTag(self):
-        tag = self.tagStack.pop()
-        #print "Pop", tag.name
-        if self.tagStack:
-            self.currentTag = self.tagStack[-1]
-        return self.currentTag
-
-    def pushTag(self, tag):
-        #print "Push", tag.name
-        if self.currentTag:
-            self.currentTag.contents.append(tag)
-        self.tagStack.append(tag)
-        self.currentTag = self.tagStack[-1]
-
-    def endData(self, containerClass=NavigableString):
-        if self.currentData:
-            currentData = u''.join(self.currentData)
-            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
-                not set([tag.name for tag in self.tagStack]).intersection(
-                    self.builder.preserve_whitespace_tags)):
-                if '\n' in currentData:
-                    currentData = '\n'
-                else:
-                    currentData = ' '
-            self.currentData = []
-            if self.parse_only and len(self.tagStack) <= 1 and \
-                   (not self.parse_only.text or \
-                    not self.parse_only.search(currentData)):
-                return
-            o = containerClass(currentData)
-            self.object_was_parsed(o)
-
-    def object_was_parsed(self, o):
-        """Add an object to the parse tree."""
-        o.setup(self.currentTag, self.previous_element)
-        if self.previous_element:
-            self.previous_element.next_element = o
-        self.previous_element = o
-        self.currentTag.contents.append(o)
-
-    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
-        """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
-        if name == self.ROOT_TAG_NAME:
-            return
-
-        numPops = 0
-        mostRecentTag = None
-
-        for i in range(len(self.tagStack) - 1, 0, -1):
-            if (name == self.tagStack[i].name
-                and nsprefix == self.tagStack[i].prefix):
-                numPops = len(self.tagStack) - i
-                break
-        if not inclusivePop:
-            numPops = numPops - 1
-
-        for i in range(0, numPops):
-            mostRecentTag = self.popTag()
-        return mostRecentTag
-
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
-        """Push a start tag on to the stack.
-
-        If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
-        in the document. For instance, if this was a self-closing tag,
-        don't call handle_endtag.
-        """
-
-        # print "Start tag %s: %s" % (name, attrs)
-        self.endData()
-
-        if (self.parse_only and len(self.tagStack) <= 1
-            and (self.parse_only.text
-                 or not self.parse_only.search_tag(name, attrs))):
-            return None
-
-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self.previous_element)
-        if tag is None:
-            return tag
-        if self.previous_element:
-            self.previous_element.next_element = tag
-        self.previous_element = tag
-        self.pushTag(tag)
-        return tag
-
-    def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
-        self.endData()
-        self._popToTag(name, nsprefix)
-
-    def handle_data(self, data):
-        self.currentData.append(data)
-
-    def decode(self, pretty_print=False,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
-
-        if self.is_xml:
-            # Print the XML declaration
-            encoding_part = ''
-            if eventual_encoding != None:
-                encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
-        else:
-            prefix = u''
-        if not pretty_print:
-            indent_level = None
-        else:
-            indent_level = 0
-        return prefix + super(BeautifulSoup, self).decode(
-            indent_level, eventual_encoding, formatter)
-
-class BeautifulStoneSoup(BeautifulSoup):
-    """Deprecated interface to an XML parser."""
-
-    def __init__(self, *args, **kwargs):
-        kwargs['features'] = 'xml'
-        warnings.warn(
-            'The BeautifulStoneSoup class is deprecated. Instead of using '
-            'it, pass features="xml" into the BeautifulSoup constructor.')
-        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
-
-
-class StopParsing(Exception):
-    pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
-    import sys
-    soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
--- a/bs4/builder/_html5lib.py
+++ b/bs4/builder/_html5lib.py
@ -1,222 +0,0 @@
-__all__ = [
-    'HTML5TreeBuilder',
-    ]
-
-import warnings
-from bs4.builder import (
-    PERMISSIVE,
-    HTML,
-    HTML_5,
-    HTMLTreeBuilder,
-    )
-from bs4.element import NamespacedAttribute
-import html5lib
-from html5lib.constants import namespaces
-from bs4.element import (
-    Comment,
-    Doctype,
-    NavigableString,
-    Tag,
-    )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
-
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
-    def prepare_markup(self, markup, user_specified_encoding):
-        # Store the user-specified encoding for use later on.
-        self.user_specified_encoding = user_specified_encoding
-        return markup, None, None, False
-
-    # These methods are defined by Beautiful Soup.
-    def feed(self, markup):
-        if self.soup.parse_only is not None:
-            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
-        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
-        # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
-            # We need to special-case this because html5lib sets
-            # charEncoding to UTF-8 if it gets Unicode input.
-            doc.original_encoding = None
-        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
-    def create_treebuilder(self, namespaceHTMLElements):
-        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
-        return self.underlying_builder
-
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
-        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
-    def documentClass(self):
-        self.soup.reset()
-        return Element(self.soup, self.soup, None)
-
-    def insertDoctype(self, token):
-        name = token["name"]
-        publicId = token["publicId"]
-        systemId = token["systemId"]
-
-        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
-        self.soup.object_was_parsed(doctype)
-
-    def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
-        return Element(tag, self.soup, namespace)
-
-    def commentClass(self, data):
-        return TextNode(Comment(data), self.soup)
-
-    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
-        self.soup.name = "[document_fragment]"
-        return Element(self.soup, self.soup, None)
-
-    def appendChild(self, node):
-        # XXX This code is not covered by the BS4 tests.
-        self.soup.append(node.element)
-
-    def getDocument(self):
-        return self.soup
-
-    def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
-    def __init__(self, element):
-        self.element = element
-        self.attrs = dict(self.element.attrs)
-    def __iter__(self):
-        return list(self.attrs.items()).__iter__()
-    def __setitem__(self, name, value):
-        "set attr", name, value
-        self.element[name] = value
-    def items(self):
-        return list(self.attrs.items())
-    def keys(self):
-        return list(self.attrs.keys())
-    def __len__(self):
-        return len(self.attrs)
-    def __getitem__(self, name):
-        return self.attrs[name]
-    def __contains__(self, name):
-        return name in list(self.attrs.keys())
-
-
-class Element(html5lib.treebuilders._base.Node):
-    def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
-        self.element = element
-        self.soup = soup
-        self.namespace = namespace
-
-    def appendChild(self, node):
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[-1].__class__ == NavigableString):
-            # Concatenate new text onto old text node
-            # XXX This has O(n^2) performance, for input like
-            # "a</a>a</a>a</a>..."
-            old_element = self.element.contents[-1]
-            new_element = self.soup.new_string(old_element + node.element)
-            old_element.replace_with(new_element)
-        else:
-            self.element.append(node.element)
-            node.parent = self
-
-    def getAttributes(self):
-        return AttrList(self.element)
-
-    def setAttributes(self, attributes):
-        if attributes is not None and len(attributes) > 0:
-
-            converted_attributes = []
-            for name, value in list(attributes.items()):
-                if isinstance(name, tuple):
-                    new_name = NamespacedAttribute(*name)
-                    del attributes[name]
-                    attributes[new_name] = value
-
-            self.soup.builder._replace_cdata_list_attribute_values(
-                self.name, attributes)
-            for name, value in attributes.items():
-                self.element[name] = value
-
-            # The attributes may contain variables that need substitution.
-            # Call set_up_substitutions manually.
-            #
-            # The Tag constructor called this method when the Tag was created,
-            # but we just set/changed the attributes, so call it again.
-            self.soup.builder.set_up_substitutions(self.element)
-    attributes = property(getAttributes, setAttributes)
-
-    def insertText(self, data, insertBefore=None):
-        text = TextNode(self.soup.new_string(data), self.soup)
-        if insertBefore:
-            self.insertBefore(text, insertBefore)
-        else:
-            self.appendChild(text)
-
-    def insertBefore(self, node, refNode):
-        index = self.element.index(refNode.element)
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[index-1].__class__ == NavigableString):
-            # (See comments in appendChild)
-            old_node = self.element.contents[index-1]
-            new_str = self.soup.new_string(old_node + node.element)
-            old_node.replace_with(new_str)
-        else:
-            self.element.insert(index, node.element)
-            node.parent = self
-
-    def removeChild(self, node):
-        node.element.extract()
-
-    def reparentChildren(self, newParent):
-        while self.element.contents:
-            child = self.element.contents[0]
-            child.extract()
-            if isinstance(child, Tag):
-                newParent.appendChild(
-                    Element(child, self.soup, namespaces["html"]))
-            else:
-                newParent.appendChild(
-                    TextNode(child, self.soup))
-
-    def cloneNode(self):
-        tag = self.soup.new_tag(self.element.name, self.namespace)
-        node = Element(tag, self.soup, self.namespace)
-        for key,value in self.attributes:
-            node.attributes[key] = value
-        return node
-
-    def hasContent(self):
-        return self.element.contents
-
-    def getNameTuple(self):
-        if self.namespace == None:
-            return namespaces["html"], self.name
-        else:
-            return self.namespace, self.name
-
-    nameTuple = property(getNameTuple)
-
-class TextNode(Element):
-    def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
-        self.element = element
-        self.soup = soup
-
-    def cloneNode(self):
-        raise NotImplementedError
--- a/bs4/tests/test_html5lib.py
+++ b/bs4/tests/test_html5lib.py
@ -1,58 +0,0 @@
-"""Tests to ensure that the html5lib tree builder generates good trees."""
-
-import warnings
-
-try:
-    from bs4.builder import HTML5TreeBuilder
-    HTML5LIB_PRESENT = True
-except ImportError, e:
-    HTML5LIB_PRESENT = False
-from bs4.element import SoupStrainer
-from bs4.testing import (
-    HTML5TreeBuilderSmokeTest,
-    SoupTest,
-    skipIf,
-)
-
-@skipIf(
-    not HTML5LIB_PRESENT,
-    "html5lib seems not to be present, not testing its tree builder.")
-class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
-    """See ``HTML5TreeBuilderSmokeTest``."""
-
-    @property
-    def default_builder(self):
-        return HTML5TreeBuilder()
-
-    def test_soupstrainer(self):
-        # The html5lib tree builder does not support SoupStrainers.
-        strainer = SoupStrainer("b")
-        markup = "<p>A <b>bold</b> statement.</p>"
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup(markup, parse_only=strainer)
-        self.assertEqual(
-            soup.decode(), self.document_for(markup))
-
-        self.assertTrue(
-            "the html5lib tree builder doesn't support parse_only" in
-            str(w[0].message))
-
-    def test_correctly_nested_tables(self):
-        """html5lib inserts <tbody> tags where other parsers don't."""
-        markup = ('<table id="1">'
-                  '<tr>'
-                  "<td>Here's another table:"
-                  '<table id="2">'
-                  '<tr><td>foo</td></tr>'
-                  '</table></td>')
-
-        self.assertSoupEquals(
-            markup,
-            '<table id="1"><tbody><tr><td>Here\'s another table:'
-            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
-            '</td></tr></tbody></table>')
-
-        self.assertSoupEquals(
-            "<table><thead><tr><td>Foo</td></tr></thead>"
-            "<tbody><tr><td>Bar</td></tr></tbody>"
-            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
--- a/comictagger.py
+++ b/comictagger.py
@ -1,5 +1,5 @@
 #!/usr/bin/env python
-from lib.comictaggerlib.main import ctmain
+from comictaggerlib.main import ctmain

 if __name__ == '__main__':
    ctmain()
--- a/lib/apscheduler/init.py
+++ b/lib/apscheduler/init.py
@ -1,3 +1,3 @@
-version_info = (2, 0, 0, 'rc', 2)
+version_info = (2, 0, 0)
 version = '.'.join(str(n) for n in version_info[:3])
 release = version + ''.join(str(n) for n in version_info[3:])
--- a/lib/apscheduler/events.py
+++ b/lib/apscheduler/events.py
--- a/lib/apscheduler/job.py
+++ b/lib/apscheduler/job.py
@ -5,7 +5,7 @@ Jobs represent scheduled tasks.
 from threading import Lock
 from datetime import timedelta

-from lib.apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\
+from apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\
    obj_to_ref


--- a/lib/apscheduler/jobstores/init.py
+++ b/lib/apscheduler/jobstores/init.py
--- a/lib/apscheduler/jobstores/base.py
+++ b/lib/apscheduler/jobstores/base.py
--- a/lib/apscheduler/jobstores/mongodb_store.py
+++ b/lib/apscheduler/jobstores/mongodb_store.py
@ -3,8 +3,8 @@ Stores jobs in a MongoDB database.
 """
 import logging

-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job

 try:
    import cPickle as pickle
--- a/lib/apscheduler/jobstores/ram_store.py
+++ b/lib/apscheduler/jobstores/ram_store.py
@ -2,7 +2,7 @@
 Stores jobs in an array in RAM. Provides no persistence support.
 """

-from lib.apscheduler.jobstores.base import JobStore
+from apscheduler.jobstores.base import JobStore


 class RAMJobStore(JobStore):
--- a/lib/apscheduler/jobstores/shelve_store.py
+++ b/lib/apscheduler/jobstores/shelve_store.py
@ -7,9 +7,9 @@ import pickle
 import random
 import logging

-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
-from lib.apscheduler.util import itervalues
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job
+from apscheduler.util import itervalues

 logger = logging.getLogger(__name__)

--- a/lib/apscheduler/jobstores/sqlalchemy_store.py
+++ b/lib/apscheduler/jobstores/sqlalchemy_store.py
@ -4,8 +4,8 @@ Stores jobs in a database table using SQLAlchemy.
 import pickle
 import logging

-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job

 try:
    from sqlalchemy import *
--- a/lib/apscheduler/scheduler.py
+++ b/lib/apscheduler/scheduler.py
@ -9,12 +9,12 @@ from logging import getLogger
 import os
 import sys

-from lib.apscheduler.util import *
-from lib.apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
-from lib.apscheduler.jobstores.ram_store import RAMJobStore
-from lib.apscheduler.job import Job, MaxInstancesReachedError
-from lib.apscheduler.events import *
-from lib.apscheduler.threadpool import ThreadPool
+from apscheduler.util import *
+from apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
+from apscheduler.jobstores.ram_store import RAMJobStore
+from apscheduler.job import Job, MaxInstancesReachedError
+from apscheduler.events import *
+from apscheduler.threadpool import ThreadPool

 logger = getLogger(__name__)

--- a/lib/apscheduler/threadpool.py
+++ b/lib/apscheduler/threadpool.py
--- a/lib/apscheduler/triggers/init.py
+++ b/lib/apscheduler/triggers/init.py
@ -1,3 +1,3 @@
-from lib.apscheduler.triggers.cron import CronTrigger
-from lib.apscheduler.triggers.interval import IntervalTrigger
-from lib.apscheduler.triggers.simple import SimpleTrigger
+from apscheduler.triggers.cron import CronTrigger
+from apscheduler.triggers.interval import IntervalTrigger
+from apscheduler.triggers.simple import SimpleTrigger
--- a/lib/apscheduler/triggers/cron/init.py
+++ b/lib/apscheduler/triggers/cron/init.py
@ -1,7 +1,7 @@
 from datetime import date, datetime

-from lib.apscheduler.triggers.cron.fields import *
-from lib.apscheduler.util import datetime_ceil, convert_to_datetime
+from apscheduler.triggers.cron.fields import *
+from apscheduler.util import datetime_ceil, convert_to_datetime


 class CronTrigger(object):
--- a/lib/apscheduler/triggers/cron/expressions.py
+++ b/lib/apscheduler/triggers/cron/expressions.py
@ -5,7 +5,7 @@ This module contains the expressions applicable for CronTrigger's fields.
 from calendar import monthrange
 import re

-from lib.apscheduler.util import asint
+from apscheduler.util import asint

 __all__ = ('AllExpression', 'RangeExpression', 'WeekdayRangeExpression',
           'WeekdayPositionExpression')
--- a/lib/apscheduler/triggers/cron/fields.py
+++ b/lib/apscheduler/triggers/cron/fields.py
@ -5,7 +5,7 @@ fields.

 from calendar import monthrange

-from lib.apscheduler.triggers.cron.expressions import *
+from apscheduler.triggers.cron.expressions import *

 __all__ = ('MIN_VALUES', 'MAX_VALUES', 'DEFAULT_VALUES', 'BaseField',
           'WeekField', 'DayOfMonthField', 'DayOfWeekField')
--- a/lib/apscheduler/triggers/interval.py
+++ b/lib/apscheduler/triggers/interval.py
@ -1,7 +1,7 @@
 from datetime import datetime, timedelta
 from math import ceil

-from lib.apscheduler.util import convert_to_datetime, timedelta_seconds
+from apscheduler.util import convert_to_datetime, timedelta_seconds


 class IntervalTrigger(object):
--- a/lib/apscheduler/triggers/simple.py
+++ b/lib/apscheduler/triggers/simple.py
@ -1,4 +1,4 @@
-from lib.apscheduler.util import convert_to_datetime
+from apscheduler.util import convert_to_datetime


 class SimpleTrigger(object):
--- a/lib/apscheduler/util.py
+++ b/lib/apscheduler/util.py
@ -6,6 +6,7 @@ from datetime import date, datetime, timedelta
 from time import mktime
 import re
 import sys
+from types import MethodType

 __all__ = ('asint', 'asbool', 'convert_to_datetime', 'timedelta_seconds',
           'time_difference', 'datetime_ceil', 'combine_opts',
@ -108,7 +109,7 @@ def datetime_ceil(dateval):
    """
    if dateval.microsecond > 0:
        return dateval + timedelta(seconds=1,
-                                   microseconds=-dateval.microsecond)
+                                   microseconds= -dateval.microsecond)
    return dateval


@ -137,41 +138,64 @@ def get_callable_name(func):
    """
    Returns the best available display name for the given function/callable.
    """
-    name = func.__module__
-    if hasattr(func, '__self__') and func.__self__:
-        name += '.' + func.__self__.__name__
-    elif hasattr(func, 'im_self') and func.im_self:     # py2.4, 2.5
-        name += '.' + func.im_self.__name__
-    if hasattr(func, '__name__'):
-        name += '.' + func.__name__
-    return name
+    f_self = getattr(func, '__self__', None) or getattr(func, 'im_self', None)
+
+    if f_self and hasattr(func, '__name__'):
+        if isinstance(f_self, type):
+            # class method
+            return '%s.%s' % (f_self.__name__, func.__name__)
+        # bound method
+        return '%s.%s' % (f_self.__class__.__name__, func.__name__)
+
+    if hasattr(func, '__call__'):
+        if hasattr(func, '__name__'):
+            # function, unbound method or a class with a __call__ method
+            return func.__name__
+        # instance of a class with a __call__ method
+        return func.__class__.__name__
+
+    raise TypeError('Unable to determine a name for %s -- '
+                    'maybe it is not a callable?' % repr(func))


 def obj_to_ref(obj):
    """
    Returns the path to the given object.
    """
-    ref = '%s:%s' % (obj.__module__, obj.__name__)
+    ref = '%s:%s' % (obj.__module__, get_callable_name(obj))
    try:
        obj2 = ref_to_obj(ref)
-    except AttributeError:
-        pass
-    else:
-        if obj2 == obj:
-            return ref
+        if obj != obj2:
+            raise ValueError
+    except Exception:
+        raise ValueError('Cannot determine the reference to %s' % repr(obj))
    
-    raise ValueError('Only module level objects are supported')
+    return ref


 def ref_to_obj(ref):
    """
    Returns the object pointed to by ``ref``.
    """
+    if not isinstance(ref, basestring):
+        raise TypeError('References must be strings')
+    if not ':' in ref:
+        raise ValueError('Invalid reference')
+
    modulename, rest = ref.split(':', 1)
-    obj = __import__(modulename)
-    for name in modulename.split('.')[1:] + rest.split('.'):
-        obj = getattr(obj, name)
-    return obj
+    try:
+        obj = __import__(modulename)
+    except ImportError:
+        raise LookupError('Error resolving reference %s: '
+                          'could not import module' % ref)
+
+    try:
+        for name in modulename.split('.')[1:] + rest.split('.'):
+            obj = getattr(obj, name)
+        return obj
+    except Exception:
+        raise LookupError('Error resolving reference %s: '
+                          'error looking up object' % ref)


 def maybe_ref(ref):
@ -191,14 +215,16 @@ def to_unicode(string, encoding='ascii'):
    """
    if hasattr(string, 'decode'):
        return string.decode(encoding, 'ignore')
-    return string
+    return string  # pragma: nocover


 if sys.version_info < (3, 0):  # pragma: nocover
    iteritems = lambda d: d.iteritems()
    itervalues = lambda d: d.itervalues()
    xrange = xrange
+    basestring = basestring
 else:  # pragma: nocover
    iteritems = lambda d: d.items()
    itervalues = lambda d: d.values()
    xrange = range
+    basestring = str
--- a/lib/bs4/init.py
+++ b/lib/bs4/init.py
@ -0,0 +1,529 @@
+"""Beautiful Soup
+Elixir and Tonic
+"The Screen-Scraper's Friend"
+http://www.crummy.com/software/BeautifulSoup/
+
+Beautiful Soup uses a pluggable XML or HTML parser to parse a
+(possibly invalid) document into a tree representation. Beautiful Soup
+provides methods and Pythonic idioms that make it easy to navigate,
+search, and modify the parse tree.
+
+Beautiful Soup works with Python 2.7 and up. It works better if lxml
+and/or html5lib is installed.
+
+For more than you ever wanted to know about Beautiful Soup, see the
+documentation:
+http://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
+"""
+
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+__author__ = "Leonard Richardson (leonardr@segfault.org)"
+__version__ = "4.5.1"
+__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
+__license__ = "MIT"
+
+__all__ = ['BeautifulSoup']
+
+import os
+import re
+import traceback
+import warnings
+
+from .builder import builder_registry, ParserRejectedMarkup
+from .dammit import UnicodeDammit
+from .element import (
+    CData,
+    Comment,
+    DEFAULT_OUTPUT_ENCODING,
+    Declaration,
+    Doctype,
+    NavigableString,
+    PageElement,
+    ProcessingInstruction,
+    ResultSet,
+    SoupStrainer,
+    Tag,
+    )
+
+# The very first thing we do is give a useful error if someone is
+# running this code under Python 3 without converting it.
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+
+class BeautifulSoup(Tag):
+    """
+    This class defines the basic interface called by the tree builders.
+
+    These methods will be called by the parser:
+      reset()
+      feed(markup)
+
+    The tree builder may call these methods from its feed() implementation:
+      handle_starttag(name, attrs) # See note about return value
+      handle_endtag(name)
+      handle_data(data) # Appends to the current data node
+      endData(containerClass=NavigableString) # Ends the current data node
+
+    No matter how complicated the underlying parser is, you should be
+    able to build a tree using 'start tag' events, 'end tag' events,
+    'data' events, and "done with data" events.
+
+    If you encounter an empty-element tag (aka a self-closing tag,
+    like HTML's <br> tag), call handle_starttag and then
+    handle_endtag.
+    """
+    ROOT_TAG_NAME = u'[document]'
+
+    # If the end-user gives no indication which tree builder they
+    # want, look for one with these features.
+    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
+
+    ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
+
+    NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+
+    def __init__(self, markup="", features=None, builder=None,
+                 parse_only=None, from_encoding=None, exclude_encodings=None,
+                 **kwargs):
+        """The Soup object is initialized as the 'root tag', and the
+        provided markup (which can be a string or a file-like object)
+        is fed into the underlying parser."""
+
+        if 'convertEntities' in kwargs:
+            warnings.warn(
+                "BS4 does not respect the convertEntities argument to the "
+                "BeautifulSoup constructor. Entities are always converted "
+                "to Unicode characters.")
+
+        if 'markupMassage' in kwargs:
+            del kwargs['markupMassage']
+            warnings.warn(
+                "BS4 does not respect the markupMassage argument to the "
+                "BeautifulSoup constructor. The tree builder is responsible "
+                "for any necessary markup massage.")
+
+        if 'smartQuotesTo' in kwargs:
+            del kwargs['smartQuotesTo']
+            warnings.warn(
+                "BS4 does not respect the smartQuotesTo argument to the "
+                "BeautifulSoup constructor. Smart quotes are always converted "
+                "to Unicode characters.")
+
+        if 'selfClosingTags' in kwargs:
+            del kwargs['selfClosingTags']
+            warnings.warn(
+                "BS4 does not respect the selfClosingTags argument to the "
+                "BeautifulSoup constructor. The tree builder is responsible "
+                "for understanding self-closing tags.")
+
+        if 'isHTML' in kwargs:
+            del kwargs['isHTML']
+            warnings.warn(
+                "BS4 does not respect the isHTML argument to the "
+                "BeautifulSoup constructor. Suggest you use "
+                "features='lxml' for HTML and features='lxml-xml' for "
+                "XML.")
+
+        def deprecated_argument(old_name, new_name):
+            if old_name in kwargs:
+                warnings.warn(
+                    'The "%s" argument to the BeautifulSoup constructor '
+                    'has been renamed to "%s."' % (old_name, new_name))
+                value = kwargs[old_name]
+                del kwargs[old_name]
+                return value
+            return None
+
+        parse_only = parse_only or deprecated_argument(
+            "parseOnlyThese", "parse_only")
+
+        from_encoding = from_encoding or deprecated_argument(
+            "fromEncoding", "from_encoding")
+
+        if from_encoding and isinstance(markup, unicode):
+            warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
+            from_encoding = None
+
+        if len(kwargs) > 0:
+            arg = kwargs.keys().pop()
+            raise TypeError(
+                "__init__() got an unexpected keyword argument '%s'" % arg)
+
+        if builder is None:
+            original_features = features
+            if isinstance(features, basestring):
+                features = [features]
+            if features is None or len(features) == 0:
+                features = self.DEFAULT_BUILDER_FEATURES
+            builder_class = builder_registry.lookup(*features)
+            if builder_class is None:
+                raise FeatureNotFound(
+                    "Couldn't find a tree builder with the features you "
+                    "requested: %s. Do you need to install a parser library?"
+                    % ",".join(features))
+            builder = builder_class()
+            if not (original_features == builder.NAME or
+                    original_features in builder.ALTERNATE_NAMES):
+                if builder.is_xml:
+                    markup_type = "XML"
+                else:
+                    markup_type = "HTML"
+
+                caller = traceback.extract_stack()[0]
+                filename = caller[0]
+                line_number = caller[1]
+                warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
+                    filename=filename,
+                    line_number=line_number,
+                    parser=builder.NAME,
+                    markup_type=markup_type))
+
+        self.builder = builder
+        self.is_xml = builder.is_xml
+        self.known_xml = self.is_xml
+        self.builder.soup = self
+
+        self.parse_only = parse_only
+
+        if hasattr(markup, 'read'):        # It's a file-type object.
+            markup = markup.read()
+        elif len(markup) <= 256 and (
+                (isinstance(markup, bytes) and not b'<' in markup)
+                or (isinstance(markup, unicode) and not u'<' in markup)
+        ):
+            # Print out warnings for a couple beginner problems
+            # involving passing non-markup to Beautiful Soup.
+            # Beautiful Soup will still parse the input as markup,
+            # just in case that's what the user really wants.
+            if (isinstance(markup, unicode)
+                and not os.path.supports_unicode_filenames):
+                possible_filename = markup.encode("utf8")
+            else:
+                possible_filename = markup
+            is_file = False
+            try:
+                is_file = os.path.exists(possible_filename)
+            except Exception, e:
+                # This is almost certainly a problem involving
+                # characters not valid in filenames on this
+                # system. Just let it go.
+                pass
+            if is_file:
+                if isinstance(markup, unicode):
+                    markup = markup.encode("utf8")
+                warnings.warn(
+                    '"%s" looks like a filename, not markup. You should'
+                    'probably open this file and pass the filehandle into'
+                    'Beautiful Soup.' % markup)
+            self._check_markup_is_url(markup)
+
+        for (self.markup, self.original_encoding, self.declared_html_encoding,
+         self.contains_replacement_characters) in (
+             self.builder.prepare_markup(
+                 markup, from_encoding, exclude_encodings=exclude_encodings)):
+            self.reset()
+            try:
+                self._feed()
+                break
+            except ParserRejectedMarkup:
+                pass
+
+        # Clear out the markup and remove the builder's circular
+        # reference to this object.
+        self.markup = None
+        self.builder.soup = None
+
+    def __copy__(self):
+        copy = type(self)(
+            self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
+        )
+
+        # Although we encoded the tree to UTF-8, that may not have
+        # been the encoding of the original markup. Set the copy's
+        # .original_encoding to reflect the original object's
+        # .original_encoding.
+        copy.original_encoding = self.original_encoding
+        return copy
+
+    def __getstate__(self):
+        # Frequently a tree builder can't be pickled.
+        d = dict(self.__dict__)
+        if 'builder' in d and not self.builder.picklable:
+            d['builder'] = None
+        return d
+
+    @staticmethod
+    def _check_markup_is_url(markup):
+        """ 
+        Check if markup looks like it's actually a url and raise a warning 
+        if so. Markup can be unicode or str (py2) / bytes (py3).
+        """
+        if isinstance(markup, bytes):
+            space = b' '
+            cant_start_with = (b"http:", b"https:")
+        elif isinstance(markup, unicode):
+            space = u' '
+            cant_start_with = (u"http:", u"https:")
+        else:
+            return
+
+        if any(markup.startswith(prefix) for prefix in cant_start_with):
+            if not space in markup:
+                if isinstance(markup, bytes):
+                    decoded_markup = markup.decode('utf-8', 'replace')
+                else:
+                    decoded_markup = markup
+                warnings.warn(
+                    '"%s" looks like a URL. Beautiful Soup is not an'
+                    ' HTTP client. You should probably use an HTTP client like'
+                    ' requests to get the document behind the URL, and feed'
+                    ' that document to Beautiful Soup.' % decoded_markup
+                )
+
+    def _feed(self):
+        # Convert the document to Unicode.
+        self.builder.reset()
+
+        self.builder.feed(self.markup)
+        # Close out any unfinished strings and close all the open tags.
+        self.endData()
+        while self.currentTag.name != self.ROOT_TAG_NAME:
+            self.popTag()
+
+    def reset(self):
+        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
+        self.hidden = 1
+        self.builder.reset()
+        self.current_data = []
+        self.currentTag = None
+        self.tagStack = []
+        self.preserve_whitespace_tag_stack = []
+        self.pushTag(self)
+
+    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
+        """Create a new tag associated with this soup."""
+        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
+
+    def new_string(self, s, subclass=NavigableString):
+        """Create a new NavigableString associated with this soup."""
+        return subclass(s)
+
+    def insert_before(self, successor):
+        raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
+
+    def insert_after(self, successor):
+        raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
+
+    def popTag(self):
+        tag = self.tagStack.pop()
+        if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
+            self.preserve_whitespace_tag_stack.pop()
+        #print "Pop", tag.name
+        if self.tagStack:
+            self.currentTag = self.tagStack[-1]
+        return self.currentTag
+
+    def pushTag(self, tag):
+        #print "Push", tag.name
+        if self.currentTag:
+            self.currentTag.contents.append(tag)
+        self.tagStack.append(tag)
+        self.currentTag = self.tagStack[-1]
+        if tag.name in self.builder.preserve_whitespace_tags:
+            self.preserve_whitespace_tag_stack.append(tag)
+
+    def endData(self, containerClass=NavigableString):
+        if self.current_data:
+            current_data = u''.join(self.current_data)
+            # If whitespace is not preserved, and this string contains
+            # nothing but ASCII spaces, replace it with a single space
+            # or newline.
+            if not self.preserve_whitespace_tag_stack:
+                strippable = True
+                for i in current_data:
+                    if i not in self.ASCII_SPACES:
+                        strippable = False
+                        break
+                if strippable:
+                    if '\n' in current_data:
+                        current_data = '\n'
+                    else:
+                        current_data = ' '
+
+            # Reset the data collector.
+            self.current_data = []
+
+            # Should we add this string to the tree at all?
+            if self.parse_only and len(self.tagStack) <= 1 and \
+                   (not self.parse_only.text or \
+                    not self.parse_only.search(current_data)):
+                return
+
+            o = containerClass(current_data)
+            self.object_was_parsed(o)
+
+    def object_was_parsed(self, o, parent=None, most_recent_element=None):
+        """Add an object to the parse tree."""
+        parent = parent or self.currentTag
+        previous_element = most_recent_element or self._most_recent_element
+
+        next_element = previous_sibling = next_sibling = None
+        if isinstance(o, Tag):
+            next_element = o.next_element
+            next_sibling = o.next_sibling
+            previous_sibling = o.previous_sibling
+            if not previous_element:
+                previous_element = o.previous_element
+
+        o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
+
+        self._most_recent_element = o
+        parent.contents.append(o)
+
+        if parent.next_sibling:
+            # This node is being inserted into an element that has
+            # already been parsed. Deal with any dangling references.
+            index = len(parent.contents)-1
+            while index >= 0:
+                if parent.contents[index] is o:
+                    break
+                index -= 1
+            else:
+                raise ValueError(
+                    "Error building tree: supposedly %r was inserted "
+                    "into %r after the fact, but I don't see it!" % (
+                        o, parent
+                    )
+                )
+            if index == 0:
+                previous_element = parent
+                previous_sibling = None
+            else:
+                previous_element = previous_sibling = parent.contents[index-1]
+            if index == len(parent.contents)-1:
+                next_element = parent.next_sibling
+                next_sibling = None
+            else:
+                next_element = next_sibling = parent.contents[index+1]
+
+            o.previous_element = previous_element
+            if previous_element:
+                previous_element.next_element = o
+            o.next_element = next_element
+            if next_element:
+                next_element.previous_element = o
+            o.next_sibling = next_sibling
+            if next_sibling:
+                next_sibling.previous_sibling = o
+            o.previous_sibling = previous_sibling
+            if previous_sibling:
+                previous_sibling.next_sibling = o
+
+    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
+        """Pops the tag stack up to and including the most recent
+        instance of the given tag. If inclusivePop is false, pops the tag
+        stack up to but *not* including the most recent instqance of
+        the given tag."""
+        #print "Popping to %s" % name
+        if name == self.ROOT_TAG_NAME:
+            # The BeautifulSoup object itself can never be popped.
+            return
+
+        most_recently_popped = None
+
+        stack_size = len(self.tagStack)
+        for i in range(stack_size - 1, 0, -1):
+            t = self.tagStack[i]
+            if (name == t.name and nsprefix == t.prefix):
+                if inclusivePop:
+                    most_recently_popped = self.popTag()
+                break
+            most_recently_popped = self.popTag()
+
+        return most_recently_popped
+
+    def handle_starttag(self, name, namespace, nsprefix, attrs):
+        """Push a start tag on to the stack.
+
+        If this method returns None, the tag was rejected by the
+        SoupStrainer. You should proceed as if the tag had not occurred
+        in the document. For instance, if this was a self-closing tag,
+        don't call handle_endtag.
+        """
+
+        # print "Start tag %s: %s" % (name, attrs)
+        self.endData()
+
+        if (self.parse_only and len(self.tagStack) <= 1
+            and (self.parse_only.text
+                 or not self.parse_only.search_tag(name, attrs))):
+            return None
+
+        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
+                  self.currentTag, self._most_recent_element)
+        if tag is None:
+            return tag
+        if self._most_recent_element:
+            self._most_recent_element.next_element = tag
+        self._most_recent_element = tag
+        self.pushTag(tag)
+        return tag
+
+    def handle_endtag(self, name, nsprefix=None):
+        #print "End tag: " + name
+        self.endData()
+        self._popToTag(name, nsprefix)
+
+    def handle_data(self, data):
+        self.current_data.append(data)
+
+    def decode(self, pretty_print=False,
+               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
+               formatter="minimal"):
+        """Returns a string or Unicode representation of this document.
+        To get Unicode, pass None for encoding."""
+
+        if self.is_xml:
+            # Print the XML declaration
+            encoding_part = ''
+            if eventual_encoding != None:
+                encoding_part = ' encoding="%s"' % eventual_encoding
+            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
+        else:
+            prefix = u''
+        if not pretty_print:
+            indent_level = None
+        else:
+            indent_level = 0
+        return prefix + super(BeautifulSoup, self).decode(
+            indent_level, eventual_encoding, formatter)
+
+# Alias to make it easier to type import: 'from bs4 import _soup'
+_s = BeautifulSoup
+_soup = BeautifulSoup
+
+class BeautifulStoneSoup(BeautifulSoup):
+    """Deprecated interface to an XML parser."""
+
+    def __init__(self, *args, **kwargs):
+        kwargs['features'] = 'xml'
+        warnings.warn(
+            'The BeautifulStoneSoup class is deprecated. Instead of using '
+            'it, pass features="xml" into the BeautifulSoup constructor.')
+        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
+
+
+class StopParsing(Exception):
+    pass
+
+class FeatureNotFound(ValueError):
+    pass
+
+
+#By default, act as an HTML pretty-printer.
+if __name__ == '__main__':
+    import sys
+    soup = BeautifulSoup(sys.stdin)
+    print soup.prettify()
--- a/lib/bs4/builder/init.py
+++ b/lib/bs4/builder/init.py
@ -1,9 +1,13 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 from collections import defaultdict
 import itertools
 import sys
 from bs4.element import (
    CharsetMetaAttributeValue,
    ContentMetaAttributeValue,
+    HTMLAwareEntitySubstitution,
    whitespace_re
    )

@ -80,9 +84,12 @@ builder_registry = TreeBuilderRegistry()
 class TreeBuilder(object):
    """Turn a document into a Beautiful Soup object tree."""

+    NAME = "[Unknown tree builder]"
+    ALTERNATE_NAMES = []
    features = []

    is_xml = False
+    picklable = False
    preserve_whitespace_tags = set()
    empty_element_tags = None # A tag will be considered an empty-element
                              # tag when and only when it has no contents.
@ -147,16 +154,18 @@ class TreeBuilder(object):

        Modifies its input in place.
        """
+        if not attrs:
+            return attrs
        if self.cdata_list_attributes:
            universal = self.cdata_list_attributes.get('*', [])
            tag_specific = self.cdata_list_attributes.get(
-                tag_name.lower(), [])
-            for cdata_list_attr in itertools.chain(universal, tag_specific):
-                if cdata_list_attr in dict(attrs):
-                    # Basically, we have a "class" attribute whose
-                    # value is a whitespace-separated list of CSS
-                    # classes. Split it into a list.
-                    value = attrs[cdata_list_attr]
+                tag_name.lower(), None)
+            for attr in attrs.keys():
+                if attr in universal or (tag_specific and attr in tag_specific):
+                    # We have a "class"-type attribute whose string
+                    # value is a whitespace-separated list of
+                    # values. Split it into a list.
+                    value = attrs[attr]
                    if isinstance(value, basestring):
                        values = whitespace_re.split(value)
                    else:
@ -167,7 +176,7 @@ class TreeBuilder(object):
                        # leave the value alone rather than trying to
                        # split it again.
                        values = value
-                    attrs[cdata_list_attr] = values
+                    attrs[attr] = values
        return attrs

 class SAXTreeBuilder(TreeBuilder):
@ -222,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder):
    Such as which tags are empty-element tags.
    """

-    preserve_whitespace_tags = set(['pre', 'textarea'])
+    preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
    empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
                              'spacer', 'link', 'frame', 'base'])

@ -296,6 +305,9 @@ def register_treebuilders_from(module):
            # Register the builder while we're at it.
            this_module.builder_registry.register(obj)

+class ParserRejectedMarkup(Exception):
+    pass
+
 # Builders are registered in reverse order of priority, so that custom
 # builder registrations will take precedence. In general, we want lxml
 # to take precedence over html5lib, because it's faster. And we only
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@ -0,0 +1,356 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+__all__ = [
+    'HTML5TreeBuilder',
+    ]
+
+import warnings
+from bs4.builder import (
+    PERMISSIVE,
+    HTML,
+    HTML_5,
+    HTMLTreeBuilder,
+    )
+from bs4.element import (
+    NamespacedAttribute,
+    whitespace_re,
+)
+import html5lib
+from html5lib.constants import namespaces
+from bs4.element import (
+    Comment,
+    Doctype,
+    NavigableString,
+    Tag,
+    )
+
+try:
+    # Pre-0.99999999
+    from html5lib.treebuilders import _base as treebuilder_base
+    new_html5lib = False
+except ImportError, e:
+    # 0.99999999 and up
+    from html5lib.treebuilders import base as treebuilder_base
+    new_html5lib = True
+
+class HTML5TreeBuilder(HTMLTreeBuilder):
+    """Use html5lib to build a tree."""
+
+    NAME = "html5lib"
+
+    features = [NAME, PERMISSIVE, HTML_5, HTML]
+
+    def prepare_markup(self, markup, user_specified_encoding,
+                       document_declared_encoding=None, exclude_encodings=None):
+        # Store the user-specified encoding for use later on.
+        self.user_specified_encoding = user_specified_encoding
+
+        # document_declared_encoding and exclude_encodings aren't used
+        # ATM because the html5lib TreeBuilder doesn't use
+        # UnicodeDammit.
+        if exclude_encodings:
+            warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
+        yield (markup, None, None, False)
+
+    # These methods are defined by Beautiful Soup.
+    def feed(self, markup):
+        if self.soup.parse_only is not None:
+            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
+        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
+
+        extra_kwargs = dict()
+        if not isinstance(markup, unicode):
+            if new_html5lib:
+                extra_kwargs['override_encoding'] = self.user_specified_encoding
+            else:
+                extra_kwargs['encoding'] = self.user_specified_encoding
+        doc = parser.parse(markup, **extra_kwargs)
+
+        # Set the character encoding detected by the tokenizer.
+        if isinstance(markup, unicode):
+            # We need to special-case this because html5lib sets
+            # charEncoding to UTF-8 if it gets Unicode input.
+            doc.original_encoding = None
+        else:
+            original_encoding = parser.tokenizer.stream.charEncoding[0]
+            if not isinstance(original_encoding, basestring):
+                # In 0.99999999 and up, the encoding is an html5lib
+                # Encoding object. We want to use a string for compatibility
+                # with other tree builders.
+                original_encoding = original_encoding.name
+            doc.original_encoding = original_encoding
+
+    def create_treebuilder(self, namespaceHTMLElements):
+        self.underlying_builder = TreeBuilderForHtml5lib(
+            self.soup, namespaceHTMLElements)
+        return self.underlying_builder
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<html><head></head><body>%s</body></html>' % fragment
+
+
+class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
+
+    def __init__(self, soup, namespaceHTMLElements):
+        self.soup = soup
+        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
+
+    def documentClass(self):
+        self.soup.reset()
+        return Element(self.soup, self.soup, None)
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
+        self.soup.object_was_parsed(doctype)
+
+    def elementClass(self, name, namespace):
+        tag = self.soup.new_tag(name, namespace)
+        return Element(tag, self.soup, namespace)
+
+    def commentClass(self, data):
+        return TextNode(Comment(data), self.soup)
+
+    def fragmentClass(self):
+        self.soup = BeautifulSoup("")
+        self.soup.name = "[document_fragment]"
+        return Element(self.soup, self.soup, None)
+
+    def appendChild(self, node):
+        # XXX This code is not covered by the BS4 tests.
+        self.soup.append(node.element)
+
+    def getDocument(self):
+        return self.soup
+
+    def getFragment(self):
+        return treebuilder_base.TreeBuilder.getFragment(self).element
+
+class AttrList(object):
+    def __init__(self, element):
+        self.element = element
+        self.attrs = dict(self.element.attrs)
+    def __iter__(self):
+        return list(self.attrs.items()).__iter__()
+    def __setitem__(self, name, value):
+        # If this attribute is a multi-valued attribute for this element,
+        # turn its value into a list.
+        list_attr = HTML5TreeBuilder.cdata_list_attributes
+        if (name in list_attr['*']
+            or (self.element.name in list_attr
+                and name in list_attr[self.element.name])):
+            # A node that is being cloned may have already undergone
+            # this procedure.
+            if not isinstance(value, list):
+                value = whitespace_re.split(value)
+        self.element[name] = value
+    def items(self):
+        return list(self.attrs.items())
+    def keys(self):
+        return list(self.attrs.keys())
+    def __len__(self):
+        return len(self.attrs)
+    def __getitem__(self, name):
+        return self.attrs[name]
+    def __contains__(self, name):
+        return name in list(self.attrs.keys())
+
+
+class Element(treebuilder_base.Node):
+    def __init__(self, element, soup, namespace):
+        treebuilder_base.Node.__init__(self, element.name)
+        self.element = element
+        self.soup = soup
+        self.namespace = namespace
+
+    def appendChild(self, node):
+        string_child = child = None
+        if isinstance(node, basestring):
+            # Some other piece of code decided to pass in a string
+            # instead of creating a TextElement object to contain the
+            # string.
+            string_child = child = node
+        elif isinstance(node, Tag):
+            # Some other piece of code decided to pass in a Tag
+            # instead of creating an Element object to contain the
+            # Tag.
+            child = node
+        elif node.element.__class__ == NavigableString:
+            string_child = child = node.element
+        else:
+            child = node.element
+
+        if not isinstance(child, basestring) and child.parent is not None:
+            node.element.extract()
+
+        if (string_child and self.element.contents
+            and self.element.contents[-1].__class__ == NavigableString):
+            # We are appending a string onto another string.
+            # TODO This has O(n^2) performance, for input like
+            # "a</a>a</a>a</a>..."
+            old_element = self.element.contents[-1]
+            new_element = self.soup.new_string(old_element + string_child)
+            old_element.replace_with(new_element)
+            self.soup._most_recent_element = new_element
+        else:
+            if isinstance(node, basestring):
+                # Create a brand new NavigableString from this string.
+                child = self.soup.new_string(node)
+
+            # Tell Beautiful Soup to act as if it parsed this element
+            # immediately after the parent's last descendant. (Or
+            # immediately after the parent, if it has no children.)
+            if self.element.contents:
+                most_recent_element = self.element._last_descendant(False)
+            elif self.element.next_element is not None:
+                # Something from further ahead in the parse tree is
+                # being inserted into this earlier element. This is
+                # very annoying because it means an expensive search
+                # for the last element in the tree.
+                most_recent_element = self.soup._last_descendant()
+            else:
+                most_recent_element = self.element
+
+            self.soup.object_was_parsed(
+                child, parent=self.element,
+                most_recent_element=most_recent_element)
+
+    def getAttributes(self):
+        return AttrList(self.element)
+
+    def setAttributes(self, attributes):
+
+        if attributes is not None and len(attributes) > 0:
+
+            converted_attributes = []
+            for name, value in list(attributes.items()):
+                if isinstance(name, tuple):
+                    new_name = NamespacedAttribute(*name)
+                    del attributes[name]
+                    attributes[new_name] = value
+
+            self.soup.builder._replace_cdata_list_attribute_values(
+                self.name, attributes)
+            for name, value in attributes.items():
+                self.element[name] = value
+
+            # The attributes may contain variables that need substitution.
+            # Call set_up_substitutions manually.
+            #
+            # The Tag constructor called this method when the Tag was created,
+            # but we just set/changed the attributes, so call it again.
+            self.soup.builder.set_up_substitutions(self.element)
+    attributes = property(getAttributes, setAttributes)
+
+    def insertText(self, data, insertBefore=None):
+        if insertBefore:
+            text = TextNode(self.soup.new_string(data), self.soup)
+            self.insertBefore(data, insertBefore)
+        else:
+            self.appendChild(data)
+
+    def insertBefore(self, node, refNode):
+        index = self.element.index(refNode.element)
+        if (node.element.__class__ == NavigableString and self.element.contents
+            and self.element.contents[index-1].__class__ == NavigableString):
+            # (See comments in appendChild)
+            old_node = self.element.contents[index-1]
+            new_str = self.soup.new_string(old_node + node.element)
+            old_node.replace_with(new_str)
+        else:
+            self.element.insert(index, node.element)
+            node.parent = self
+
+    def removeChild(self, node):
+        node.element.extract()
+
+    def reparentChildren(self, new_parent):
+        """Move all of this tag's children into another tag."""
+        # print "MOVE", self.element.contents
+        # print "FROM", self.element
+        # print "TO", new_parent.element
+        element = self.element
+        new_parent_element = new_parent.element
+        # Determine what this tag's next_element will be once all the children
+        # are removed.
+        final_next_element = element.next_sibling
+
+        new_parents_last_descendant = new_parent_element._last_descendant(False, False)
+        if len(new_parent_element.contents) > 0:
+            # The new parent already contains children. We will be
+            # appending this tag's children to the end.
+            new_parents_last_child = new_parent_element.contents[-1]
+            new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
+        else:
+            # The new parent contains no children.
+            new_parents_last_child = None
+            new_parents_last_descendant_next_element = new_parent_element.next_element
+
+        to_append = element.contents
+        append_after = new_parent_element.contents
+        if len(to_append) > 0:
+            # Set the first child's previous_element and previous_sibling
+            # to elements within the new parent
+            first_child = to_append[0]
+            if new_parents_last_descendant:
+                first_child.previous_element = new_parents_last_descendant
+            else:
+                first_child.previous_element = new_parent_element
+            first_child.previous_sibling = new_parents_last_child
+            if new_parents_last_descendant:
+                new_parents_last_descendant.next_element = first_child
+            else:
+                new_parent_element.next_element = first_child
+            if new_parents_last_child:
+                new_parents_last_child.next_sibling = first_child
+
+            # Fix the last child's next_element and next_sibling
+            last_child = to_append[-1]
+            last_child.next_element = new_parents_last_descendant_next_element
+            if new_parents_last_descendant_next_element:
+                new_parents_last_descendant_next_element.previous_element = last_child
+            last_child.next_sibling = None
+
+        for child in to_append:
+            child.parent = new_parent_element
+            new_parent_element.contents.append(child)
+
+        # Now that this element has no children, change its .next_element.
+        element.contents = []
+        element.next_element = final_next_element
+
+        # print "DONE WITH MOVE"
+        # print "FROM", self.element
+        # print "TO", new_parent_element
+
+    def cloneNode(self):
+        tag = self.soup.new_tag(self.element.name, self.namespace)
+        node = Element(tag, self.soup, self.namespace)
+        for key,value in self.attributes:
+            node.attributes[key] = value
+        return node
+
+    def hasContent(self):
+        return self.element.contents
+
+    def getNameTuple(self):
+        if self.namespace == None:
+            return namespaces["html"], self.name
+        else:
+            return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
+class TextNode(Element):
+    def __init__(self, element, soup):
+        treebuilder_base.Node.__init__(self, None)
+        self.element = element
+        self.soup = soup
+
+    def cloneNode(self):
+        raise NotImplementedError
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@ -1,13 +1,22 @@
 """Use the HTMLParser library to parse HTML files that aren't too bad."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
 __all__ = [
    'HTMLParserTreeBuilder',
    ]

-from HTMLParser import (
-    HTMLParser,
-    HTMLParseError,
-    )
+from HTMLParser import HTMLParser
+
+try:
+    from HTMLParser import HTMLParseError
+except ImportError, e:
+    # HTMLParseError is removed in Python 3.5. Since it can never be
+    # thrown in 3.5, we can just define our own class as a placeholder.
+    class HTMLParseError(Exception):
+        pass
+
 import sys
 import warnings

@ -19,10 +28,10 @@ import warnings
 # At the end of this file, we monkeypatch HTMLParser so that
 # strict=True works well on Python 3.2.2.
 major, minor, release = sys.version_info[:3]
-CONSTRUCTOR_TAKES_STRICT = (
-    major > 3
-    or (major == 3 and minor > 2)
-    or (major == 3 and minor == 2 and release >= 3))
+CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
+CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
+CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
+

 from bs4.element import (
    CData,
@ -45,7 +54,15 @@ HTMLPARSER = 'html.parser'
 class BeautifulSoupHTMLParser(HTMLParser):
    def handle_starttag(self, name, attrs):
        # XXX namespace
-        self.soup.handle_starttag(name, None, None, dict(attrs))
+        attr_dict = {}
+        for key, value in attrs:
+            # Change None attribute values to the empty string
+            # for consistency with the other tree builders.
+            if value is None:
+                value = ''
+            attr_dict[key] = value
+            attrvalue = '""'
+        self.soup.handle_starttag(name, None, None, attr_dict)

    def handle_endtag(self, name):
        self.soup.handle_endtag(name)
@ -55,9 +72,12 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_charref(self, name):
        # XXX workaround for a bug in HTMLParser. Remove this once
-        # it's fixed.
+        # it's fixed in all supported versions.
+        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
+        elif name.startswith('X'):
+            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

@ -85,6 +105,9 @@ class BeautifulSoupHTMLParser(HTMLParser):
        self.soup.endData()
        if data.startswith("DOCTYPE "):
            data = data[len("DOCTYPE "):]
+        elif data == 'DOCTYPE':
+            # i.e. "<!DOCTYPE>"
+            data = ''
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

@ -100,14 +123,6 @@ class BeautifulSoupHTMLParser(HTMLParser):

    def handle_pi(self, data):
        self.soup.endData()
-        if data.endswith("?") and data.lower().startswith("xml"):
-            # "An XHTML processing instruction using the trailing '?'
-            # will cause the '?' to be included in data." - HTMLParser
-            # docs.
-            #
-            # Strip the question mark so we don't end up with two
-            # question marks.
-            data = data[:-1]
        self.soup.handle_data(data)
        self.soup.endData(ProcessingInstruction)

@ -115,28 +130,34 @@ class BeautifulSoupHTMLParser(HTMLParser):
 class HTMLParserTreeBuilder(HTMLTreeBuilder):

    is_xml = False
-    features = [HTML, STRICT, HTMLPARSER]
+    picklable = True
+    NAME = HTMLPARSER
+    features = [NAME, HTML, STRICT]

    def __init__(self, *args, **kwargs):
-        if CONSTRUCTOR_TAKES_STRICT:
+        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            kwargs['strict'] = False
+        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
+            kwargs['convert_charrefs'] = False
        self.parser_args = (args, kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
-                       document_declared_encoding=None):
+                       document_declared_encoding=None, exclude_encodings=None):
        """
        :return: A 4-tuple (markup, original encoding, encoding
        declared within markup, whether any characters had to be
        replaced with REPLACEMENT CHARACTER).
        """
        if isinstance(markup, unicode):
-            return markup, None, None, False
+            yield (markup, None, None, False)
+            return

        try_encodings = [user_specified_encoding, document_declared_encoding]
-        dammit = UnicodeDammit(markup, try_encodings, is_html=True)
-        return (dammit.markup, dammit.original_encoding,
-                dammit.declared_html_encoding,
-                dammit.contains_replacement_characters)
+        dammit = UnicodeDammit(markup, try_encodings, is_html=True,
+                               exclude_encodings=exclude_encodings)
+        yield (dammit.markup, dammit.original_encoding,
+               dammit.declared_html_encoding,
+               dammit.contains_replacement_characters)

    def feed(self, markup):
        args, kwargs = self.parser_args
--- a/lib/bs4/builder/_lxml.py
+++ b/lib/bs4/builder/_lxml.py
@ -0,0 +1,258 @@
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__all__ = [
+    'LXMLTreeBuilderForXML',
+    'LXMLTreeBuilder',
+    ]
+
+from io import BytesIO
+from StringIO import StringIO
+import collections
+from lxml import etree
+from bs4.element import (
+    Comment,
+    Doctype,
+    NamespacedAttribute,
+    ProcessingInstruction,
+    XMLProcessingInstruction,
+)
+from bs4.builder import (
+    FAST,
+    HTML,
+    HTMLTreeBuilder,
+    PERMISSIVE,
+    ParserRejectedMarkup,
+    TreeBuilder,
+    XML)
+from bs4.dammit import EncodingDetector
+
+LXML = 'lxml'
+
+class LXMLTreeBuilderForXML(TreeBuilder):
+    DEFAULT_PARSER_CLASS = etree.XMLParser
+
+    is_xml = True
+    processing_instruction_class = XMLProcessingInstruction
+
+    NAME = "lxml-xml"
+    ALTERNATE_NAMES = ["xml"]
+
+    # Well, it's permissive by XML parser standards.
+    features = [NAME, LXML, XML, FAST, PERMISSIVE]
+
+    CHUNK_SIZE = 512
+
+    # This namespace mapping is specified in the XML Namespace
+    # standard.
+    DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
+
+    def default_parser(self, encoding):
+        # This can either return a parser object or a class, which
+        # will be instantiated with default arguments.
+        if self._default_parser is not None:
+            return self._default_parser
+        return etree.XMLParser(
+            target=self, strip_cdata=False, recover=True, encoding=encoding)
+
+    def parser_for(self, encoding):
+        # Use the default parser.
+        parser = self.default_parser(encoding)
+
+        if isinstance(parser, collections.Callable):
+            # Instantiate the parser with default arguments
+            parser = parser(target=self, strip_cdata=False, encoding=encoding)
+        return parser
+
+    def __init__(self, parser=None, empty_element_tags=None):
+        # TODO: Issue a warning if parser is present but not a
+        # callable, since that means there's no way to create new
+        # parsers for different encodings.
+        self._default_parser = parser
+        if empty_element_tags is not None:
+            self.empty_element_tags = set(empty_element_tags)
+        self.soup = None
+        self.nsmaps = [self.DEFAULT_NSMAPS]
+
+    def _getNsTag(self, tag):
+        # Split the namespace URL out of a fully-qualified lxml tag
+        # name. Copied from lxml's src/lxml/sax.py.
+        if tag[0] == '{':
+            return tuple(tag[1:].split('}', 1))
+        else:
+            return (None, tag)
+
+    def prepare_markup(self, markup, user_specified_encoding=None,
+                       exclude_encodings=None,
+                       document_declared_encoding=None):
+        """
+        :yield: A series of 4-tuples.
+         (markup, encoding, declared encoding,
+          has undergone character replacement)
+
+        Each 4-tuple represents a strategy for parsing the document.
+        """
+        # Instead of using UnicodeDammit to convert the bytestring to
+        # Unicode using different encodings, use EncodingDetector to
+        # iterate over the encodings, and tell lxml to try to parse
+        # the document as each one in turn.
+        is_html = not self.is_xml
+        if is_html:
+            self.processing_instruction_class = ProcessingInstruction
+        else:
+            self.processing_instruction_class = XMLProcessingInstruction
+
+        if isinstance(markup, unicode):
+            # We were given Unicode. Maybe lxml can parse Unicode on
+            # this system?
+            yield markup, None, document_declared_encoding, False
+
+        if isinstance(markup, unicode):
+            # No, apparently not. Convert the Unicode to UTF-8 and
+            # tell lxml to parse it as UTF-8.
+            yield (markup.encode("utf8"), "utf8",
+                   document_declared_encoding, False)
+
+        try_encodings = [user_specified_encoding, document_declared_encoding]
+        detector = EncodingDetector(
+            markup, try_encodings, is_html, exclude_encodings)
+        for encoding in detector.encodings:
+            yield (detector.markup, encoding, document_declared_encoding, False)
+
+    def feed(self, markup):
+        if isinstance(markup, bytes):
+            markup = BytesIO(markup)
+        elif isinstance(markup, unicode):
+            markup = StringIO(markup)
+
+        # Call feed() at least once, even if the markup is empty,
+        # or the parser won't be initialized.
+        data = markup.read(self.CHUNK_SIZE)
+        try:
+            self.parser = self.parser_for(self.soup.original_encoding)
+            self.parser.feed(data)
+            while len(data) != 0:
+                # Now call feed() on the rest of the data, chunk by chunk.
+                data = markup.read(self.CHUNK_SIZE)
+                if len(data) != 0:
+                    self.parser.feed(data)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+            raise ParserRejectedMarkup(str(e))
+
+    def close(self):
+        self.nsmaps = [self.DEFAULT_NSMAPS]
+
+    def start(self, name, attrs, nsmap={}):
+        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
+        attrs = dict(attrs)
+        nsprefix = None
+        # Invert each namespace map as it comes in.
+        if len(self.nsmaps) > 1:
+            # There are no new namespaces for this tag, but
+            # non-default namespaces are in play, so we need a
+            # separate tag stack to know when they end.
+            self.nsmaps.append(None)
+        elif len(nsmap) > 0:
+            # A new namespace mapping has come into play.
+            inverted_nsmap = dict((value, key) for key, value in nsmap.items())
+            self.nsmaps.append(inverted_nsmap)
+            # Also treat the namespace mapping as a set of attributes on the
+            # tag, so we can recreate it later.
+            attrs = attrs.copy()
+            for prefix, namespace in nsmap.items():
+                attribute = NamespacedAttribute(
+                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
+                attrs[attribute] = namespace
+
+        # Namespaces are in play. Find any attributes that came in
+        # from lxml with namespaces attached to their names, and
+        # turn then into NamespacedAttribute objects.
+        new_attrs = {}
+        for attr, value in attrs.items():
+            namespace, attr = self._getNsTag(attr)
+            if namespace is None:
+                new_attrs[attr] = value
+            else:
+                nsprefix = self._prefix_for_namespace(namespace)
+                attr = NamespacedAttribute(nsprefix, attr, namespace)
+                new_attrs[attr] = value
+        attrs = new_attrs
+
+        namespace, name = self._getNsTag(name)
+        nsprefix = self._prefix_for_namespace(namespace)
+        self.soup.handle_starttag(name, namespace, nsprefix, attrs)
+
+    def _prefix_for_namespace(self, namespace):
+        """Find the currently active prefix for the given namespace."""
+        if namespace is None:
+            return None
+        for inverted_nsmap in reversed(self.nsmaps):
+            if inverted_nsmap is not None and namespace in inverted_nsmap:
+                return inverted_nsmap[namespace]
+        return None
+
+    def end(self, name):
+        self.soup.endData()
+        completed_tag = self.soup.tagStack[-1]
+        namespace, name = self._getNsTag(name)
+        nsprefix = None
+        if namespace is not None:
+            for inverted_nsmap in reversed(self.nsmaps):
+                if inverted_nsmap is not None and namespace in inverted_nsmap:
+                    nsprefix = inverted_nsmap[namespace]
+                    break
+        self.soup.handle_endtag(name, nsprefix)
+        if len(self.nsmaps) > 1:
+            # This tag, or one of its parents, introduced a namespace
+            # mapping, so pop it off the stack.
+            self.nsmaps.pop()
+
+    def pi(self, target, data):
+        self.soup.endData()
+        self.soup.handle_data(target + ' ' + data)
+        self.soup.endData(self.processing_instruction_class)
+
+    def data(self, content):
+        self.soup.handle_data(content)
+
+    def doctype(self, name, pubid, system):
+        self.soup.endData()
+        doctype = Doctype.for_name_and_ids(name, pubid, system)
+        self.soup.object_was_parsed(doctype)
+
+    def comment(self, content):
+        "Handle comments as Comment objects."
+        self.soup.endData()
+        self.soup.handle_data(content)
+        self.soup.endData(Comment)
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
+
+
+class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
+
+    NAME = LXML
+    ALTERNATE_NAMES = ["lxml-html"]
+
+    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
+    is_xml = False
+    processing_instruction_class = ProcessingInstruction
+
+    def default_parser(self, encoding):
+        return etree.HTMLParser
+
+    def feed(self, markup):
+        encoding = self.soup.original_encoding
+        try:
+            self.parser = self.parser_for(encoding)
+            self.parser.feed(markup)
+            self.parser.close()
+        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+            raise ParserRejectedMarkup(str(e))
+
+
+    def test_fragment_to_document(self, fragment):
+        """See `TreeBuilder`."""
+        return u'<html><body>%s</body></html>' % fragment
--- a/lib/bs4/dammit.py
+++ b/lib/bs4/dammit.py
@ -1,27 +1,43 @@
 # -*- coding: utf-8 -*-
 """Beautiful Soup bonus library: Unicode, Dammit

-This class forces XML data into a standard format (usually to UTF-8 or
-Unicode).  It is heavily based on code from Mark Pilgrim's Universal
-Feed Parser. It does not rewrite the XML or HTML to reflect a new
-encoding; that's the tree builder's job.
+This library converts a bytestream to Unicode through any means
+necessary. It is heavily based on code from Mark Pilgrim's Universal
+Feed Parser. It works best on XML and HTML, but it does not rewrite the
+XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"

 import codecs
 from htmlentitydefs import codepoint2name
 import re
-import warnings
+import logging
+import string

-# Autodetects character encodings. Very useful.
-# Download from http://chardet.feedparser.org/
-#  or 'apt-get install python-chardet'
-#  or 'easy_install chardet'
+# Import a library to autodetect character encodings.
+chardet_type = None
 try:
-    import chardet
-    #import chardet.constants
-    #chardet.constants._debug = 1
+    # First try the fast C implementation.
+    #  PyPI package: cchardet
+    import cchardet
+    def chardet_dammit(s):
+        return cchardet.detect(s)['encoding']
 except ImportError:
-    chardet = None
+    try:
+        # Fall back to the pure Python implementation
+        #  Debian package: python-chardet
+        #  PyPI package: chardet
+        import chardet
+        def chardet_dammit(s):
+            return chardet.detect(s)['encoding']
+        #import chardet.constants
+        #chardet.constants._debug = 1
+    except ImportError:
+        # No chardet available.
+        def chardet_dammit(s):
+            return None

 # Available from http://cjkpython.i18n.org/.
 try:
@ -69,6 +85,8 @@ class EntitySubstitution(object):
                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
                                           ")")

+    AMPERSAND_OR_BRACKET = re.compile("([<>&])")
+
    @classmethod
    def _substitute_html_entity(cls, matchobj):
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
@ -122,6 +140,28 @@ class EntitySubstitution(object):
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

+        :param value: A string to be substituted. The less-than sign
+          will become &lt;, the greater-than sign will become &gt;,
+          and any ampersands will become &amp;. If you want ampersands
+          that appear to be part of an entity definition to be left
+          alone, use substitute_xml_containing_entities() instead.
+
+        :param make_quoted_attribute: If True, then the string will be
+         quoted, as befits an attribute value.
+        """
+        # Escape angle brackets and ampersands.
+        value = cls.AMPERSAND_OR_BRACKET.sub(
+            cls._substitute_xml_entity, value)
+
+        if make_quoted_attribute:
+            value = cls.quoted_attribute_value(value)
+        return value
+
+    @classmethod
+    def substitute_xml_containing_entities(
+        cls, value, make_quoted_attribute=False):
+        """Substitute XML entities for special XML characters.
+
        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity defition will
@ -155,6 +195,133 @@ class EntitySubstitution(object):
            cls._substitute_html_entity, s)


+class EncodingDetector:
+    """Suggests a number of possible encodings for a bytestring.
+
+    Order of precedence:
+
+    1. Encodings you specifically tell EncodingDetector to try first
+    (the override_encodings argument to the constructor).
+
+    2. An encoding declared within the bytestring itself, either in an
+    XML declaration (if the bytestring is to be interpreted as an XML
+    document), or in a <meta> tag (if the bytestring is to be
+    interpreted as an HTML document.)
+
+    3. An encoding detected through textual analysis by chardet,
+    cchardet, or a similar external library.
+
+    4. UTF-8.
+
+    5. Windows-1252.
+    """
+    def __init__(self, markup, override_encodings=None, is_html=False,
+                 exclude_encodings=None):
+        self.override_encodings = override_encodings or []
+        exclude_encodings = exclude_encodings or []
+        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
+        self.chardet_encoding = None
+        self.is_html = is_html
+        self.declared_encoding = None
+
+        # First order of business: strip a byte-order mark.
+        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
+
+    def _usable(self, encoding, tried):
+        if encoding is not None:
+            encoding = encoding.lower()
+            if encoding in self.exclude_encodings:
+                return False
+            if encoding not in tried:
+                tried.add(encoding)
+                return True
+        return False
+
+    @property
+    def encodings(self):
+        """Yield a number of encodings that might work for this markup."""
+        tried = set()
+        for e in self.override_encodings:
+            if self._usable(e, tried):
+                yield e
+
+        # Did the document originally start with a byte-order mark
+        # that indicated its encoding?
+        if self._usable(self.sniffed_encoding, tried):
+            yield self.sniffed_encoding
+
+        # Look within the document for an XML or HTML encoding
+        # declaration.
+        if self.declared_encoding is None:
+            self.declared_encoding = self.find_declared_encoding(
+                self.markup, self.is_html)
+        if self._usable(self.declared_encoding, tried):
+            yield self.declared_encoding
+
+        # Use third-party character set detection to guess at the
+        # encoding.
+        if self.chardet_encoding is None:
+            self.chardet_encoding = chardet_dammit(self.markup)
+        if self._usable(self.chardet_encoding, tried):
+            yield self.chardet_encoding
+
+        # As a last-ditch effort, try utf-8 and windows-1252.
+        for e in ('utf-8', 'windows-1252'):
+            if self._usable(e, tried):
+                yield e
+
+    @classmethod
+    def strip_byte_order_mark(cls, data):
+        """If a byte-order mark is present, strip it and return the encoding it implies."""
+        encoding = None
+        if isinstance(data, unicode):
+            # Unicode data cannot have a byte-order mark.
+            return data, encoding
+        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
+               and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16be'
+            data = data[2:]
+        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
+                 and (data[2:4] != '\x00\x00'):
+            encoding = 'utf-16le'
+            data = data[2:]
+        elif data[:3] == b'\xef\xbb\xbf':
+            encoding = 'utf-8'
+            data = data[3:]
+        elif data[:4] == b'\x00\x00\xfe\xff':
+            encoding = 'utf-32be'
+            data = data[4:]
+        elif data[:4] == b'\xff\xfe\x00\x00':
+            encoding = 'utf-32le'
+            data = data[4:]
+        return data, encoding
+
+    @classmethod
+    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
+        """Given a document, tries to find its declared encoding.
+
+        An XML encoding is declared at the beginning of the document.
+
+        An HTML encoding is declared in a <meta> tag, hopefully near the
+        beginning of the document.
+        """
+        if search_entire_document:
+            xml_endpos = html_endpos = len(markup)
+        else:
+            xml_endpos = 1024
+            html_endpos = max(2048, int(len(markup) * 0.05))
+            
+        declared_encoding = None
+        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
+        if not declared_encoding_match and is_html:
+            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
+        if declared_encoding_match is not None:
+            declared_encoding = declared_encoding_match.groups()[0].decode(
+                'ascii', 'replace')
+        if declared_encoding:
+            return declared_encoding.lower()
+        return None
+
 class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
@ -175,66 +342,51 @@ class UnicodeDammit:
        ]

    def __init__(self, markup, override_encodings=[],
-                 smart_quotes_to=None, is_html=False):
-        self.declared_html_encoding = None
+                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
+        self.is_html = is_html
+        self.log = logging.getLogger(__name__)
+        self.detector = EncodingDetector(
+            markup, override_encodings, is_html, exclude_encodings)

-        if markup == '' or isinstance(markup, unicode):
+        # Short-circuit if the data is in Unicode to begin with.
+        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

-        new_markup, document_encoding, sniffed_encoding = \
-            self._detectEncoding(markup, is_html)
-        self.markup = new_markup
+        # The encoding detector may have stripped a byte-order mark.
+        # Use the stripped markup from this point on.
+        self.markup = self.detector.markup

        u = None
-        if new_markup != markup:
-            # _detectEncoding modified the markup, then converted it to
-            # Unicode and then to UTF-8. So convert it from UTF-8.
-            u = self._convert_from("utf8")
-            self.original_encoding = sniffed_encoding
+        for encoding in self.detector.encodings:
+            markup = self.detector.markup
+            u = self._convert_from(encoding)
+            if u is not None:
+                break

        if not u:
-            for proposed_encoding in (
-                override_encodings + [document_encoding, sniffed_encoding]):
-                if proposed_encoding is not None:
-                    u = self._convert_from(proposed_encoding)
-                    if u:
-                        break
+            # None of the encodings worked. As an absolute last resort,
+            # try them again with character replacement.

-        # If no luck and we have auto-detection library, try that:
-        if not u and chardet and not isinstance(self.markup, unicode):
-            u = self._convert_from(chardet.detect(self.markup)['encoding'])
-
-        # As a last resort, try utf-8 and windows-1252:
-        if not u:
-            for proposed_encoding in ("utf-8", "windows-1252"):
-                u = self._convert_from(proposed_encoding)
-                if u:
-                    break
-
-        # As an absolute last resort, try the encodings again with
-        # character replacement.
-        if not u:
-            for proposed_encoding in (
-                override_encodings + [
-                    document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
-                if proposed_encoding != "ascii":
-                    u = self._convert_from(proposed_encoding, "replace")
+            for encoding in self.detector.encodings:
+                if encoding != "ascii":
+                    u = self._convert_from(encoding, "replace")
                if u is not None:
-                    warnings.warn(
-                        UnicodeWarning(
+                    self.log.warning(
                            "Some characters could not be decoded, and were "
-                            "replaced with REPLACEMENT CHARACTER."))
+                            "replaced with REPLACEMENT CHARACTER."
+                    )
                    self.contains_replacement_characters = True
                    break

-        # We could at this point force it to ASCII, but that would
-        # destroy so much data that I think giving up is better
+        # If none of that worked, we could at this point force it to
+        # ASCII, but that would destroy so much data that I think
+        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None
@ -262,11 +414,10 @@ class UnicodeDammit:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
-
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
-            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
@ -287,99 +438,24 @@ class UnicodeDammit:
    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
+        return unicode(data, encoding, errors)

-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding, errors)
-        return newdata
-
-    def _detectEncoding(self, xml_data, is_html=False):
-        """Given a document, tries to detect its XML encoding."""
-        xml_encoding = sniffed_xml_encoding = None
-        try:
-            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
-                # EBCDIC
-                xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
-                # UTF-16BE
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
-                     and (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16BE with BOM
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
-                # UTF-16LE
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
-                     (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16LE with BOM
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\x00\x3c':
-                # UTF-32BE
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x00\x00':
-                # UTF-32LE
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\xfe\xff':
-                # UTF-32BE with BOM
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\xff\xfe\x00\x00':
-                # UTF-32LE with BOM
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == b'\xef\xbb\xbf':
-                # UTF-8 with BOM
-                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-            else:
-                sniffed_xml_encoding = 'ascii'
-                pass
-        except:
-            xml_encoding_match = None
-        xml_encoding_match = xml_encoding_re.match(xml_data)
-        if not xml_encoding_match and is_html:
-            xml_encoding_match = html_meta_re.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if is_html:
-                self.declared_html_encoding = xml_encoding
-            if sniffed_xml_encoding and \
-               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
-                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
-                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
-                                 'utf16', 'u16')):
-                xml_encoding = sniffed_xml_encoding
-        return xml_data, xml_encoding, sniffed_xml_encoding
+    @property
+    def declared_html_encoding(self):
+        if not self.is_html:
+            return None
+        return self.detector.declared_encoding

    def find_codec(self, charset):
-        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
-               or (charset and self._codec(charset.replace("-", ""))) \
-               or (charset and self._codec(charset.replace("-", "_"))) \
+        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+               or (charset and self._codec(charset.replace("-", "")))
+               or (charset and self._codec(charset.replace("-", "_")))
+               or (charset and charset.lower())
               or charset
+                )
+        if value:
+            return value.lower()
+        return None

    def _codec(self, charset):
        if not charset:
@ -392,32 +468,6 @@ class UnicodeDammit:
            pass
        return codec

-    EBCDIC_TO_ASCII_MAP = None
-
-    def _ebcdic_to_ascii(self, s):
-        c = self.__class__
-        if not c.EBCDIC_TO_ASCII_MAP:
-            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
-                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
-                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
-                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
-                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
-                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
-                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
-                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
-                    250,251,252,253,254,255)
-            import string
-            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
-            ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
-        return s.translate(c.EBCDIC_TO_ASCII_MAP)

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
--- a/lib/bs4/diagnose.py
+++ b/lib/bs4/diagnose.py
@ -0,0 +1,219 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"
+
+import cProfile
+from StringIO import StringIO
+from HTMLParser import HTMLParser
+import bs4
+from bs4 import BeautifulSoup, __version__
+from bs4.builder import builder_registry
+
+import os
+import pstats
+import random
+import tempfile
+import time
+import traceback
+import sys
+import cProfile
+
+def diagnose(data):
+    """Diagnostic suite for isolating common problems."""
+    print "Diagnostic running on Beautiful Soup %s" % __version__
+    print "Python version %s" % sys.version
+
+    basic_parsers = ["html.parser", "html5lib", "lxml"]
+    for name in basic_parsers:
+        for builder in builder_registry.builders:
+            if name in builder.features:
+                break
+        else:
+            basic_parsers.remove(name)
+            print (
+                "I noticed that %s is not installed. Installing it may help." %
+                name)
+
+    if 'lxml' in basic_parsers:
+        basic_parsers.append(["lxml", "xml"])
+        try:
+            from lxml import etree
+            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+        except ImportError, e:
+            print (
+                "lxml is not installed or couldn't be imported.")
+
+
+    if 'html5lib' in basic_parsers:
+        try:
+            import html5lib
+            print "Found html5lib version %s" % html5lib.__version__
+        except ImportError, e:
+            print (
+                "html5lib is not installed or couldn't be imported.")
+
+    if hasattr(data, 'read'):
+        data = data.read()
+    elif os.path.exists(data):
+        print '"%s" looks like a filename. Reading data from the file.' % data
+        with open(data) as fp:
+            data = fp.read()
+    elif data.startswith("http:") or data.startswith("https:"):
+        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
+        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+        return
+    print
+
+    for parser in basic_parsers:
+        print "Trying to parse your markup with %s" % parser
+        success = False
+        try:
+            soup = BeautifulSoup(data, parser)
+            success = True
+        except Exception, e:
+            print "%s could not parse the markup." % parser
+            traceback.print_exc()
+        if success:
+            print "Here's what %s did with the markup:" % parser
+            print soup.prettify()
+
+        print "-" * 80
+
+def lxml_trace(data, html=True, **kwargs):
+    """Print out the lxml events that occur during parsing.
+
+    This lets you see how lxml parses a document when no Beautiful
+    Soup code is running.
+    """
+    from lxml import etree
+    for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
+        print("%s, %4s, %s" % (event, element.tag, element.text))
+
+class AnnouncingParser(HTMLParser):
+    """Announces HTMLParser parse events, without doing anything else."""
+
+    def _p(self, s):
+        print(s)
+
+    def handle_starttag(self, name, attrs):
+        self._p("%s START" % name)
+
+    def handle_endtag(self, name):
+        self._p("%s END" % name)
+
+    def handle_data(self, data):
+        self._p("%s DATA" % data)
+
+    def handle_charref(self, name):
+        self._p("%s CHARREF" % name)
+
+    def handle_entityref(self, name):
+        self._p("%s ENTITYREF" % name)
+
+    def handle_comment(self, data):
+        self._p("%s COMMENT" % data)
+
+    def handle_decl(self, data):
+        self._p("%s DECL" % data)
+
+    def unknown_decl(self, data):
+        self._p("%s UNKNOWN-DECL" % data)
+
+    def handle_pi(self, data):
+        self._p("%s PI" % data)
+
+def htmlparser_trace(data):
+    """Print out the HTMLParser events that occur during parsing.
+
+    This lets you see how HTMLParser parses a document when no
+    Beautiful Soup code is running.
+    """
+    parser = AnnouncingParser()
+    parser.feed(data)
+
+_vowels = "aeiou"
+_consonants = "bcdfghjklmnpqrstvwxyz"
+
+def rword(length=5):
+    "Generate a random word-like string."
+    s = ''
+    for i in range(length):
+        if i % 2 == 0:
+            t = _consonants
+        else:
+            t = _vowels
+        s += random.choice(t)
+    return s
+
+def rsentence(length=4):
+    "Generate a random sentence-like string."
+    return " ".join(rword(random.randint(4,9)) for i in range(length))
+        
+def rdoc(num_elements=1000):
+    """Randomly generate an invalid HTML document."""
+    tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
+    elements = []
+    for i in range(num_elements):
+        choice = random.randint(0,3)
+        if choice == 0:
+            # New tag.
+            tag_name = random.choice(tag_names)
+            elements.append("<%s>" % tag_name)
+        elif choice == 1:
+            elements.append(rsentence(random.randint(1,4)))
+        elif choice == 2:
+            # Close a tag.
+            tag_name = random.choice(tag_names)
+            elements.append("</%s>" % tag_name)
+    return "<html>" + "\n".join(elements) + "</html>"
+
+def benchmark_parsers(num_elements=100000):
+    """Very basic head-to-head performance benchmark."""
+    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+    data = rdoc(num_elements)
+    print "Generated a large invalid HTML document (%d bytes)." % len(data)
+    
+    for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
+        success = False
+        try:
+            a = time.time()
+            soup = BeautifulSoup(data, parser)
+            b = time.time()
+            success = True
+        except Exception, e:
+            print "%s could not parse the markup." % parser
+            traceback.print_exc()
+        if success:
+            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+
+    from lxml import etree
+    a = time.time()
+    etree.HTML(data)
+    b = time.time()
+    print "Raw lxml parsed the markup in %.2fs." % (b-a)
+
+    import html5lib
+    parser = html5lib.HTMLParser()
+    a = time.time()
+    parser.parse(data)
+    b = time.time()
+    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+
+def profile(num_elements=100000, parser="lxml"):
+
+    filehandle = tempfile.NamedTemporaryFile()
+    filename = filehandle.name
+
+    data = rdoc(num_elements)
+    vars = dict(bs4=bs4, data=data, parser=parser)
+    cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
+
+    stats = pstats.Stats(filename)
+    # stats.strip_dirs()
+    stats.sort_stats("cumulative")
+    stats.print_stats('_html5lib|bs4', 50)
+
+if __name__ == '__main__':
+    diagnose(sys.stdin.read())
--- a/lib/bs4/element.py
+++ b/lib/bs4/element.py
--- a/lib/bs4/testing.py
+++ b/lib/bs4/testing.py
@ -1,5 +1,10 @@
 """Helper classes for tests."""

+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT"
+
+import pickle
 import copy
 import functools
 import unittest
@ -43,6 +48,16 @@ class SoupTest(unittest.TestCase):

        self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))

+    def assertConnectedness(self, element):
+        """Ensure that next_element and previous_element are properly
+        set for all descendants of the given element.
+        """
+        earlier = None
+        for e in element.descendants:
+            if earlier:
+                self.assertEqual(e, earlier.next_element)
+                self.assertEqual(earlier, e.previous_element)
+            earlier = e

 class HTMLTreeBuilderSmokeTest(object):

@ -54,6 +69,15 @@ class HTMLTreeBuilderSmokeTest(object):
    markup in these tests, there's not much room for interpretation.
    """

+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
    def assertDoctypeHandled(self, doctype_fragment):
        """Assert that a given doctype string is handled correctly."""
        doctype_str, soup = self._document_with_doctype(doctype_fragment)
@ -81,6 +105,11 @@ class HTMLTreeBuilderSmokeTest(object):
        self.assertDoctypeHandled(
            'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')

+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        self.assertEqual("", doctype.strip())
+
    def test_public_doctype_with_url(self):
        doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
        self.assertDoctypeHandled(doctype)
@ -109,6 +138,19 @@ class HTMLTreeBuilderSmokeTest(object):
            soup.encode("utf-8").replace(b"\n", b""),
            markup.replace(b"\n", b""))

+    def test_processing_instruction(self):
+        # We test both Unicode and bytestring to verify that
+        # process_markup correctly sets processing_instruction_class
+        # even when the markup is already Unicode and there is no
+        # need to process anything.
+        markup = u"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.decode())
+
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
    def test_deepcopy(self):
        """Make sure you can copy the tree builder.

@ -150,6 +192,23 @@ class HTMLTreeBuilderSmokeTest(object):
    def test_nested_formatting_elements(self):
        self.assertSoupEquals("<em><em></em></em>")

+    def test_double_head(self):
+        html = '''<!DOCTYPE html>
+<html>
+<head>
+<title>Ordinary HEAD element test</title>
+</head>
+<script type="text/javascript">
+alert("Help!");
+</script>
+<body>
+Hello, world!
+</body>
+</html>
+'''
+        soup = self.soup(html)
+        self.assertEqual("text/javascript", soup.find('script')['type'])
+
    def test_comment(self):
        # Comments are represented as Comment objects.
        markup = "<p>foo<!--foobar-->baz</p>"
@ -159,10 +218,29 @@ class HTMLTreeBuilderSmokeTest(object):
        comment = soup.find(text="foobar")
        self.assertEqual(comment.__class__, Comment)

+        # The comment is properly integrated into the tree.
+        foo = soup.find(text="foo")
+        self.assertEqual(comment, foo.next_element)
+        baz = soup.find(text="baz")
+        self.assertEqual(comment, baz.previous_element)
+
    def test_preserved_whitespace_in_pre_and_textarea(self):
-        """Whitespace must be preserved in <pre> and <textarea> tags."""
-        self.assertSoupEquals("<pre>   </pre>")
-        self.assertSoupEquals("<textarea> woo  </textarea>")
+        """Whitespace must be preserved in <pre> and <textarea> tags,
+        even if that would mean not prettifying the markup.
+        """
+        pre_markup = "<pre>   </pre>"
+        textarea_markup = "<textarea> woo\nwoo  </textarea>"
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("<textarea></textarea>")
+        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")

    def test_nested_inline_elements(self):
        """Inline elements can be nested indefinitely."""
@ -210,6 +288,14 @@ class HTMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(["css"], soup.div.div['class'])

+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes ot the
+        # <html> tag. This has caused problems with multivalued
+        # attributes.
+        markup = '<html class="a b"></html>'
+        soup = self.soup(markup)
+        self.assertEqual(["a", "b"], soup.html['class'])
+
    def test_angle_brackets_in_attribute_values_are_escaped(self):
        self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')

@ -217,12 +303,14 @@ class HTMLTreeBuilderSmokeTest(object):
        expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
        self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
        self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)

    def test_entities_in_text_converted_to_unicode(self):
        expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
        self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
        self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
        self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)

    def test_quot_entity_converted_to_quotation_mark(self):
@ -235,6 +323,41 @@ class HTMLTreeBuilderSmokeTest(object):
        self.assertSoupEquals("&#x10000000000000;", expect)
        self.assertSoupEquals("&#1000000000;", expect)

+    def test_multipart_strings(self):
+        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+        self.assertEqual("p", soup.h2.string.next_element.name)
+        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
+
+    def test_head_tag_between_head_and_body(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<html><head></head>
+  <link></link>
+  <body>foo</body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertNotEqual(None, soup.html.body)
+        self.assertConnectedness(soup)
+
+    def test_multiple_copies_of_a_tag(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<!DOCTYPE html>
+<html>
+ <body>
+   <article id="a" >
+   <div><a href="1"></div>
+   <footer>
+     <a href="2"></a>
+   </footer>
+  </article>
+  </body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertConnectedness(soup.article)
+
    def test_basic_namespaces(self):
        """Parsers don't need to *understand* namespaces, but at the
        very least they should not choke on namespaces or lose
@ -262,6 +385,14 @@ class HTMLTreeBuilderSmokeTest(object):
    # to detect any differences between them.
    #

+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the  declaration! The horror!
+        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
    def test_soupstrainer(self):
        """Parsers should be able to work with SoupStrainers."""
        strainer = SoupStrainer("b")
@ -372,7 +503,9 @@ class HTMLTreeBuilderSmokeTest(object):
        hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
        soup = self.soup(
            hebrew_document, from_encoding="iso8859-8")
-        self.assertEqual(soup.original_encoding, 'iso8859-8')
+        # Some tree builders call it iso8859-8, others call it iso-8859-9.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
        self.assertEqual(
            soup.encode('utf-8'),
            hebrew_document.decode("iso8859-8").encode("utf-8"))
@ -436,11 +569,30 @@ class HTMLTreeBuilderSmokeTest(object):

 class XMLTreeBuilderSmokeTest(object):

+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
    def test_docstring_generated(self):
        soup = self.soup("<root/>")
        self.assertEqual(
            soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')

+    def test_xml_declaration(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
+    def test_processing_instruction(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
    def test_real_xhtml_document(self):
        """A real XHTML document should come out *exactly* the same as it went in."""
        markup = b"""<?xml version="1.0" encoding="utf-8"?>
@ -453,6 +605,23 @@ class XMLTreeBuilderSmokeTest(object):
        self.assertEqual(
            soup.encode("utf-8"), markup)

+    def test_formatter_processes_script_tag_for_xml_documents(self):
+        doc = """
+  <script type="text/javascript">
+  </script>
+"""
+        soup = BeautifulSoup(doc, "lxml-xml")
+        # lxml would have stripped this while parsing, but we can add
+        # it later.
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+    def test_can_parse_unicode_document(self):
+        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
    def test_popping_namespaced_tag(self):
        markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
        soup = self.soup(markup)
@ -490,6 +659,16 @@ class XMLTreeBuilderSmokeTest(object):
        soup = self.soup(markup)
        self.assertEqual(unicode(soup.p), markup)

+    def test_namespaced_attributes(self):
+        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+    def test_namespaced_attributes_xml_namespace(self):
+        markup = '<foo xml:lang="fr">bar</foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
    """Smoke test for a tree builder that supports HTML5."""

@ -518,6 +697,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
        self.assertEqual(namespace, soup.math.namespace)
        self.assertEqual(namespace, soup.msqrt.namespace)

+    def test_xml_declaration_becomes_comment(self):
+        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
+        soup = self.soup(markup)
+        self.assertTrue(isinstance(soup.contents[0], Comment))
+        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", soup.contents[0].next_element.name)

 def skipIf(condition, reason):
   def nothing(test, *args, **kwargs):
--- a/lib/bs4/tests/init.py
+++ b/lib/bs4/tests/init.py
--- a/lib/bs4/tests/test_builder_registry.py
+++ b/lib/bs4/tests/test_builder_registry.py
@ -1,6 +1,7 @@
 """Tests of the builder registry."""

 import unittest
+import warnings

 from bs4 import BeautifulSoup
 from bs4.builder import (
@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
                          HTMLParserTreeBuilder)

    def test_beautifulsoup_constructor_does_lookup(self):
-        # You can pass in a string.
-        BeautifulSoup("", features="html")
-        # Or a list of strings.
-        BeautifulSoup("", features=["html", "fast"])
+
+        with warnings.catch_warnings(record=True) as w:
+            # This will create a warning about not explicitly
+            # specifying a parser, but we'll ignore it.
+
+            # You can pass in a string.
+            BeautifulSoup("", features="html")
+            # Or a list of strings.
+            BeautifulSoup("", features=["html", "fast"])

        # You'll get an exception if BS can't find an appropriate
        # builder.
--- a/lib/bs4/tests/test_docs.py
+++ b/lib/bs4/tests/test_docs.py
--- a/lib/bs4/tests/test_html5lib.py
+++ b/lib/bs4/tests/test_html5lib.py
@ -0,0 +1,109 @@
+"""Tests to ensure that the html5lib tree builder generates good trees."""
+
+import warnings
+
+try:
+    from bs4.builder import HTML5TreeBuilder
+    HTML5LIB_PRESENT = True
+except ImportError, e:
+    HTML5LIB_PRESENT = False
+from bs4.element import SoupStrainer
+from bs4.testing import (
+    HTML5TreeBuilderSmokeTest,
+    SoupTest,
+    skipIf,
+)
+
+@skipIf(
+    not HTML5LIB_PRESENT,
+    "html5lib seems not to be present, not testing its tree builder.")
+class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+    """See ``HTML5TreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return HTML5TreeBuilder()
+
+    def test_soupstrainer(self):
+        # The html5lib tree builder does not support SoupStrainers.
+        strainer = SoupStrainer("b")
+        markup = "<p>A <b>bold</b> statement.</p>"
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(
+            soup.decode(), self.document_for(markup))
+
+        self.assertTrue(
+            "the html5lib tree builder doesn't support parse_only" in
+            str(w[0].message))
+
+    def test_correctly_nested_tables(self):
+        """html5lib inserts <tbody> tags where other parsers don't."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tbody><tr><td>Here\'s another table:'
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            '</td></tr></tbody></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_xml_declaration_followed_by_doctype(self):
+        markup = '''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+   <p>foo</p>
+  </body>
+</html>'''
+        soup = self.soup(markup)
+        # Verify that we can reach the <p> tag; this means the tree is connected.
+        self.assertEqual(b"<p>foo</p>", soup.p.encode())
+
+    def test_reparented_markup(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
+        soup = self.soup(markup)
+        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+
+    def test_reparented_markup_ends_with_whitespace(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
+        soup = self.soup(markup)
+        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+    def test_reparented_markup_containing_identical_whitespace_nodes(self):
+        """Verify that we keep the two whitespace nodes in this
+        document distinct when reparenting the adjacent <tbody> tags.
+        """
+        markup = '<table> <tbody><tbody><ims></tbody> </table>'
+        soup = self.soup(markup)
+        space1, space2 = soup.find_all(string=' ')
+        tbody1, tbody2 = soup.find_all('tbody')
+        assert space1.next_element is tbody1
+        assert tbody2.next_element is space2
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
+
+    def test_cloned_multivalue_node(self):
+        markup = b"""<a class="my_class"><p></a>"""
+        soup = self.soup(markup)
+        a1, a2 = soup.find_all('a')
+        self.assertEqual(a1, a2)
+        assert a1 is not a2
--- a/lib/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@ -1,6 +1,8 @@
 """Tests to ensure that the html.parser tree builder generates good
 trees."""

+from pdb import set_trace
+import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder

@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass
+
+    def test_builder_is_pickled(self):
+        """Unlike most tree builders, HTMLParserTreeBuilder and will
+        be restored after pickling.
+        """
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+
+
--- a/lib/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@ -4,10 +4,15 @@ import re
 import warnings

 try:
-    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
+    import lxml.etree
    LXML_PRESENT = True
+    LXML_VERSION = lxml.etree.LXML_VERSION
 except ImportError, e:
    LXML_PRESENT = False
+    LXML_VERSION = (0,)
+
+if LXML_PRESENT:
+    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML

 from bs4 import (
    BeautifulSoup,
@ -41,27 +46,24 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
        self.assertSoupEquals(
            "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")

+    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+    # test if an old version of lxml is installed.
+
+    @skipIf(
+        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
+        "Skipping doctype test for old version of lxml to avoid segfault.")
+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        self.assertEqual("", doctype.strip())
+
    def test_beautifulstonesoup_is_xml_parser(self):
        # Make sure that the deprecated BSS class uses an xml builder
        # if one is installed.
-        with warnings.catch_warnings(record=False) as w:
+        with warnings.catch_warnings(record=True) as w:
            soup = BeautifulStoneSoup("<b />")
-            self.assertEqual(u"<b/>", unicode(soup.b))
-
-    def test_real_xhtml_document(self):
-        """lxml strips the XML definition from an XHTML doc, which is fine."""
-        markup = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head><title>Hello.</title></head>
-<body>Goodbye.</body>
-</html>"""
-        soup = self.soup(markup)
-        self.assertEqual(
-            soup.encode("utf-8").replace(b"\n", b''),
-            markup.replace(b'\n', b'').replace(
-                b'<?xml version="1.0" encoding="utf-8"?>', b''))
-
+        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))

@skipIf(
    not LXML_PRESENT,
@ -72,4 +74,3 @@ class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
    @property
    def default_builder(self):
        return LXMLTreeBuilderForXML()
-
--- a/lib/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-
 """Tests of Beautiful Soup as a whole."""

+from pdb import set_trace
+import logging
 import unittest
+import sys
+import tempfile
+
 from bs4 import (
    BeautifulSoup,
    BeautifulStoneSoup,
@ -13,7 +18,11 @@ from bs4.element import (
    NamespacedAttribute,
    )
 import bs4.dammit
-from bs4.dammit import EntitySubstitution, UnicodeDammit
+from bs4.dammit import (
+    EntitySubstitution,
+    UnicodeDammit,
+    EncodingDetector,
+)
 from bs4.testing import (
    SoupTest,
    skipIf,
@ -26,7 +35,48 @@ try:
 except ImportError, e:
    LXML_PRESENT = False

-class TestDeprecatedConstructorArguments(SoupTest):
+PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
+
+class TestConstructor(SoupTest):
+
+    def test_short_unicode_input(self):
+        data = u"<h1>éé</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"éé", soup.h1.string)
+
+    def test_embedded_null(self):
+        data = u"<h1>foo\0bar</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"foo\0bar", soup.h1.string)
+
+    def test_exclude_encodings(self):
+        utf8_data = u"Räksmörgås".encode("utf-8")
+        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual("windows-1252", soup.original_encoding)
+
+
+class TestWarnings(SoupTest):
+
+    def _no_parser_specified(self, s, is_there=True):
+        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+        self.assertTrue(v)
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html.parser")
+        self.assertEqual([], w)

    def test_parseOnlyThese_renamed_to_parse_only(self):
        with warnings.catch_warnings(record=True) as w:
@ -49,14 +99,52 @@ class TestDeprecatedConstructorArguments(SoupTest):
        self.assertRaises(
            TypeError, self.soup, "<a>", no_such_argument=True)

-    @skipIf(
-        not LXML_PRESENT,
-        "lxml not present, not testing BeautifulStoneSoup.")
-    def test_beautifulstonesoup(self):
+class TestWarnings(SoupTest):
+
+    def test_disk_file_warning(self):
+        filehandle = tempfile.NamedTemporaryFile()
+        filename = filehandle.name
+        try:
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            msg = str(w[0].message)
+            self.assertTrue("looks like a filename" in msg)
+        finally:
+            filehandle.close()
+
+        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
        with warnings.catch_warnings(record=True) as w:
-            soup = BeautifulStoneSoup("<markup>")
-            self.assertTrue(isinstance(soup, BeautifulSoup))
-            self.assertTrue("BeautifulStoneSoup class is deprecated")
+            soup = self.soup(filename)
+        self.assertEqual(0, len(w))
+
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be raised during
+        # execution..
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            soup = self.soup(u"http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message) 
+            for w in warning_list))
+

 class TestSelectiveParsing(SoupTest):

@ -120,9 +208,14 @@ class TestEntitySubstitution(unittest.TestCase):
    def test_xml_quoting_handles_ampersands(self):
        self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")

-    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
+    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
        self.assertEqual(
            self.sub.substitute_xml("&Aacute;T&T"),
+            "&amp;Aacute;T&amp;T")
+
+    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
            "&Aacute;T&amp;T")

    def test_quotes_not_html_substituted(self):
@ -137,22 +230,32 @@ class TestEncodingConversion(SoupTest):

    def setUp(self):
        super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+        self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
        self.utf8_data = self.unicode_data.encode("utf-8")
        # Just so you know what it looks like.
        self.assertEqual(
            self.utf8_data,
-            b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+            b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')

    def test_ascii_in_unicode_out(self):
        # ASCII input is converted to Unicode. The original_encoding
-        # attribute is set.
-        ascii = b"<foo>a</foo>"
-        soup_from_ascii = self.soup(ascii)
-        unicode_output = soup_from_ascii.decode()
-        self.assertTrue(isinstance(unicode_output, unicode))
-        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
-        self.assertEqual(soup_from_ascii.original_encoding, "ascii")
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            bs4.dammit.chardet_dammit = noop
+            ascii = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii)
+            unicode_output = soup_from_ascii.decode()
+            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet

    def test_unicode_in_unicode_out(self):
        # Unicode input is left alone. The original_encoding attribute
@ -174,9 +277,20 @@ class TestEncodingConversion(SoupTest):
        soup_from_unicode = self.soup(self.unicode_data)
        self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)

+    @skipIf(
+        PYTHON_3_PRE_3_2,
+        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
+    def test_attribute_name_containing_unicode_characters(self):
+        markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
+        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))

 class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of Unicode, Dammit."""
+    """Standalone tests of UnicodeDammit."""
+
+    def test_unicode_input(self):
+        markup = u"I'm already Unicode! \N{SNOWMAN}"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(dammit.unicode_markup, markup)

    def test_smart_quotes_to_unicode(self):
        markup = b"<foo>\x91\x92\x93\x94</foo>"
@ -203,33 +317,54 @@ class TestUnicodeDammit(unittest.TestCase):
            dammit.unicode_markup, """<foo>''""</foo>""")

    def test_detect_utf8(self):
-        utf8 = b"\xc3\xa9"
+        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
        dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.unicode_markup, u'\xe9')
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+

    def test_convert_hebrew(self):
        hebrew = b"\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
        self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')

    def test_dont_see_smart_quotes_where_there_are_none(self):
        utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
        self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)

    def test_ignore_inappropriate_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')

    def test_ignore_invalid_codecs(self):
        utf8_data = u"Räksmörgås".encode("utf-8")
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding, 'utf-8')
+            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_exclude_encodings(self):
+        # This is UTF-8.
+        utf8_data = u"Räksmörgås".encode("utf-8")
+
+        # But if we exclude UTF-8 from consideration, the guess is
+        # Windows-1252.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+        # And if we exclude that, there is no valid guess at all.
+        dammit = UnicodeDammit(
+            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+        self.assertEqual(dammit.original_encoding, None)
+
+    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+        detected = EncodingDetector(
+            b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
+        encodings = list(detected.encodings)
+        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings

    def test_detect_html5_style_meta_tag(self):

@ -261,26 +396,24 @@ class TestUnicodeDammit(unittest.TestCase):
        doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
        try:
-            bs4.dammit.chardet = None
-            with warnings.catch_warnings(record=True) as w:
-                dammit = UnicodeDammit(doc)
-                self.assertEqual(True, dammit.contains_replacement_characters)
-                self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+            def noop(str):
+                return None
+            bs4.dammit.chardet_dammit = noop
+            dammit = UnicodeDammit(doc)
+            self.assertEqual(True, dammit.contains_replacement_characters)
+            self.assertTrue(u"\ufffd" in dammit.unicode_markup)

-                soup = BeautifulSoup(doc, "html.parser")
-                self.assertTrue(soup.contains_replacement_characters)
-
-                msg = w[0].message
-                self.assertTrue(isinstance(msg, UnicodeWarning))
-                self.assertTrue("Some characters could not be decoded" in str(msg))
+            soup = BeautifulSoup(doc, "html.parser")
+            self.assertTrue(soup.contains_replacement_characters)
        finally:
-            bs4.dammit.chardet = chardet
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet

-    def test_sniffed_xml_encoding(self):
-        # A document written in UTF-16LE will be converted by a different
-        # code path that sniffs the byte order markers.
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
        data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
        dammit = UnicodeDammit(data)
        self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
--- a/lib/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
 methods tested here.
 """

+from pdb import set_trace
 import copy
 import pickle
 import re
@ -19,7 +20,10 @@ from bs4.builder import (
    HTMLParserTreeBuilder,
 )
 from bs4.element import (
+    PY3K,
    CData,
+    Comment,
+    Declaration,
    Doctype,
    NavigableString,
    SoupStrainer,
@ -67,7 +71,23 @@ class TestFind(TreeTest):

    def test_unicode_text_find(self):
        soup = self.soup(u'<h1>Räksmörgås</h1>')
-        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')
+        self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
+
+    def test_unicode_attribute_find(self):
+        soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
+        str(soup)
+        self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
+
+
+    def test_find_everything(self):
+        """Test an optimization that finds all tags."""
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        self.assertEqual(2, len(soup.find_all()))
+
+    def test_find_everything_with_name(self):
+        """Test an optimization that finds all tags with a given name."""
+        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
+        self.assertEqual(2, len(soup.find_all('a')))

 class TestFindAll(TreeTest):
    """Basic tests of the find_all() method."""
@ -76,6 +96,7 @@ class TestFindAll(TreeTest):
        """You can search the tree for text nodes."""
        soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
        # Exact match.
+        self.assertEqual(soup.find_all(string="bar"), [u"bar"])
        self.assertEqual(soup.find_all(text="bar"), [u"bar"])
        # Match any of a number of strings.
        self.assertEqual(
@ -114,6 +135,19 @@ class TestFindAll(TreeTest):
        # recursion.
        self.assertEqual([], soup.find_all(l))

+    def test_find_all_resultset(self):
+        """All find_all calls return a ResultSet"""
+        soup = self.soup("<a></a>")
+        result = soup.find_all("a")
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(True)
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(text="foo")
+        self.assertTrue(hasattr(result, "source"))
+
+
 class TestFindAllBasicNamespaces(TreeTest):

    def test_find_by_namespaced_name(self):
@ -188,6 +222,17 @@ class TestFindAllByName(TreeTest):
        self.assertSelects(
            tree.find_all(id_matches_name), ["Match 1.", "Match 2."])

+    def test_find_with_multi_valued_attribute(self):
+        soup = self.soup(
+            "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
+        )
+        r1 = soup.find('div', 'a d');
+        r2 = soup.find('div', re.compile(r'a d'));
+        r3, r4 = soup.find_all('div', ['a b', 'a d']);
+        self.assertEqual('3', r1.string)
+        self.assertEqual('3', r2.string)
+        self.assertEqual('1', r3.string)
+        self.assertEqual('3', r4.string)

 class TestFindAllByAttribute(TreeTest):

@ -228,18 +273,24 @@ class TestFindAllByAttribute(TreeTest):
        self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                           ["Name match."])

-        # Passing class='class2' would cause a syntax error.
        self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                           ["Class match."])

    def test_find_all_by_class(self):
-        # Passing in a string to 'attrs' will search the CSS class.
        tree = self.soup("""
                         <a class="1">Class 1.</a>
                         <a class="2">Class 2.</a>
                         <b class="1">Class 1.</b>
                         <c class="3 4">Class 3 and 4.</c>
                         """)
+
+        # Passing in the class_ keyword argument will search against
+        # the 'class' attribute.
+        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
+        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
+        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
+
+        # Passing in a string to 'attrs' will also search the CSS class.
        self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
        self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
        self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
@ -248,18 +299,16 @@ class TestFindAllByAttribute(TreeTest):
    def test_find_by_class_when_multiple_classes_present(self):
        tree = self.soup("<gar class='foo bar'>Found it</gar>")

-        attrs = { 'class' : re.compile("o") }
-        f = tree.find_all("gar", attrs=attrs)
+        f = tree.find_all("gar", class_=re.compile("o"))
        self.assertSelects(f, ["Found it"])

-        f = tree.find_all("gar", re.compile("a"))
+        f = tree.find_all("gar", class_=re.compile("a"))
        self.assertSelects(f, ["Found it"])

-        # Since the class is not the string "foo bar", but the two
-        # strings "foo" and "bar", this will not find anything.
-        attrs = { 'class' : re.compile("o b") }
-        f = tree.find_all("gar", attrs=attrs)
-        self.assertSelects(f, [])
+        # If the search fails to match the individual strings "foo" and "bar",
+        # it will be tried against the combined string "foo bar".
+        f = tree.find_all("gar", class_=re.compile("o b"))
+        self.assertSelects(f, ["Found it"])

    def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
        soup = self.soup("<a class='bar'>Found it</a>")
@ -283,8 +332,9 @@ class TestFindAllByAttribute(TreeTest):
        self.assertEqual([a, a2], soup.find_all("a", "foo"))
        self.assertEqual([a], soup.find_all("a", "bar"))

-        # If you specify the attribute as a string that contains a
+        # If you specify the class as a string that contains a
        # space, only that specific value will be found.
+        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
        self.assertEqual([a], soup.find_all("a", "foo bar"))
        self.assertEqual([], soup.find_all("a", "bar foo"))

@ -296,7 +346,7 @@ class TestFindAllByAttribute(TreeTest):
        strainer = SoupStrainer(attrs={'id' : 'first'})
        self.assertSelects(tree.find_all(strainer), ['Match.'])

-    def test_find_all_with_missing_atribute(self):
+    def test_find_all_with_missing_attribute(self):
        # You can pass in None as the value of an attribute to find_all.
        # This will match tags that do not have that attribute set.
        tree = self.soup("""<a id="1">ID present.</a>
@ -420,6 +470,7 @@ class TestParentOperations(TreeTest):

    def test_find_parent(self):
        self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
+        self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')

    def test_parent_of_text_element(self):
        text = self.tree.find(text="Start here")
@ -658,7 +709,7 @@ class TestTagCreation(SoupTest):

    def test_tag_inherits_self_closing_rules_from_builder(self):
        if XML_BUILDER_PRESENT:
-            xml_soup = BeautifulSoup("", "xml")
+            xml_soup = BeautifulSoup("", "lxml-xml")
            xml_br = xml_soup.new_tag("br")
            xml_p = xml_soup.new_tag("p")

@ -667,7 +718,7 @@ class TestTagCreation(SoupTest):
            self.assertEqual(b"<br/>", xml_br.encode())
            self.assertEqual(b"<p/>", xml_p.encode())

-        html_soup = BeautifulSoup("", "html")
+        html_soup = BeautifulSoup("", "html.parser")
        html_br = html_soup.new_tag("br")
        html_p = html_soup.new_tag("p")

@ -682,6 +733,12 @@ class TestTagCreation(SoupTest):
        self.assertEqual("foo", s)
        self.assertTrue(isinstance(s, NavigableString))

+    def test_new_string_can_create_navigablestring_subclass(self):
+        soup = self.soup("")
+        s = soup.new_string("foo", Comment)
+        self.assertEqual("foo", s)
+        self.assertTrue(isinstance(s, Comment))
+
 class TestTreeModification(SoupTest):

    def test_attribute_modification(self):
@ -737,6 +794,14 @@ class TestTreeModification(SoupTest):
        new_a = a.unwrap()
        self.assertEqual(a, new_a)

+    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
+        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
+        a = soup.a
+        a.extract()
+        self.assertEqual(None, a.parent)
+        self.assertRaises(ValueError, a.unwrap)
+        self.assertRaises(ValueError, a.replace_with, soup.c)
+
    def test_replace_tag_with_itself(self):
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = self.soup(text)
@ -881,20 +946,20 @@ class TestTreeModification(SoupTest):
        self.assertEqual(
            soup.decode(), self.document_for("QUUX<b>bar</b><a>foo</a>BAZ"))

-    def test_insert_after_raises_valueerror_if_after_has_no_meaning(self):
+    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
-        self.assertRaises(ValueError, soup.insert_after, tag)
+        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)

-    def test_insert_before_raises_valueerror_if_before_has_no_meaning(self):
+    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
-        self.assertRaises(ValueError, soup.insert_before, tag)
+        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)

    def test_replace_with(self):
@ -1031,6 +1096,31 @@ class TestTreeModification(SoupTest):
        self.assertEqual(foo_2, soup.a.string)
        self.assertEqual(bar_2, soup.b.string)

+    def test_extract_multiples_of_same_tag(self):
+        soup = self.soup("""
+<html>
+<head>
+<script>foo</script>
+</head>
+<body>
+ <script>bar</script>
+ <a></a>
+</body>
+<script>baz</script>
+</html>""")
+        [soup.script.extract() for i in soup.find_all("script")]
+        self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+
+
+    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
+        soup = self.soup(
+ '<html>\n'
+ '<body>hi</body>\n'
+ '</html>')
+        soup.find('body').extract()
+        self.assertEqual(None, soup.find('body'))
+
+
    def test_clear(self):
        """Tag.clear()"""
        soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
@ -1043,7 +1133,7 @@ class TestTreeModification(SoupTest):
        # clear using decompose()
        em = a.em
        a.clear(decompose=True)
-        self.assertFalse(hasattr(em, "contents"))
+        self.assertEqual(0, len(em.contents))

    def test_string_set(self):
        """Tag.string = 'string'"""
@ -1161,6 +1251,19 @@ class TestElementObjects(SoupTest):
        self.assertEqual(soup.a.get_text(","), "a,r, , t ")
        self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")

+    def test_get_text_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(soup.get_text(), "foobar")
+
+        self.assertEqual(
+            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
+        self.assertEqual(
+            soup.get_text(types=None), "fooIGNOREbar")
+
+    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
 class TestCDAtaListAttributes(SoupTest):

    """Testing cdata-list attributes like 'class'.
@ -1193,6 +1296,12 @@ class TestCDAtaListAttributes(SoupTest):
        # attribute for any other tag.
        self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])

+    def test_string_has_immutable_name_property(self):
+        string = self.soup("s").string
+        self.assertEqual(None, string.name)
+        def t():
+            string.name = 'foo'
+        self.assertRaises(AttributeError, t)

 class TestPersistence(SoupTest):
    "Testing features like pickle and deepcopy."
@ -1230,6 +1339,13 @@ class TestPersistence(SoupTest):
        copied = copy.deepcopy(self.tree)
        self.assertEqual(copied.decode(), self.tree.decode())

+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
+        encoding = soup.original_encoding
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual(encoding, copy.original_encoding)
+
    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
        html = u"<b>\N{SNOWMAN}</b>"
@ -1238,6 +1354,51 @@ class TestPersistence(SoupTest):
        loaded = pickle.loads(dumped)
        self.assertEqual(loaded.decode(), soup.decode())

+    def test_copy_navigablestring_is_not_attached_to_tree(self):
+        html = u"<b>Foo<a></a></b><b>Bar</b>"
+        soup = self.soup(html)
+        s1 = soup.find(string="Foo")
+        s2 = copy.copy(s1)
+        self.assertEqual(s1, s2)
+        self.assertEqual(None, s2.parent)
+        self.assertEqual(None, s2.next_element)
+        self.assertNotEqual(None, s1.next_sibling)
+        self.assertEqual(None, s2.next_sibling)
+        self.assertEqual(None, s2.previous_element)
+
+    def test_copy_navigablestring_subclass_has_same_type(self):
+        html = u"<b><!--Foo--></b>"
+        soup = self.soup(html)
+        s1 = soup.string
+        s2 = copy.copy(s1)
+        self.assertEqual(s1, s2)
+        self.assertTrue(isinstance(s2, Comment))
+
+    def test_copy_entire_soup(self):
+        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        soup_copy = copy.copy(soup)
+        self.assertEqual(soup, soup_copy)
+
+    def test_copy_tag_copies_contents(self):
+        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        div = soup.div
+        div_copy = copy.copy(div)
+
+        # The two tags look the same, and evaluate to equal.
+        self.assertEqual(unicode(div), unicode(div_copy))
+        self.assertEqual(div, div_copy)
+
+        # But they're not the same object.
+        self.assertFalse(div is div_copy)
+
+        # And they don't have the same relation to the parse tree. The
+        # copy is not associated with a parse tree at all.
+        self.assertEqual(None, div_copy.parent)
+        self.assertEqual(None, div_copy.previous_element)
+        self.assertEqual(None, div_copy.find(string='Bar').next_element)
+        self.assertNotEqual(None, div.find(string='Bar').next_element)

 class TestSubstitutions(SoupTest):

@ -1305,8 +1466,34 @@ class TestSubstitutions(SoupTest):
        expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
        self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))

+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  ")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        self.assertEqual(
+            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            soup.div.prettify())
+
    def test_prettify_accepts_formatter(self):
-        soup = BeautifulSoup("<html><body>foo</body></html>")
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter = lambda x: x.upper())
        self.assertTrue("FOO" in pretty)

@ -1403,6 +1590,14 @@ class TestEncoding(SoupTest):
        self.assertEqual(
            u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())

+    def test_repr(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        if PY3K:
+            self.assertEqual(html, repr(soup))
+        else:
+            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+
 class TestNavigableStringSubclasses(SoupTest):

    def test_cdata(self):
@ -1441,6 +1636,9 @@ class TestNavigableStringSubclasses(SoupTest):
        soup.insert(1, doctype)
        self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")

+    def test_declaration(self):
+        d = Declaration("foo")
+        self.assertEqual("<?foo?>", d.output_ready())

 class TestSoupSelector(TreeTest):

@ -1453,8 +1651,8 @@ class TestSoupSelector(TreeTest):
 <link rel="stylesheet" href="blah.css" type="text/css" id="l1">
 </head>
 <body>
-
-<div id="main">
+<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
+<div id="main" class="fancy">
 <div id="inner">
 <h1 id="header1">An H1</h1>
 <p>Some text</p>
@ -1471,8 +1669,18 @@ class TestSoupSelector(TreeTest):
 <a href="#" id="s2a1">span2a1</a>
 </span>
 <span class="span3"></span>
+<custom-dashed-tag class="dashed" id="dash2"/>
+<div data-tag="dashedvalue" id="data1"/>
 </span>
 </div>
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
 <p lang="en" id="lang-en">English</p>
 <p lang="en-gb" id="lang-en-gb">English UK</p>
 <p lang="en-us" id="lang-en-us">English US</p>
@ -1484,10 +1692,10 @@ class TestSoupSelector(TreeTest):
 """

    def setUp(self):
-        self.soup = BeautifulSoup(self.HTML)
+        self.soup = BeautifulSoup(self.HTML, 'html.parser')

-    def assertSelects(self, selector, expected_ids):
-        el_ids = [el['id'] for el in self.soup.select(selector)]
+    def assertSelects(self, selector, expected_ids, **kwargs):
+        el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
        el_ids.sort()
        expected_ids.sort()
        self.assertEqual(expected_ids, el_ids,
@ -1510,23 +1718,52 @@ class TestSoupSelector(TreeTest):

    def test_one_tag_many(self):
        els = self.soup.select('div')
-        self.assertEqual(len(els), 3)
+        self.assertEqual(len(els), 4)
        for div in els:
            self.assertEqual(div.name, 'div')

+        el = self.soup.select_one('div')
+        self.assertEqual('main', el['id'])
+
+    def test_select_one_returns_none_if_no_match(self):
+        match = self.soup.select_one('nonexistenttag')
+        self.assertEqual(None, match)
+
+
    def test_tag_in_tag_one(self):
        els = self.soup.select('div div')
-        self.assertSelects('div div', ['inner'])
+        self.assertSelects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
-            self.assertSelects(selector, ['main', 'inner', 'footer'])
+            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
+
+
+    def test_limit(self):
+        self.assertSelects('html div', ['main'], limit=1)
+        self.assertSelects('html body div', ['inner', 'main'], limit=2)
+        self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'],
+                           limit=10)

    def test_tag_no_match(self):
        self.assertEqual(len(self.soup.select('del')), 0)

    def test_invalid_tag(self):
-        self.assertEqual(len(self.soup.select('tag%t')), 0)
+        self.assertRaises(ValueError, self.soup.select, 'tag%t')
+
+    def test_select_dashed_tag_ids(self):
+        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
+
+    def test_select_dashed_by_id(self):
+        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
+        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
+        self.assertEqual(dashed[0]['id'], 'dash2')
+
+    def test_dashed_tag_text(self):
+        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
+
+    def test_select_dashed_matches_find_all(self):
+        self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))

    def test_header_tags(self):
        self.assertSelectMultiple(
@ -1559,7 +1796,7 @@ class TestSoupSelector(TreeTest):
        for el in els:
            self.assertEqual(el.name, 'p')
        self.assertEqual(els[1]['class'], ['onep'])
-        self.assertFalse(els[0].has_key('class'))
+        self.assertFalse(els[0].has_attr('class'))

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
@ -1579,6 +1816,9 @@ class TestSoupSelector(TreeTest):
        self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
        self.assertSelects('.s1 > a span', ['s1a2s1'])

+    def test_child_selector_id(self):
+        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
+
    def test_attribute_equals(self):
        self.assertSelectMultiple(
            ('p[class="onep"]', ['p1']),
@ -1625,6 +1865,7 @@ class TestSoupSelector(TreeTest):
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
+            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
@ -1632,8 +1873,8 @@ class TestSoupSelector(TreeTest):
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
-            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
-            ('div[id$="1"]', []),
+            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

@ -1646,7 +1887,6 @@ class TestSoupSelector(TreeTest):
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
-            ('a[href*="http://"]', ['bob', 'me']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
@ -1655,8 +1895,8 @@ class TestSoupSelector(TreeTest):
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
-            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
-            ('div[id*="1"]', []),
+            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
@ -1664,6 +1904,7 @@ class TestSoupSelector(TreeTest):
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
+            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
@ -1683,8 +1924,52 @@ class TestSoupSelector(TreeTest):
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
+            ('div[data-tag]', ['data1'])
        )

+    def test_quoted_space_in_selector_name(self):
+        html = """<div style="display: wrong">nope</div>
+        <div style="display: right">yes</div>
+        """
+        soup = BeautifulSoup(html, 'html.parser')
+        [chosen] = soup.select('div[style="display: right"]')
+        self.assertEqual("yes", chosen.string)
+
+    def test_unsupported_pseudoclass(self):
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
+
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:nth-of-type(a)")
+
+
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Another')
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        self.assertEqual(len(els), 0)
+
+        # Pass in an invalid value.
+        self.assertRaises(
+            ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
@ -1692,4 +1977,68 @@ class TestSoupSelector(TreeTest):
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
-        self.assertSelectsIDs(selected, ['inner'])
+        self.assertSelectsIDs(selected, ['inner', 'data1'])
+
+    def test_overspecified_child_id(self):
+        self.assertSelects(".fancy #inner", ['inner'])
+        self.assertSelects(".normal #inner", [])
+
+    def test_adjacent_sibling_selector(self):
+        self.assertSelects('#p1 + h2', ['header2'])
+        self.assertSelects('#p1 + h2 + p', ['pmulti'])
+        self.assertSelects('#p1 + #header2 + .class1', ['pmulti'])
+        self.assertEqual([], self.soup.select('#p1 + p'))
+
+    def test_general_sibling_selector(self):
+        self.assertSelects('#p1 ~ h2', ['header2', 'header3'])
+        self.assertSelects('#p1 ~ #header2', ['header2'])
+        self.assertSelects('#p1 ~ h2 + a', ['me'])
+        self.assertSelects('#p1 ~ h2 + [rel="me"]', ['me'])
+        self.assertEqual([], self.soup.select('#inner ~ h2'))
+
+    def test_dangling_combinator(self):
+        self.assertRaises(ValueError, self.soup.select, 'h1 >')
+
+    def test_sibling_combinator_wont_select_same_tag_twice(self):
+        self.assertSelects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])
+
+    # Test the selector grouping operator (the comma)
+    def test_multiple_select(self):
+        self.assertSelects('x, y', ['xid', 'yid'])
+
+    def test_multiple_select_with_no_space(self):
+        self.assertSelects('x,y', ['xid', 'yid'])
+
+    def test_multiple_select_with_more_space(self):
+        self.assertSelects('x,    y', ['xid', 'yid'])
+
+    def test_multiple_select_duplicated(self):
+        self.assertSelects('x, x', ['xid'])
+
+    def test_multiple_select_sibling(self):
+        self.assertSelects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])
+
+    def test_multiple_select_tag_and_direct_descendant(self):
+        self.assertSelects('x, y > z', ['xid', 'zidb'])
+
+    def test_multiple_select_direct_descendant_and_tags(self):
+        self.assertSelects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
+
+    def test_multiple_select_indirect_descendant(self):
+        self.assertSelects('div x,y,  z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])
+
+    def test_invalid_multiple_select(self):
+        self.assertRaises(ValueError, self.soup.select, ',x, y')
+        self.assertRaises(ValueError, self.soup.select, 'x,,y')
+
+    def test_multiple_select_attrs(self):
+        self.assertSelects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])
+
+    def test_multiple_select_ids(self):
+        self.assertSelects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])
+
+    def test_multiple_select_nested(self):
+        self.assertSelects('body > div > x, y > z', ['xid', 'zidb'])
+
+
+
--- a/lib/cherrypy/LICENSE.txt
+++ b/lib/cherrypy/LICENSE.txt
--- a/lib/cherrypy/init.py
+++ b/lib/cherrypy/init.py
--- a/lib/cherrypy/_cpchecker.py
+++ b/lib/cherrypy/_cpchecker.py
--- a/lib/cherrypy/_cpcompat.py
+++ b/lib/cherrypy/_cpcompat.py
--- a/lib/cherrypy/_cpconfig.py
+++ b/lib/cherrypy/_cpconfig.py
--- a/lib/cherrypy/_cpdispatch.py
+++ b/lib/cherrypy/_cpdispatch.py
--- a/lib/cherrypy/_cperror.py
+++ b/lib/cherrypy/_cperror.py
--- a/lib/cherrypy/_cplogging.py
+++ b/lib/cherrypy/_cplogging.py
--- a/lib/cherrypy/_cpmodpy.py
+++ b/lib/cherrypy/_cpmodpy.py
--- a/lib/cherrypy/_cpnative_server.py
+++ b/lib/cherrypy/_cpnative_server.py
--- a/lib/cherrypy/_cpreqbody.py
+++ b/lib/cherrypy/_cpreqbody.py
--- a/lib/cherrypy/_cprequest.py
+++ b/lib/cherrypy/_cprequest.py
--- a/lib/cherrypy/_cpserver.py
+++ b/lib/cherrypy/_cpserver.py
--- a/lib/cherrypy/_cpthreadinglocal.py
+++ b/lib/cherrypy/_cpthreadinglocal.py
--- a/lib/cherrypy/_cptools.py
+++ b/lib/cherrypy/_cptools.py
--- a/lib/cherrypy/_cptree.py
+++ b/lib/cherrypy/_cptree.py
--- a/lib/cherrypy/_cpwsgi.py
+++ b/lib/cherrypy/_cpwsgi.py
--- a/lib/cherrypy/_cpwsgi_server.py
+++ b/lib/cherrypy/_cpwsgi_server.py
--- a/lib/cherrypy/cherryd
+++ b/lib/cherrypy/cherryd
--- a/lib/cherrypy/favicon.ico
+++ b/lib/cherrypy/favicon.ico
--- a/lib/cherrypy/lib/init.py
+++ b/lib/cherrypy/lib/init.py
--- a/lib/cherrypy/lib/auth.py
+++ b/lib/cherrypy/lib/auth.py
--- a/lib/cherrypy/lib/auth_basic.py
+++ b/lib/cherrypy/lib/auth_basic.py
--- a/lib/cherrypy/lib/auth_digest.py
+++ b/lib/cherrypy/lib/auth_digest.py
--- a/lib/cherrypy/lib/caching.py
+++ b/lib/cherrypy/lib/caching.py
--- a/lib/cherrypy/lib/covercp.py
+++ b/lib/cherrypy/lib/covercp.py
--- a/lib/cherrypy/lib/cpstats.py
+++ b/lib/cherrypy/lib/cpstats.py
--- a/lib/cherrypy/lib/cptools.py
+++ b/lib/cherrypy/lib/cptools.py
--- a/lib/cherrypy/lib/encoding.py
+++ b/lib/cherrypy/lib/encoding.py
--- a/lib/cherrypy/lib/http.py
+++ b/lib/cherrypy/lib/http.py
--- a/lib/cherrypy/lib/httpauth.py
+++ b/lib/cherrypy/lib/httpauth.py
--- a/lib/cherrypy/lib/httputil.py
+++ b/lib/cherrypy/lib/httputil.py
--- a/lib/cherrypy/lib/jsontools.py
+++ b/lib/cherrypy/lib/jsontools.py
--- a/lib/cherrypy/lib/profiler.py
+++ b/lib/cherrypy/lib/profiler.py
--- a/lib/cherrypy/lib/reprconf.py
+++ b/lib/cherrypy/lib/reprconf.py
--- a/lib/cherrypy/lib/sessions.py
+++ b/lib/cherrypy/lib/sessions.py
--- a/lib/cherrypy/lib/static.py
+++ b/lib/cherrypy/lib/static.py
--- a/lib/cherrypy/lib/xmlrpc.py
+++ b/lib/cherrypy/lib/xmlrpc.py
--- a/lib/cherrypy/process/init.py
+++ b/lib/cherrypy/process/init.py
--- a/lib/cherrypy/process/plugins.py
+++ b/lib/cherrypy/process/plugins.py
--- a/lib/cherrypy/process/servers.py
+++ b/lib/cherrypy/process/servers.py
--- a/lib/cherrypy/process/win32.py
+++ b/lib/cherrypy/process/win32.py
--- a/lib/cherrypy/process/wspbus.py
+++ b/lib/cherrypy/process/wspbus.py
--- a/lib/cherrypy/scaffold/init.py
+++ b/lib/cherrypy/scaffold/init.py
--- a/lib/cherrypy/scaffold/apache-fcgi.conf
+++ b/lib/cherrypy/scaffold/apache-fcgi.conf
--- a/lib/cherrypy/scaffold/example.conf
+++ b/lib/cherrypy/scaffold/example.conf
--- a/lib/cherrypy/scaffold/site.conf
+++ b/lib/cherrypy/scaffold/site.conf
--- a/lib/cherrypy/scaffold/static/made_with_cherrypy_small.png
+++ b/lib/cherrypy/scaffold/static/made_with_cherrypy_small.png
--- a/lib/cherrypy/test/init.py
+++ b/lib/cherrypy/test/init.py
--- a/lib/cherrypy/test/_test_decorators.py
+++ b/lib/cherrypy/test/_test_decorators.py
--- a/lib/cherrypy/test/_test_states_demo.py
+++ b/lib/cherrypy/test/_test_states_demo.py
--- a/lib/cherrypy/test/benchmark.py
+++ b/lib/cherrypy/test/benchmark.py
--- a/lib/cherrypy/test/checkerdemo.py
+++ b/lib/cherrypy/test/checkerdemo.py
--- a/lib/cherrypy/test/fastcgi.conf
+++ b/lib/cherrypy/test/fastcgi.conf
--- a/lib/cherrypy/test/fcgi.conf
+++ b/lib/cherrypy/test/fcgi.conf
--- a/lib/cherrypy/test/helper.py
+++ b/lib/cherrypy/test/helper.py
--- a/lib/cherrypy/test/logtest.py
+++ b/lib/cherrypy/test/logtest.py
--- a/lib/cherrypy/test/modfastcgi.py
+++ b/lib/cherrypy/test/modfastcgi.py
--- a/lib/cherrypy/test/modfcgid.py
+++ b/lib/cherrypy/test/modfcgid.py
--- a/lib/cherrypy/test/modpy.py
+++ b/lib/cherrypy/test/modpy.py
--- a/Show More
+++ b/Show More