fixed missing libs

evilhero 2012-09-09 17:59:46 -04:00
parent 27011e950f
commit 633c8617e4
2361 changed files with 38285 additions and 0 deletions

lib/bs4/__init__.py Normal file

@@ -0,0 +1,355 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
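# For example, u'  \n  '.translate(STRIP_ASCII_SPACES) == u'' (all ASCII
# whitespace), while u'\xa0'.translate(STRIP_ASCII_SPACES) == u'\xa0'
# (a non-breaking space survives, so the node is left alone).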
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise ValueError(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o):
"""Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
self.currentTag.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].prefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
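# Illustrative trace of the stack discipline above: parsing "<a><b></a>"
# pushes [document], <a>, <b>; handle_endtag('a') calls _popToTag('a'),
# which pops <b> and then <a>, implicitly closing the unclosed <b>.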
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()

lib/bs4/builder/__init__.py Normal file

@@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
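# Sketch of the lookup rules above (builder names here are hypothetical):
#
#   >>> registry = TreeBuilderRegistry()
#   >>> registry.register(SomeHTMLBuilder)   # features = ['html', 'fast']
#   >>> registry.lookup('html') is SomeHTMLBuilder
#   True
#   >>> print registry.lookup('xml')         # no builder advertises 'xml'
#   None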
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLTreeBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
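# Example: with HTMLTreeBuilder (below), can_be_empty_element('br') is
# True and can_be_empty_element('p') is False; with this base class,
# empty_element_tags is None and every tag name returns True.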
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
# tree. On the second call the attribute value
# here is already a list. If this happens,
# leave the value alone rather than trying to
# split it again.
values = value
attrs[cdata_list_attr] = values
return attrs
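# Example: for tag_name='p' and attrs={'class': 'foo bar', 'id': 'x'},
# an HTMLTreeBuilder returns {'class': ['foo', 'bar'], 'id': 'x'},
# because 'class' is listed under '*' in its cdata_list_attributes.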
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
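# Example: a tag parsed from <meta charset="utf8"> gets its 'charset'
# value wrapped in a CharsetMetaAttributeValue, so that when the tree is
# later encoded to some other encoding, the attribute is rewritten to
# name that encoding instead of the original "utf8".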
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
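# Net effect of the registration order above, assuming lxml and html5lib
# are both importable: builder_registry.lookup('html') prefers lxml's
# builder, then html5lib's, and falls back to HTMLParserTreeBuilder when
# neither library is present.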

lib/bs4/builder/_html5lib.py Normal file

@@ -0,0 +1,222 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
from bs4 import BeautifulSoup  # local import avoids a circular import at module load time
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# XXX This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
else:
self.element.append(node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(
TextNode(child, self.soup))
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError

lib/bs4/builder/_htmlparser.py Normal file

@@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
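# Worked examples: sys.version_info of (3, 2, 3) or (3, 3, 0) yields
# True; (3, 2, 2) yields False; any Python 2.x yields False, so 'strict'
# is never passed to the constructor there.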
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
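# Example: the markup '&#xe9;' reaches this handler as name='xe9', so
# int('e9', 16) == 233 and unichr(233) == u'\xe9' (LATIN SMALL LETTER E
# WITH ACUTE); '&#233;' produces the same character via the decimal path.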
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

lib/bs4/builder/_lxml.py Normal file

@@ -0,0 +1,179 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
@property
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
if parser is None:
# Use the default parser.
parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = None
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
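# Example: _getNsTag('{http://www.w3.org/1999/xhtml}body') returns
# ('http://www.w3.org/1999/xhtml', 'body'), while a prefixless 'body'
# returns (None, 'body').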
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = None
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
# There are no new namespaces for this tag, but namespaces
# are in play, so we need a separate tag stack to know
# when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
namespace, name = self._getNsTag(name)
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
@property
def default_parser(self):
return etree.HTMLParser
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment

lib/bs4/dammit.py Normal file

@@ -0,0 +1,792 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import warnings
# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
# or 'apt-get install python-chardet'
# or 'easy_install chardet'
try:
import chardet
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
chardet = None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
def quoted_attribute_value(self, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
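# Example: substitute_xml('AT&T & <x>', make_quoted_attribute=True)
# returns '"AT&amp;T &amp; &lt;x&gt;"'; an existing entity such as
# '&amp;' is left untouched by the negative lookahead in
# BARE_AMPERSAND_OR_BRACKET.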
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
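# Example: substitute_html(u'caf\xe9') returns u'caf&eacute;', since
# U+00E9 appears in codepoint2name; plain ASCII text passes through
# unchanged.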
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
new_markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, is_html)
self.markup = new_markup
u = None
if new_markup != markup:
# _detectEncoding modified the markup, then converted it to
# Unicode and then to UTF-8. So convert it from UTF-8.
u = self._convert_from("utf8")
self.original_encoding = sniffed_encoding
if not u:
for proposed_encoding in (
override_encodings + [document_encoding, sniffed_encoding]):
if proposed_encoding is not None:
u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
u = self._convert_from(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convert_from(proposed_encoding)
if u:
break
# As an absolute last resort, try the encodings again with
# character replacement.
if not u:
for proposed_encoding in (
override_encodings + [
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
if proposed_encoding != "ascii":
u = self._convert_from(proposed_encoding, "replace")
if u is not None:
warnings.warn(
UnicodeWarning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER."))
self.contains_replacement_characters = True
break
# We could at this point force it to ASCII, but that would
# destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
(xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
except:
xml_encoding_match = None
xml_encoding_match = xml_encoding_re.match(xml_data)
if not xml_encoding_match and is_html:
xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
b'\x9f': ('Yuml', '178'),}  # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
b'\xa9' : '(c)',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
b'\xb4' : ("'", 'acute'),
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
0x91 : b'\xe2\x80\x98', # ‘
0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
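# --- Illustrative usage (editor's sketch, not part of the library) ---
# detwingle() operates on bytestrings, so it must run *before* decoding.
# Mirroring the test suite: glue UTF-8 and Windows-1252 bytes together,
# then repair the result so it decodes cleanly as UTF-8.
#
#   from bs4.dammit import UnicodeDammit
#
#   utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
#   win1252 = u"\N{LEFT DOUBLE QUOTATION MARK}Hi!".encode("windows-1252")
#   doc = utf8 + win1252 + utf8       # mixed document; doc.decode("utf8") fails
#   fixed = UnicodeDammit.detwingle(doc)
#   fixed.decode("utf8")              # now succeeds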

1349
lib/bs4/element.py Normal file

File diff suppressed because it is too large Load Diff

532
lib/bs4/testing.py Normal file
View File

@ -0,0 +1,532 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
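# Editor's sketch: assertSoupEquals() round-trips markup through the default
# builder and compares against document_for(compare_parsed_to), so a subclass
# can assert parser normalization in one line, e.g.:
#
#   self.assertSoupEquals("<p>", "<p></p>")   # unclosed tag gets closed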
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator
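# Usage sketch: apply skipIf at class level to disable an entire suite when
# an optional parser is missing, as the html5lib and lxml test modules do:
#
#   @skipIf(
#       not HTML5LIB_PRESENT,
#       "html5lib seems not to be present, not testing its tree builder.")
#   class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
#       ...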

View File

@ -0,0 +1 @@
"The beautifulsoup tests."

View File

@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)

View File

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

View File

@ -0,0 +1,58 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder()
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")

View File

@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass

View File

@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=False) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
def test_real_xhtml_document(self):
"""lxml strips the XML definition from an XHTML doc, which is fine."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b''),
markup.replace(b'\n', b'').replace(
b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

368
lib/bs4/tests/test_soup.py Normal file
View File

@ -0,0 +1,368 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import unittest
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
@skipIf(
not LXML_PRESENT,
"lxml not present, not testing BeautifulStoneSoup.")
def test_beautifulstonesoup(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<markup>")
self.assertTrue(isinstance(soup, BeautifulSoup))
self.assertTrue("BeautifulStoneSoup class is deprecated")
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding, "ascii")
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding, 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding, 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet
try:
bs4.dammit.chardet = None
with warnings.catch_warnings(record=True) as w:
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
msg = w[0].message
self.assertTrue(isinstance(msg, UnicodeWarning))
self.assertTrue("Some characters could not be decoded" in str(msg))
finally:
bs4.dammit.chardet = chardet
def test_sniffed_xml_encoding(self):
# A document written in UTF-16LE will be converted by a different
# code path that sniffs the byte order markers.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through UnicodeDammit.detwingle(), it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamespacedAttribute(SoupTest):
def test_name_may_be_none(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

1695
lib/bs4/tests/test_tree.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
Metadata-Version: 1.0
Name: feedparser
Version: 5.1.1
Summary: Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
Home-page: http://code.google.com/p/feedparser/
Author: Kurt McKee
Author-email: contactme@kurtmckee.org
License: UNKNOWN
Download-URL: http://code.google.com/p/feedparser/
Description: UNKNOWN
Keywords: atom,cdf,feed,parser,rdf,rss
Platform: POSIX
Platform: Windows
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.4
Classifier: Programming Language :: Python :: 2.5
Classifier: Programming Language :: Python :: 2.6
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.0
Classifier: Programming Language :: Python :: 3.1
Classifier: Programming Language :: Python :: 3.2
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup :: XML

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
feedparser

4009
lib/feedparser/feedparser.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,806 @@
#!/usr/bin/env python
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__license__ = """
Copyright (c) 2010-2012 Kurt McKee <contactme@kurtmckee.org>
Copyright (c) 2004-2008 Mark Pilgrim
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
import codecs
import datetime
import glob
import operator
import os
import posixpath
import pprint
import re
import struct
import sys
import threading
import time
import unittest
import urllib
import warnings
import zlib
import BaseHTTPServer
import SimpleHTTPServer
import feedparser
if not feedparser._XML_AVAILABLE:
sys.stderr.write('No XML parsers available, unit testing can not proceed\n')
sys.exit(1)
_UTF32_AVAILABLE = feedparser._UTF32_AVAILABLE
_s2bytes = feedparser._s2bytes
_l2bytes = feedparser._l2bytes
#---------- custom HTTP server (used to serve test feeds) ----------
_PORT = 8097 # not really configurable, must match hardcoded port in tests
_HOST = '127.0.0.1' # also not really configurable
class FeedParserTestRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
headers_re = re.compile(_s2bytes(r"^Header:\s+([^:]+):(.+)$"), re.MULTILINE)
def send_head(self):
"""Send custom headers defined in test case
Example:
<!--
Header: Content-type: application/atom+xml
Header: X-Foo: bar
-->
"""
# Short-circuit the HTTP status test `test_redirect_to_304()`
if self.path == '/-/return-304.xml':
self.send_response(304)
self.send_header('Content-type', 'text/xml')
self.end_headers()
return feedparser._StringIO(u''.encode('utf-8'))
path = self.translate_path(self.path)
# the compression tests' filenames determine the header sent
if self.path.startswith('/tests/compression'):
if self.path.endswith('gz'):
headers = {'Content-Encoding': 'gzip'}
else:
headers = {'Content-Encoding': 'deflate'}
else:
headers = dict([(k.decode('utf-8'), v.decode('utf-8').strip()) for k, v in self.headers_re.findall(open(path, 'rb').read())])
f = open(path, 'rb')
if (self.headers.get('if-modified-since') == headers.get('Last-Modified', 'nom')) \
or (self.headers.get('if-none-match') == headers.get('ETag', 'nomatch')):
status = 304
else:
status = 200
headers.setdefault('Status', status)
self.send_response(int(headers['Status']))
headers.setdefault('Content-type', self.guess_type(path))
self.send_header("Content-type", headers['Content-type'])
self.send_header("Content-Length", str(os.stat(f.name)[6]))
for k, v in headers.items():
if k not in ('Status', 'Content-type'):
self.send_header(k, v)
self.end_headers()
return f
def log_request(self, *args):
pass
class FeedParserTestServer(threading.Thread):
"""HTTP Server that runs in a thread and handles a predetermined number of requests"""
def __init__(self, requests):
threading.Thread.__init__(self)
self.requests = requests
self.ready = threading.Event()
def run(self):
self.httpd = BaseHTTPServer.HTTPServer((_HOST, _PORT), FeedParserTestRequestHandler)
self.ready.set()
while self.requests:
self.httpd.handle_request()
self.requests -= 1
self.ready.clear()
#---------- dummy test case class (test methods are added dynamically) ----------
unicode1_re = re.compile(_s2bytes(" u'"))
unicode2_re = re.compile(_s2bytes(' u"'))
# _bytes is only used in everythingIsUnicode().
# In Python 2 it's str, and in Python 3 it's bytes.
_bytes = type(_s2bytes(''))
def everythingIsUnicode(d):
"""Takes a dictionary, recursively verifies that every value is unicode"""
for k, v in d.iteritems():
if isinstance(v, dict) and k != 'headers':
if not everythingIsUnicode(v):
return False
elif isinstance(v, list):
for i in v:
if isinstance(i, dict) and not everythingIsUnicode(i):
return False
elif isinstance(i, _bytes):
return False
elif isinstance(v, _bytes):
return False
return True
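# Quick sketch of the contract (exercised by TestEverythingIsUnicode below):
#   everythingIsUnicode({'a': u'a', 'b': [u'b']})   -> True
#   everythingIsUnicode({'a': _s2bytes('a')})       -> False
# Nested dicts stored under a 'headers' key are deliberately not recursed into.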
def failUnlessEval(self, xmlfile, evalString, msg=None):
"""Fail unless eval(evalString, env)"""
env = feedparser.parse(xmlfile)
try:
if not eval(evalString, globals(), env):
failure=(msg or 'not eval(%s) \nWITH env(%s)' % (evalString, pprint.pformat(env)))
raise self.failureException, failure
if not everythingIsUnicode(env):
raise self.failureException, "not everything is unicode \nWITH env(%s)" % (pprint.pformat(env), )
except SyntaxError:
# Python 3 doesn't have the `u""` syntax, so evalString needs to be modified,
# which will require the failure message to be updated
evalString = re.sub(unicode1_re, _s2bytes(" '"), evalString)
evalString = re.sub(unicode2_re, _s2bytes(' "'), evalString)
if not eval(evalString, globals(), env):
failure=(msg or 'not eval(%s) \nWITH env(%s)' % (evalString, pprint.pformat(env)))
raise self.failureException, failure
class BaseTestCase(unittest.TestCase):
failUnlessEval = failUnlessEval
class TestCase(BaseTestCase):
pass
class TestTemporaryFallbackBehavior(unittest.TestCase):
"These tests are temporarily here because of issues 310 and 328"
def test_issue_328_fallback_behavior(self):
warnings.filterwarnings('error')
d = feedparser.FeedParserDict()
d['published'] = u'pub string'
d['published_parsed'] = u'pub tuple'
d['updated'] = u'upd string'
d['updated_parsed'] = u'upd tuple'
# Ensure that `updated` doesn't map to `published` when it exists
self.assertTrue('published' in d)
self.assertTrue('published_parsed' in d)
self.assertTrue('updated' in d)
self.assertTrue('updated_parsed' in d)
self.assertEqual(d['published'], 'pub string')
self.assertEqual(d['published_parsed'], 'pub tuple')
self.assertEqual(d['updated'], 'upd string')
self.assertEqual(d['updated_parsed'], 'upd tuple')
d = feedparser.FeedParserDict()
d['published'] = u'pub string'
d['published_parsed'] = u'pub tuple'
# Ensure that `updated` doesn't actually exist
self.assertTrue('updated' not in d)
self.assertTrue('updated_parsed' not in d)
# Ensure that accessing `updated` throws a DeprecationWarning
try:
d['updated']
except DeprecationWarning:
# Expected behavior
pass
else:
# Wrong behavior
self.assertEqual(True, False)
try:
d['updated_parsed']
except DeprecationWarning:
# Expected behavior
pass
else:
# Wrong behavior
self.assertEqual(True, False)
# Ensure that `updated` maps to `published`
warnings.filterwarnings('ignore')
self.assertEqual(d['updated'], u'pub string')
self.assertEqual(d['updated_parsed'], u'pub tuple')
warnings.resetwarnings()
class TestEverythingIsUnicode(unittest.TestCase):
"Ensure that `everythingIsUnicode()` is working appropriately"
def test_everything_is_unicode(self):
self.assertTrue(everythingIsUnicode(
{'a': u'a', 'b': [u'b', {'c': u'c'}], 'd': {'e': u'e'}}
))
def test_not_everything_is_unicode(self):
self.assertFalse(everythingIsUnicode({'a': _s2bytes('a')}))
self.assertFalse(everythingIsUnicode({'a': [_s2bytes('a')]}))
self.assertFalse(everythingIsUnicode({'a': {'b': _s2bytes('b')}}))
self.assertFalse(everythingIsUnicode({'a': [{'b': _s2bytes('b')}]}))
class TestLooseParser(BaseTestCase):
"Test the sgmllib-based parser by manipulating feedparser " \
"into believing no XML parsers are installed"
def __init__(self, arg):
unittest.TestCase.__init__(self, arg)
self._xml_available = feedparser._XML_AVAILABLE
def setUp(self):
feedparser._XML_AVAILABLE = 0
def tearDown(self):
feedparser._XML_AVAILABLE = self._xml_available
class TestStrictParser(BaseTestCase):
pass
class TestMicroformats(BaseTestCase):
pass
class TestEncodings(BaseTestCase):
pass
class TestFeedParserDict(unittest.TestCase):
"Ensure that FeedParserDict returns values as expected and won't crash"
def setUp(self):
self.d = feedparser.FeedParserDict()
def _check_key(self, k):
self.assertTrue(k in self.d)
self.assertTrue(hasattr(self.d, k))
self.assertEqual(self.d[k], 1)
self.assertEqual(getattr(self.d, k), 1)
def _check_no_key(self, k):
self.assertTrue(k not in self.d)
self.assertTrue(not hasattr(self.d, k))
def test_empty(self):
keys = (
'a','entries', 'id', 'guid', 'summary', 'subtitle', 'description',
'category', 'enclosures', 'license', 'categories',
)
for k in keys:
self._check_no_key(k)
self.assertTrue('items' not in self.d)
self.assertTrue(hasattr(self.d, 'items')) # dict.items() exists
def test_neutral(self):
self.d['a'] = 1
self._check_key('a')
def test_single_mapping_target_1(self):
self.d['id'] = 1
self._check_key('id')
self._check_key('guid')
def test_single_mapping_target_2(self):
self.d['guid'] = 1
self._check_key('id')
self._check_key('guid')
def test_multiple_mapping_target_1(self):
self.d['summary'] = 1
self._check_key('summary')
self._check_key('description')
def test_multiple_mapping_target_2(self):
self.d['subtitle'] = 1
self._check_key('subtitle')
self._check_key('description')
def test_multiple_mapping_mapped_key(self):
self.d['description'] = 1
self._check_key('summary')
self._check_key('description')
def test_license(self):
self.d['links'] = []
try:
self.d['license']
self.assertTrue(False)
except KeyError:
pass
self.d['links'].append({'rel': 'license'})
try:
self.d['license']
self.assertTrue(False)
except KeyError:
pass
self.d['links'].append({'rel': 'license', 'href': 'http://dom.test/'})
self.assertEqual(self.d['license'], 'http://dom.test/')
def test_category(self):
self.d['tags'] = []
try:
self.d['category']
self.assertTrue(False)
except KeyError:
pass
self.d['tags'] = [{}]
try:
self.d['category']
self.assertTrue(False)
except KeyError:
pass
self.d['tags'] = [{'term': 'cat'}]
self.assertEqual(self.d['category'], 'cat')
self.d['tags'].append({'term': 'dog'})
self.assertEqual(self.d['category'], 'cat')
class TestOpenResource(unittest.TestCase):
"Ensure that `_open_resource()` interprets its arguments as URIs, " \
"file-like objects, or in-memory feeds as expected"
def test_fileobj(self):
r = feedparser._open_resource(sys.stdin, '', '', '', '', [], {})
self.assertTrue(r is sys.stdin)
def test_feed(self):
f = feedparser.parse(u'feed://localhost:8097/tests/http/target.xml')
self.assertEqual(f.href, u'http://localhost:8097/tests/http/target.xml')
def test_feed_http(self):
f = feedparser.parse(u'feed:http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.href, u'http://localhost:8097/tests/http/target.xml')
def test_bytes(self):
s = '<feed><item><title>text</title></item></feed>'.encode('utf-8')
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s, r.read())
def test_string(self):
s = '<feed><item><title>text</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
def test_unicode_1(self):
s = u'<feed><item><title>text</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
def test_unicode_2(self):
s = u'<feed><item><title>t\u00e9xt</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
class TestMakeSafeAbsoluteURI(unittest.TestCase):
"Exercise the URI joining and sanitization code"
base = u'http://d.test/d/f.ext'
def _mktest(rel, expect, doc):
def fn(self):
value = feedparser._makeSafeAbsoluteURI(self.base, rel)
self.assertEqual(value, expect)
fn.__doc__ = doc
return fn
# make the test cases; the call signature is:
# (relative_url, expected_return_value, test_doc_string)
test_abs = _mktest(u'https://s.test/', u'https://s.test/', 'absolute uri')
test_rel = _mktest(u'/new', u'http://d.test/new', 'relative uri')
test_bad = _mktest(u'x://bad.test/', u'', 'unacceptable uri protocol')
def test_catch_ValueError(self):
'catch ValueError in Python 2.7 and up'
uri = u'http://bad]test/'
value1 = feedparser._makeSafeAbsoluteURI(uri)
value2 = feedparser._makeSafeAbsoluteURI(self.base, uri)
swap = feedparser.ACCEPTABLE_URI_SCHEMES
feedparser.ACCEPTABLE_URI_SCHEMES = ()
value3 = feedparser._makeSafeAbsoluteURI(self.base, uri)
feedparser.ACCEPTABLE_URI_SCHEMES = swap
# Only Python 2.7 and up throw a ValueError, otherwise uri is returned
self.assertTrue(value1 in (uri, u''))
self.assertTrue(value2 in (uri, u''))
self.assertTrue(value3 in (uri, u''))
class TestConvertToIdn(unittest.TestCase):
"Test IDN support (unavailable in Jython as of Jython 2.5.2)"
# this is the greek test domain
hostname = u'\u03c0\u03b1\u03c1\u03ac\u03b4\u03b5\u03b9\u03b3\u03bc\u03b1'
hostname += u'.\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae'
def test_control(self):
r = feedparser._convert_to_idn(u'http://example.test/')
self.assertEqual(r, u'http://example.test/')
def test_idn(self):
r = feedparser._convert_to_idn(u'http://%s/' % (self.hostname,))
self.assertEqual(r, u'http://xn--hxajbheg2az3al.xn--jxalpdlp/')
def test_port(self):
r = feedparser._convert_to_idn(u'http://%s:8080/' % (self.hostname,))
self.assertEqual(r, u'http://xn--hxajbheg2az3al.xn--jxalpdlp:8080/')
class TestCompression(unittest.TestCase):
"Test the gzip and deflate support in the HTTP code"
def test_gzip_good(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip.gz')
self.assertEqual(f.version, 'atom10')
def test_gzip_not_gzipped(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip-not-gzipped.gz')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, IOError))
def test_gzip_struct_error(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip-struct-error.gz')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, struct.error))
def test_zlib_good(self):
f = feedparser.parse('http://localhost:8097/tests/compression/deflate.z')
self.assertEqual(f.version, 'atom10')
def test_zlib_bad(self):
f = feedparser.parse('http://localhost:8097/tests/compression/deflate-error.z')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, zlib.error))
class TestHTTPStatus(unittest.TestCase):
"Test HTTP redirection and other status codes"
def test_301(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_301.xml')
self.assertEqual(f.status, 301)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_302(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_302.xml')
self.assertEqual(f.status, 302)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_303(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_303.xml')
self.assertEqual(f.status, 303)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_307(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_307.xml')
self.assertEqual(f.status, 307)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_304(self):
# first retrieve the url
u = 'http://localhost:8097/tests/http/http_status_304.xml'
f = feedparser.parse(u)
self.assertEqual(f.status, 200)
self.assertEqual(f.entries[0].title, 'title 304')
# extract the etag and last-modified headers
e = [v for k, v in f.headers.items() if k.lower() == 'etag'][0]
mh = [v for k, v in f.headers.items() if k.lower() == 'last-modified'][0]
ms = f.updated
mt = f.updated_parsed
md = datetime.datetime(*mt[0:7])
self.assertTrue(isinstance(mh, basestring))
self.assertTrue(isinstance(ms, basestring))
self.assertTrue(isinstance(mt, time.struct_time))
self.assertTrue(isinstance(md, datetime.datetime))
# test that sending back the etag results in a 304
f = feedparser.parse(u, etag=e)
self.assertEqual(f.status, 304)
# test that sending back last-modified (string) results in a 304
f = feedparser.parse(u, modified=ms)
self.assertEqual(f.status, 304)
# test that sending back last-modified (9-tuple) results in a 304
f = feedparser.parse(u, modified=mt)
self.assertEqual(f.status, 304)
# test that sending back last-modified (datetime) results in a 304
f = feedparser.parse(u, modified=md)
self.assertEqual(f.status, 304)
def test_404(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_404.xml')
self.assertEqual(f.status, 404)
def test_9001(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_9001.xml')
self.assertEqual(f.bozo, 1)
def test_redirect_to_304(self):
# ensure that an http redirect to an http 304 doesn't
# trigger a bozo_exception
u = 'http://localhost:8097/tests/http/http_redirect_to_304.xml'
f = feedparser.parse(u)
self.assertEqual(f.bozo, 0)
self.assertEqual(f.status, 302)
class TestDateParsers(unittest.TestCase):
"Test the various date parsers; most of the test cases are constructed " \
"dynamically based on the contents of the `date_tests` dict, below"
def test_None(self):
self.assertTrue(feedparser._parse_date(None) is None)
def _check_date(self, func, dtstring, dttuple):
try:
tup = func(dtstring)
except (OverflowError, ValueError):
tup = None
self.assertEqual(tup, dttuple)
self.assertEqual(tup, feedparser._parse_date(dtstring))
def test_year_10000_date(self):
# On some systems this date string will trigger an OverflowError.
# On Jython and x64 systems, however, it's interpreted just fine.
try:
date = feedparser._parse_date_rfc822(u'Sun, 31 Dec 9999 23:59:59 -9999')
except OverflowError:
date = None
self.assertTrue(date in (None, (10000, 1, 5, 4, 38, 59, 2, 5, 0)))
date_tests = {
feedparser._parse_date_greek: (
(u'', None), # empty string
(u'\u039a\u03c5\u03c1, 11 \u0399\u03bf\u03cd\u03bb 2004 12:00:00 EST', (2004, 7, 11, 17, 0, 0, 6, 193, 0)),
),
feedparser._parse_date_hungarian: (
(u'', None), # empty string
(u'2004-j\u00falius-13T9:15-05:00', (2004, 7, 13, 14, 15, 0, 1, 195, 0)),
),
feedparser._parse_date_iso8601: (
(u'', None), # empty string
(u'-0312', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/month only variant
(u'031231', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # 2-digit year/month/day only, no hyphens
(u'03-12-31', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # 2-digit year/month/day only
(u'-03-12', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/month only
(u'03335', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/ordinal, no hyphens
(u'2003-12-31T10:14:55.1234Z', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # fractional seconds
# Special case for Google's extra zero in the month
(u'2003-012-31T10:14:55+00:00', (2003, 12, 31, 10, 14, 55, 2, 365, 0)),
),
feedparser._parse_date_nate: (
(u'', None), # empty string
(u'2004-05-25 \uc624\ud6c4 11:23:17', (2004, 5, 25, 14, 23, 17, 1, 146, 0)),
),
feedparser._parse_date_onblog: (
(u'', None), # empty string
(u'2004\ub144 05\uc6d4 28\uc77c 01:31:15', (2004, 5, 27, 16, 31, 15, 3, 148, 0)),
),
feedparser._parse_date_perforce: (
(u'', None), # empty string
(u'Fri, 2006/09/15 08:19:53 EDT', (2006, 9, 15, 12, 19, 53, 4, 258, 0)),
),
feedparser._parse_date_rfc822: (
(u'', None), # empty string
(u'Thu, 01 Jan 0100 00:00:01 +0100', (99, 12, 31, 23, 0, 1, 3, 365, 0)), # ancient date
(u'Thu, 01 Jan 04 19:48:21 GMT', (2004, 1, 1, 19, 48, 21, 3, 1, 0)), # 2-digit year
(u'Thu, 01 Jan 2004 19:48:21 GMT', (2004, 1, 1, 19, 48, 21, 3, 1, 0)), # 4-digit year
(u'Wed, 19 Aug 2009 18:28:00 Etc/GMT', (2009, 8, 19, 18, 28, 0, 2, 231, 0)), # etc/gmt timezone
(u'Wed, 19 Feb 2012 22:40:00 GMT-01:01', (2012, 2, 19, 23, 41, 0, 6, 50, 0)), # gmt+hh:mm timezone
(u'Mon, 13 Feb, 2012 06:28:00 UTC', (2012, 2, 13, 6, 28, 0, 0, 44, 0)), # extraneous comma
(u'Thu, 01 Jan 2004 00:00 GMT', (2004, 1, 1, 0, 0, 0, 3, 1, 0)), # no seconds
(u'Thu, 01 Jan 2004', (2004, 1, 1, 0, 0, 0, 3, 1, 0)), # no time
# Additional tests to handle Disney's long month names and invalid timezones
(u'Mon, 26 January 2004 16:31:00 AT', (2004, 1, 26, 20, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 ET', (2004, 1, 26, 21, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 CT', (2004, 1, 26, 22, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 MT', (2004, 1, 26, 23, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 PT', (2004, 1, 27, 0, 31, 0, 1, 27, 0)),
),
feedparser._parse_date_asctime: (
(u'Sun Jan 4 16:29:06 2004', (2004, 1, 4, 16, 29, 6, 6, 4, 0)),
),
feedparser._parse_date_w3dtf: (
(u'', None), # empty string
(u'2003-12-31T10:14:55Z', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # UTC
(u'2003-12-31T10:14:55-08:00', (2003, 12, 31, 18, 14, 55, 2, 365, 0)), # San Francisco timezone
(u'2003-12-31T18:14:55+08:00', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # Tokyo timezone
(u'2007-04-23T23:25:47.538+10:00', (2007, 4, 23, 13, 25, 47, 0, 113, 0)), # fractional seconds
(u'2003-12-31', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # year/month/day only
(u'20031231', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # year/month/day only, no hyphens
(u'2003-12', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # year/month only
(u'2003', (2003, 1, 1, 0, 0, 0, 2, 1, 0)), # year only
# MSSQL-style dates
(u'2004-07-08 23:56:58 -00:20', (2004, 7, 9, 0, 16, 58, 4, 191, 0)), # with timezone
(u'2004-07-08 23:56:58', (2004, 7, 8, 23, 56, 58, 3, 190, 0)), # without timezone
(u'2004-07-08 23:56:58.0', (2004, 7, 8, 23, 56, 58, 3, 190, 0)), # with fractional second
# Special cases for out-of-range times
(u'2003-12-31T25:14:55Z', (2004, 1, 1, 1, 14, 55, 3, 1, 0)), # invalid (25 hours)
(u'2003-12-31T10:61:55Z', (2003, 12, 31, 11, 1, 55, 2, 365, 0)), # invalid (61 minutes)
(u'2003-12-31T10:14:61Z', (2003, 12, 31, 10, 15, 1, 2, 365, 0)), # invalid (61 seconds)
# Special cases for rollovers in leap years
(u'2004-02-28T18:14:55-08:00', (2004, 2, 29, 2, 14, 55, 6, 60, 0)), # feb 28 in leap year
(u'2003-02-28T18:14:55-08:00', (2003, 3, 1, 2, 14, 55, 5, 60, 0)), # feb 28 in non-leap year
(u'2000-02-28T18:14:55-08:00', (2000, 2, 29, 2, 14, 55, 1, 60, 0)), # feb 28 in leap year on century divisible by 400
)
}
def make_date_test(f, s, t):
return lambda self: self._check_date(f, s, t)
for func, items in date_tests.iteritems():
for i, (dtstring, dttuple) in enumerate(items):
uniqfunc = make_date_test(func, dtstring, dttuple)
setattr(TestDateParsers, 'test_%s_%02i' % (func.__name__, i), uniqfunc)
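# For instance (illustrative): the second tuple in the `_parse_date_rfc822`
# list above becomes a method named `test__parse_date_rfc822_01` on
# TestDateParsers (the double underscore comes from the function's __name__).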
class TestHTMLGuessing(unittest.TestCase):
"Exercise the HTML sniffing code"
def _mktest(text, expect, doc):
def fn(self):
value = bool(feedparser._FeedParserMixin.lookslikehtml(text))
self.assertEqual(value, expect)
fn.__doc__ = doc
return fn
test_text_1 = _mktest(u'plain text', False, u'plain text')
test_text_2 = _mktest(u'2 < 3', False, u'plain text with angle bracket')
test_html_1 = _mktest(u'<a href="">a</a>', True, u'anchor tag')
test_html_2 = _mktest(u'<i>i</i>', True, u'italics tag')
test_html_3 = _mktest(u'<b>b</b>', True, u'bold tag')
test_html_4 = _mktest(u'<code>', False, u'allowed tag, no end tag')
test_html_5 = _mktest(u'<rss> .. </rss>', False, u'disallowed tag')
test_entity_1 = _mktest(u'AT&T', False, u'corporation name')
test_entity_2 = _mktest(u'&copy;', True, u'named entity reference')
test_entity_3 = _mktest(u'&#169;', True, u'numeric entity reference')
test_entity_4 = _mktest(u'&#xA9;', True, u'hex numeric entity reference')
#---------- additional api unit tests, not backed by files ----------
class TestBuildRequest(unittest.TestCase):
"Test that HTTP request objects are created as expected"
def test_extra_headers(self):
"""You can pass in extra headers and they go into the request object."""
request = feedparser._build_urllib2_request(
'http://example.com/feed',
'agent-name',
None, None, None, None,
{'Cache-Control': 'max-age=0'})
# nb, urllib2 folds the case of the headers
self.assertEqual(
request.get_header('Cache-control'), 'max-age=0')
#---------- parse test files and create test methods ----------
def convert_to_utf8(data):
"Identify data's encoding using its byte order mark" \
"and convert it to its utf-8 equivalent"
if data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
return data.decode('cp037').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32be').encode('utf-8')
elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32le').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32be').encode('utf-8')
elif data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32le').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
return data.decode('utf-16be').encode('utf-8')
elif data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
return data.decode('utf-16le').encode('utf-8')
elif (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
return data[2:].decode('utf-16be').encode('utf-8')
elif (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
return data[2:].decode('utf-16le').encode('utf-8')
elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
return data[3:]
# no byte order mark was found
return data
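# A quick sketch of the behavior (illustrative, not part of the suite): a
# UTF-16LE document carrying a byte order mark is transcoded to UTF-8 and
# the BOM is dropped:
#
#     >>> import codecs
#     >>> convert_to_utf8(codecs.BOM_UTF16_LE + u'<rss/>'.encode('utf-16le'))
#     '<rss/>'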
skip_re = re.compile(_s2bytes(r"SkipUnless:\s*(.*?)\n"))
desc_re = re.compile(_s2bytes(r"Description:\s*(.*?)\s*Expect:\s*(.*)\s*-->"))
def getDescription(xmlfile, data):
"""Extract test data
Each test case is an XML file which contains not only a test feed
but also the description of the test and the condition that we
would expect the parser to create when it parses the feed. Example:
<!--
Description: feed title
Expect: feed['title'] == u'Example feed'
-->
"""
skip_results = skip_re.search(data)
if skip_results:
skipUnless = skip_results.group(1).strip()
else:
skipUnless = '1'
search_results = desc_re.search(data)
if not search_results:
raise RuntimeError("can't parse %s" % xmlfile)
description, evalString = map(lambda s: s.strip(), list(search_results.groups()))
description = xmlfile + ": " + unicode(description, 'utf8')
return description, evalString, skipUnless
def buildTestCase(xmlfile, description, evalString):
func = lambda self, xmlfile=xmlfile, evalString=evalString: \
self.failUnlessEval(xmlfile, evalString)
func.__doc__ = description
return func
def runtests():
"Read the files in the tests/ directory, dynamically add tests to the " \
"TestCases above, spawn the HTTP server, and run the test suite"
if sys.argv[1:]:
allfiles = [f for arg in sys.argv[1:] for f in glob.glob(arg) if f.endswith('.xml')]
sys.argv = [sys.argv[0]] #+ sys.argv[2:]
else:
allfiles = glob.glob(os.path.join('.', 'tests', '**', '**', '*.xml'))
wellformedfiles = glob.glob(os.path.join('.', 'tests', 'wellformed', '**', '*.xml'))
illformedfiles = glob.glob(os.path.join('.', 'tests', 'illformed', '*.xml'))
encodingfiles = glob.glob(os.path.join('.', 'tests', 'encoding', '*.xml'))
entitiesfiles = glob.glob(os.path.join('.', 'tests', 'entities', '*.xml'))
microformatfiles = glob.glob(os.path.join('.', 'tests', 'microformats', '**', '*.xml'))
httpd = None
# there are several compression test cases that must be accounted for
# as well as a number of http status tests that redirect to a target
# and a few `_open_resource`-related tests
httpcount = 5 + 17 + 2
httpcount += len([f for f in allfiles if 'http' in f])
httpcount += len([f for f in wellformedfiles if 'http' in f])
httpcount += len([f for f in illformedfiles if 'http' in f])
httpcount += len([f for f in encodingfiles if 'http' in f])
try:
for c, xmlfile in enumerate(allfiles + encodingfiles + illformedfiles + entitiesfiles):
addTo = TestCase
if xmlfile in encodingfiles:
addTo = TestEncodings
elif xmlfile in entitiesfiles:
addTo = (TestStrictParser, TestLooseParser)
elif xmlfile in microformatfiles:
addTo = TestMicroformats
elif xmlfile in wellformedfiles:
addTo = (TestStrictParser, TestLooseParser)
data = open(xmlfile, 'rb').read()
if 'encoding' in xmlfile:
data = convert_to_utf8(data)
if data is None:
# convert_to_utf8 found a byte order mark for utf_32
# but it's not supported in this installation of Python
if 'http' in xmlfile:
httpcount -= 1 + (xmlfile in wellformedfiles)
continue
description, evalString, skipUnless = getDescription(xmlfile, data)
testName = 'test_%06d' % c
ishttp = 'http' in xmlfile
try:
if not eval(skipUnless): raise NotImplementedError
except (ImportError, LookupError, NotImplementedError, AttributeError):
if ishttp:
httpcount -= 1 + (xmlfile in wellformedfiles)
continue
if ishttp:
xmlfile = 'http://%s:%s/%s' % (_HOST, _PORT, posixpath.normpath(xmlfile.replace('\\', '/')))
testFunc = buildTestCase(xmlfile, description, evalString)
if isinstance(addTo, tuple):
setattr(addTo[0], testName, testFunc)
setattr(addTo[1], testName, testFunc)
else:
setattr(addTo, testName, testFunc)
if feedparser.TIDY_MARKUP and feedparser._mxtidy:
sys.stderr.write('\nWarning: feedparser.TIDY_MARKUP invalidates tests, turning it off temporarily\n\n')
feedparser.TIDY_MARKUP = 0
if httpcount:
httpd = FeedParserTestServer(httpcount)
httpd.daemon = True
httpd.start()
httpd.ready.wait()
testsuite = unittest.TestSuite()
testloader = unittest.TestLoader()
testsuite.addTest(testloader.loadTestsFromTestCase(TestCase))
testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings))
testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))
testsuite.addTest(testloader.loadTestsFromTestCase(TestCompression))
testsuite.addTest(testloader.loadTestsFromTestCase(TestConvertToIdn))
testsuite.addTest(testloader.loadTestsFromTestCase(TestMicroformats))
testsuite.addTest(testloader.loadTestsFromTestCase(TestOpenResource))
testsuite.addTest(testloader.loadTestsFromTestCase(TestFeedParserDict))
testsuite.addTest(testloader.loadTestsFromTestCase(TestMakeSafeAbsoluteURI))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEverythingIsUnicode))
testsuite.addTest(testloader.loadTestsFromTestCase(TestTemporaryFallbackBehavior))
testresults = unittest.TextTestRunner(verbosity=1).run(testsuite)
# Return 0 if successful, 1 if there was a failure
sys.exit(not testresults.wasSuccessful())
finally:
if httpd:
if httpd.requests:
# Should never get here unless something went horribly wrong, like the
# user hitting Ctrl-C. Tell our HTTP server that it's done, then do
# one more request to flush it. This rarely works; the combination of
# threading, self-terminating HTTP servers, and unittest is really
# quite flaky. Just what you want in a testing framework, no?
httpd.requests = 0
if httpd.ready:
urllib.urlopen('http://127.0.0.1:8097/tests/wellformed/rss/aaa_wellformed.xml').read()
httpd.join(0)
if __name__ == "__main__":
runtests()

lib/feedparser/sgmllib3.py Normal file

@ -0,0 +1,547 @@
"""A parser for SGML, using the derived class as a static DTD."""
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
import _markupbase
import re
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing
interesting = re.compile('[&<]')
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
'<([a-zA-Z][^<>]*|'
'/([a-zA-Z][^<>]*)?|'
'![^<>]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
"""Exception raised for all parse errors."""
pass
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
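#
# A minimal sketch of a derived class (illustrative only; `BoldMarker` is a
# hypothetical example, not part of this module):
#
#     class BoldMarker(SGMLParser):
#         def start_b(self, attrs):       # called for <b>
#             self.handle_data('*')
#         def end_b(self):                # called for </b>
#             self.handle_data('*')
#         def handle_data(self, data):
#             print(data, end='')
#
#     p = BoldMarker()
#     p.feed('hello <b>world</b>')
#     p.close()                           # prints: hello *world*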
class SGMLParser(_markupbase.ParserBase):
# Definition of entities -- derived classes may override
entity_or_charref = re.compile('&(?:'
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
')(;?)')
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
self.verbose = verbose
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.__starttag_text = None
self.rawdata = ''
self.stack = []
self.lasttag = '???'
self.nomoretags = 0
self.literal = 0
_markupbase.ParserBase.reset(self)
def setnomoretags(self):
"""Enter literal mode (CDATA) till EOF.
Intended for derived classes only.
"""
self.nomoretags = self.literal = 1
def setliteral(self, *args):
"""Enter literal mode (CDATA).
Intended for derived classes only.
"""
self.literal = 1
def feed(self, data):
"""Feed some data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n'). (This just saves the text,
all the processing is done by goahead().)
"""
self.rawdata = self.rawdata + data
self.goahead(0)
def close(self):
"""Handle the remaining data."""
self.goahead(1)
def error(self, message):
raise SGMLParseError(message)
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
break
match = interesting.search(rawdata, i)
if match: j = match.start()
else: j = n
if i < j:
self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
if starttagopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_starttag(i)
if k < 0: break
i = k
continue
if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: break
i = k
self.literal = 0
continue
if self.literal:
if n > (i + 1):
self.handle_data("<")
i = i+1
else:
# incomplete
break
continue
if rawdata.startswith("<!--", i):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: break
i = k
continue
if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: break
i = i+k
continue
if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
k = self.parse_declaration(i)
if k < 0: break
i = k
continue
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
else:
self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
if not match:
self.handle_data(rawdata[i])
i = i+1
continue
j = match.end(0)
if j == n:
break # Really incomplete
self.handle_data(rawdata[i:j])
i = j
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
i = n
self.rawdata = rawdata[i:]
# XXX if end: check for empty stack
# Extensions for the DOCTYPE scanner:
_decl_otherchars = '='
# Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
if rawdata[i:i+2] != '<?':
self.error('unexpected call to parse_pi()')
match = piclose.search(rawdata, i+2)
if not match:
return -1
j = match.start(0)
self.handle_pi(rawdata[i+2: j])
j = match.end(0)
return j-i
def get_starttag_text(self):
return self.__starttag_text
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
start_pos = i
rawdata = self.rawdata
if shorttagopen.match(rawdata, i):
# SGML shorthand: <tag/data/ == <tag>data</tag>
# XXX Can data contain &... (entity or char refs)?
# XXX Can data contain < or > (tag characters)?
# XXX Can there be whitespace before the first /?
match = shorttag.match(rawdata, i)
if not match:
return -1
tag, data = match.group(1, 2)
self.__starttag_text = '<%s/' % tag
tag = tag.lower()
k = match.end(0)
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
# XXX The following should skip matching quotes (' or ")
# As a shortcut way to exit, this isn't so bad, but shouldn't
# be used to locate the actual end of the start tag since the
# < or > characters may be embedded in an attribute value.
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
# SGML shorthand: <> == <last open tag seen>
k = j
tag = self.lasttag
else:
match = tagfind.match(rawdata, i+1)
if not match:
self.error('unexpected call to parse_starttag')
k = match.end(0)
tag = rawdata[i+1:k].lower()
self.lasttag = tag
while k < j:
match = attrfind.match(rawdata, k)
if not match: break
attrname, rest, attrvalue = match.group(1, 2, 3)
if not rest:
attrvalue = attrname
else:
if (attrvalue[:1] == "'" == attrvalue[-1:] or
attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
attrvalue = self.entity_or_charref.sub(
self._convert_ref, attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
j = j+1
self.__starttag_text = rawdata[start_pos:j]
self.finish_starttag(tag, attrs)
return j
# Internal -- convert entity or character reference
def _convert_ref(self, match):
if match.group(2):
return self.convert_charref(match.group(2)) or \
'&#%s%s' % match.groups()[1:]
elif match.group(3):
return self.convert_entityref(match.group(1)) or \
'&%s;' % match.group(1)
else:
return '&%s' % match.group(1)
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
self.finish_endtag(tag)
return j
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
self.finish_starttag(tag, [])
self.handle_data(data)
self.finish_endtag(tag)
# Internal -- finish processing of start tag
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
self.unknown_starttag(tag, attrs)
return -1
else:
self.handle_starttag(tag, method, attrs)
return 0
else:
self.stack.append(tag)
self.handle_starttag(tag, method, attrs)
return 1
# Internal -- finish processing of end tag
def finish_endtag(self, tag):
if not tag:
found = len(self.stack) - 1
if found < 0:
self.unknown_endtag(tag)
return
else:
if tag not in self.stack:
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
self.unknown_endtag(tag)
else:
self.report_unbalanced(tag)
return
found = len(self.stack)
for i in range(found):
if self.stack[i] == tag: found = i
while len(self.stack) > found:
tag = self.stack[-1]
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
method = None
if method:
self.handle_endtag(tag, method)
else:
self.unknown_endtag(tag)
del self.stack[-1]
# Overridable -- handle start tag
def handle_starttag(self, tag, method, attrs):
method(attrs)
# Overridable -- handle end tag
def handle_endtag(self, tag, method):
method()
# Example -- report an unbalanced </...> tag.
def report_unbalanced(self, tag):
if self.verbose:
print('*** Unbalanced </' + tag + '>')
print('*** Stack:', self.stack)
def convert_charref(self, name):
"""Convert character reference, may be overridden."""
try:
n = int(name)
except ValueError:
return
if not 0 <= n <= 127:
return
return self.convert_codepoint(n)
def convert_codepoint(self, codepoint):
return chr(codepoint)
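# e.g. handle_charref('65') resolves &#65; to 'A': convert_charref checks
# that 65 is in the ASCII range and convert_codepoint returns chr(65).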
def handle_charref(self, name):
"""Handle character reference, no need to override."""
replacement = self.convert_charref(name)
if replacement is None:
self.unknown_charref(name)
else:
self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
def convert_entityref(self, name):
"""Convert entity references.
As an alternative to overriding this method, one can tailor the
results by setting up the self.entitydefs mapping appropriately.
"""
table = self.entitydefs
if name in table:
return table[name]
else:
return
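# As a sketch (illustrative only), a subclass could extend the table
# instead of overriding convert_entityref:
#
#     class MyParser(SGMLParser):
#         entitydefs = dict(SGMLParser.entitydefs, nbsp='\xa0')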
def handle_entityref(self, name):
"""Handle entity references, no need to override."""
replacement = self.convert_entityref(name)
if replacement is None:
self.unknown_entityref(name)
else:
self.handle_data(replacement)
# Example -- handle data, should be overridden
def handle_data(self, data):
pass
# Example -- handle comment, could be overridden
def handle_comment(self, data):
pass
# Example -- handle declaration, could be overridden
def handle_decl(self, decl):
pass
# Example -- handle processing instruction, could be overridden
def handle_pi(self, data):
pass
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs): pass
def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
def __init__(self, verbose=0):
self.testdata = ""
SGMLParser.__init__(self, verbose)
def handle_data(self, data):
self.testdata = self.testdata + data
if len(repr(self.testdata)) >= 70:
self.flush()
def flush(self):
data = self.testdata
if data:
self.testdata = ""
print('data:', repr(data))
def handle_comment(self, data):
self.flush()
r = repr(data)
if len(r) > 68:
r = r[:32] + '...' + r[-32:]
print('comment:', r)
def unknown_starttag(self, tag, attrs):
self.flush()
if not attrs:
print('start tag: <' + tag + '>')
else:
print('start tag: <' + tag, end=' ')
for name, value in attrs:
print(name + '=' + '"' + value + '"', end=' ')
print('>')
def unknown_endtag(self, tag):
self.flush()
print('end tag: </' + tag + '>')
def unknown_entityref(self, ref):
self.flush()
print('*** unknown entity ref: &' + ref + ';')
def unknown_charref(self, ref):
self.flush()
print('*** unknown char ref: &#' + ref + ';')
def unknown_decl(self, data):
self.flush()
print('*** unknown decl: [' + data + ']')
def close(self):
SGMLParser.close(self)
self.flush()
def test(args = None):
import sys
if args is None:
args = sys.argv[1:]
if args and args[0] == '-s':
args = args[1:]
klass = SGMLParser
else:
klass = TestSGMLParser
if args:
file = args[0]
else:
file = 'test.html'
if file == '-':
f = sys.stdin
else:
try:
f = open(file, 'r')
except IOError as msg:
print(file, ":", msg)
sys.exit(1)
data = f.read()
if f is not sys.stdin:
f.close()
x = klass()
for c in data:
x.feed(c)
x.close()
if __name__ == '__main__':
test()

@ -0,0 +1 @@
error

Binary file not shown.

@ -0,0 +1 @@
error

Binary file not shown.

Binary file not shown.

@ -0,0 +1 @@
<feed xmlns="http://www.w3.org/2005/Atom"></feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="big5"?>
<!--
SkipUnless: __import__('codecs').lookup('big5')
Description: big5
Expect: not bozo and encoding == 'big5'
-->
<rss>
</rss>

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="bogus"?>
<!--
Description: bogus encoding
Expect: bozo
-->
<rss>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: utf-8 interpreted as iso-8859-1 and re-encoded as utf-8
Expect: bozo and ord(entries[0]['description']) == 8230
-->
<rss version="2.0">
<channel>
<item>
<description>&acirc;&#128;&brvbar;</description>
</item>
</channel>
</rss>

@ -0,0 +1,10 @@
<!--
SkipUnless: __import__('sys').version.split()[0] >= '2.2.0'
Description: crashes
Expect: 1
-->
<rss>
<item>
<description><![CDATA[<a href="http://www.example.com/">¤</a><a href="&#38;"></a>]]></description>
</item>
</rss>

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Note: text/xml defaults to us-ascii, in conflict with the XML declaration of utf-8
Header: Content-type: text/xml
Description: Content-type with no charset (text/xml defaults to us-ascii)
Expect: bozo and isinstance(bozo_exception, feedparser.CharacterEncodingOverride)
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>Iñtërnâtiônàlizætiøn</title>
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/plain
Description: text/plain + no encoding
Expect: bozo
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/plain; charset=utf-8
Description: text/plain + charset
Expect: bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,10 @@
<!--
Description: Ensure when there are invalid bytes in encoding specified by BOM, feedparser doesn't crash
Expect: bozo and not encoding
-->
<rss version="2.0">
<channel>
<title>Valid UTF8: ѨInvalid UTF8: España</title>
<description><pre class="screen"></pre></description>
</channel>
</rss

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Description: unguessable characters
Expect: bozo and entries[0].summary == u'\xe2\u20ac\u2122\xe2\u20ac\x9d\u0160'
-->
<rss version="2.0">
<channel>
<item>
<description><![CDATA[ ’<>â€<C3A2>© ]]></description>
</item>
</channel>
</rss>

Binary file not shown.

Binary file not shown.

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don\u2019t'
-->
<rss version="2.0">
<channel>
<item>
<description>don’t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don\u2019t'
-->
<rss version="2.0">
<channel>
<item>
<description>don&#146;t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don&#x2019;t'
-->
<rss version="2.0">
<channel>
<item>
<description>don&amp;#146;t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: utf-8 interpreted as iso-8859-1 and re-encoded as utf-8
Expect: not bozo and ord(entries[0]['description']) == 8230
-->
<rss version="2.0">
<channel>
<item>
<description>&#226;&#128;&#166;</description>
</item>
</channel>
</rss>

@ -0,0 +1,9 @@
<!--
Description: crashes
Expect: 1
-->
<rss>
<item>
<description><![CDATA[<img alt="&#169;" />]]></description>
</item>
</rss>

@ -0,0 +1,9 @@
<!--
Description: crashes
Expect: 1
-->
<rss>
<item>
<description>&lt;a href=&quot;http://example.com&quot;&gt;&lt;img src=&quot;http://example.com/logo.gif&quot; alt=&quot;The image &amp;acirc;&amp;#128;&amp;#156;http://example.com/logo.gif&amp;acirc;&amp;#128;&amp;#65533; cannot be displayed, because it contains errors.&quot;&gt;&lt;/a&gt;&lt;br&gt;</description>
</item>
</rss>

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr character in attribute of embedded HTML
Expect: not bozo and entries[0]['description'] == u'<img alt="\ub144" />'
-->
<rss version="2.0">
<channel>
<item>
<description>&lt;img alt="³â" /></description>
</item>
</channel>
</rss>

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr encoding in item description
Expect: not bozo and entries[0]['description'] == u'\ub144'
-->
<rss version="2.0">
<channel>
<item>
<description>³â</description>
</item>
</channel>
</rss>

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr encoding
Expect: not bozo and feed['title'] == u'\ub144'
-->
<rss version="2.0">
<channel>
<title>³â</title>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/atom+xml;charset='us-ascii'
Description: application/atom+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/atom+xml; charset='us-ascii'
Description: application/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/atom+xml
Description: application/atom+xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/atom+xml
Description: application/atom+xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml;charset='gb2312'
Description: application/atom+xml + explicit charset
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml; charset='gb2312'
Description: application/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="gb2312"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml
Description: application/atom+xml + explicit encoding
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/rss+xml;charset= 'us-ascii'
Description: application/rss+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/rss+xml;charset= "us-ascii"
Description: application/rss+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/rss+xml
Description: application/rss+xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/rss+xml
Description: application/rss+xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml;charset= "us-ascii"
Description: application/xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml;charset = us-ascii
Description: application/xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml
Description: application/xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-dtd; charset="us-ascii"
Description: application/xml-dtd + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-dtd; charset="us-ascii"
Description: application/xml-dtd + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-dtd
Description: application/xml-dtd + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-dtd
Description: application/xml-dtd + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml
Description: application/xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-external-parsed-entity; charset="us-ascii"
Description: application/xml-external-parsed-entity + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-external-parsed-entity;charset=us-ascii
Description: application/xml-external-parsed-entity + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-external-parsed-entity
Description: application/xml-external-parsed-entity + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-external-parsed-entity
Description: application/xml-parsed-entity + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,13 @@
<?xml version='1.0' encoding='UTF-8'?>
<!--
Header: Content-type: application/atom+xml
Description: crashes while resolving relative URIs when content contains attributes which contain (valid) non-ASCII characters
Expect: not bozo
-->
<feed xmlns='http://www.w3.org/2005/Atom'>
<entry>
<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>
<img alt="Browser market shares at ongoing" />
</div></content>
</entry>
</feed>

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Header: Content-type: application/xml
Description: application/xml with no charset (control for tests/illformed/encoding/http_i18n.xml)
Expect: not bozo
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>Iñtërnâtiônàlizætiøn</title>
<link rel='alternate' type='text/html' href='http://example.com/'/>
<modified>2004-06-02T19:07:55-04:00</modified>
<tagline>If your parser thinks this is well-formed, it's right.</tagline>
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/atom+xml;charset='us-ascii'
Description: text/atom+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/atom+xml; charset='us-ascii'
Description: text/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/atom+xml
Description: text/atom+xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/atom+xml
Description: text/atom+xml + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/rss+xml;charset= 'us-ascii'
Description: text/rss+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/rss+xml;charset= "us-ascii"
Description: text/rss+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/rss+xml
Description: text/rss+xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/rss+xml
Description: text/rss+xml + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml;
Description: text/xml + bogus charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml; charset:iso-8859-1
Description: text/xml + bogus parameter
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml;charset= "us-ascii"
Description: text/xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,16 @@
<!--
SkipUnless: __import__('codecs').lookup('windows-1252')
Header: Content-type: text/xml; charset=windows-1252
Description: text/xml + explicit charset (this one is harder than the others)
Expect: not bozo and entries[0]['description'] == u'This is a \xa3\u201ctest.\u201d'
-->
<rss version="2.0">
<channel>
<item>
<title>Foo</title>
<link>http://purl.org/rss/2.0/?item</link>
<description>This is a £“test.”</description>
</item>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml;charset = us-ascii
Description: text/xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,17 @@
<?xml version='1.0' encoding='iso-8859-1'?>
<!--
SkipUnless: __import__('codecs').lookup('windows-1252')
Header: Content-type: text/xml; charset=windows-1252
Description: text/xml + charset overrides encoding (this one is harder than the others)
Expect: not bozo and entries[0]['description'] == u'This is a \xa3\u201ctest.\u201d'
-->
<rss version="2.0">
<channel>
<item>
<title>Foo</title>
<link>http://purl.org/rss/2.0/?item</link>
<description>This is a £“test.”</description>
</item>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml
Description: text/xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml-external-parsed-entity; charset="us-ascii"
Description: text/xml-external-parsed-entity + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml-external-parsed-entity;charset=us-ascii
Description: text/xml-external-parsed-entity + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml-external-parsed-entity
Description: text/xml-external-parsed-entity + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml-external-parsed-entity
Description: text/xml-external-parsed-entity + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml; qs=0.9
Description: text/xml + qs value
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

Binary file not shown.

Binary file not shown.

@ -0,0 +1,7 @@
<?xml version="1.0"?>
<!--
Description: no content-type and no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Description: no content-type + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

Binary file not shown.

Binary file not shown.
