fixed missing libs

evilhero 2012-09-09 17:59:46 -04:00
parent 27011e950f
commit 633c8617e4
2361 changed files with 38285 additions and 0 deletions

lib/bs4/__init__.py Normal file

@@ -0,0 +1,355 @@
"""Beautiful Soup
Elixir and Tonic
"The Screen-Scraper's Friend"
http://www.crummy.com/software/BeautifulSoup/
Beautiful Soup uses a pluggable XML or HTML parser to parse a
(possibly invalid) document into a tree representation. Beautiful Soup
provides methods and Pythonic idioms that make it easy to
navigate, search, and modify the parse tree.
Beautiful Soup works with Python 2.6 and up. It works better if lxml
and/or html5lib is installed.
For more than you ever wanted to know about Beautiful Soup, see the
documentation:
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
"""
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.1.1"
__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
import re
import warnings
from .builder import builder_registry
from .dammit import UnicodeDammit
from .element import (
CData,
Comment,
DEFAULT_OUTPUT_ENCODING,
Declaration,
Doctype,
NavigableString,
PageElement,
ProcessingInstruction,
ResultSet,
SoupStrainer,
Tag,
)
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
This class defines the basic interface called by the tree builders.
These methods will be called by the parser:
reset()
feed(markup)
The tree builder may call these methods from its feed() implementation:
handle_starttag(name, attrs) # See note about return value
handle_endtag(name)
handle_data(data) # Appends to the current data node
endData(containerClass=NavigableString) # Ends the current data node
No matter how complicated the underlying parser is, you should be
able to build a tree using 'start tag' events, 'end tag' events,
'data' events, and "done with data" events.
If you encounter an empty-element tag (aka a self-closing tag,
like HTML's <br> tag), call handle_starttag and then
handle_endtag.
"""
ROOT_TAG_NAME = u'[document]'
# If the end-user gives no indication which tree builder they
# want, look for one with these features.
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
# Used when determining whether a text node is all whitespace and
# can be replaced with a single space. A text node that contains
# fancy Unicode spaces (usually non-breaking) should be left
# alone.
STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
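# For example, u'  \n  '.translate(STRIP_ASCII_SPACES) == u'' (all ASCII
# whitespace), while u'\xa0'.translate(STRIP_ASCII_SPACES) == u'\xa0'
# (a non-breaking space survives, so the node is left alone).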
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, **kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
if 'convertEntities' in kwargs:
del kwargs['convertEntities']
warnings.warn(
"BS4 does not respect the convertEntities argument to the "
"BeautifulSoup constructor. Entities are always converted "
"to Unicode characters.")
if 'markupMassage' in kwargs:
del kwargs['markupMassage']
warnings.warn(
"BS4 does not respect the markupMassage argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for any necessary markup massage.")
if 'smartQuotesTo' in kwargs:
del kwargs['smartQuotesTo']
warnings.warn(
"BS4 does not respect the smartQuotesTo argument to the "
"BeautifulSoup constructor. Smart quotes are always converted "
"to Unicode characters.")
if 'selfClosingTags' in kwargs:
del kwargs['selfClosingTags']
warnings.warn(
"BS4 does not respect the selfClosingTags argument to the "
"BeautifulSoup constructor. The tree builder is responsible "
"for understanding self-closing tags.")
if 'isHTML' in kwargs:
del kwargs['isHTML']
warnings.warn(
"BS4 does not respect the isHTML argument to the "
"BeautifulSoup constructor. You can pass in features='html' "
"or features='xml' to get a builder capable of handling "
"one or the other.")
def deprecated_argument(old_name, new_name):
if old_name in kwargs:
warnings.warn(
'The "%s" argument to the BeautifulSoup constructor '
'has been renamed to "%s."' % (old_name, new_name))
value = kwargs[old_name]
del kwargs[old_name]
return value
return None
parse_only = parse_only or deprecated_argument(
"parseOnlyThese", "parse_only")
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
if len(kwargs) > 0:
arg = kwargs.keys().pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
if isinstance(features, basestring):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
builder_class = builder_registry.lookup(*features)
if builder_class is None:
raise ValueError(
"Couldn't find a tree builder with the features you "
"requested: %s. Do you need to install a parser library?"
% ",".join(features))
builder = builder_class()
self.builder = builder
self.is_xml = builder.is_xml
self.builder.soup = self
self.parse_only = parse_only
self.reset()
if hasattr(markup, 'read'): # It's a file-type object.
markup = markup.read()
(self.markup, self.original_encoding, self.declared_html_encoding,
self.contains_replacement_characters) = (
self.builder.prepare_markup(markup, from_encoding))
try:
self._feed()
except StopParsing:
pass
# Clear out the markup and remove the builder's circular
# reference to this object.
self.markup = None
self.builder.soup = None
def _feed(self):
# Convert the document to Unicode.
self.builder.reset()
self.builder.feed(self.markup)
# Close out any unfinished strings and close all the open tags.
self.endData()
while self.currentTag.name != self.ROOT_TAG_NAME:
self.popTag()
def reset(self):
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
self.hidden = 1
self.builder.reset()
self.currentData = []
self.currentTag = None
self.tagStack = []
self.pushTag(self)
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
"""Create a new tag associated with this soup."""
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
def new_string(self, s):
"""Create a new NavigableString associated with this soup."""
navigable = NavigableString(s)
navigable.setup()
return navigable
def insert_before(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_before().")
def insert_after(self, successor):
raise ValueError("BeautifulSoup objects don't support insert_after().")
def popTag(self):
tag = self.tagStack.pop()
#print "Pop", tag.name
if self.tagStack:
self.currentTag = self.tagStack[-1]
return self.currentTag
def pushTag(self, tag):
#print "Push", tag.name
if self.currentTag:
self.currentTag.contents.append(tag)
self.tagStack.append(tag)
self.currentTag = self.tagStack[-1]
def endData(self, containerClass=NavigableString):
if self.currentData:
currentData = u''.join(self.currentData)
if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
not set([tag.name for tag in self.tagStack]).intersection(
self.builder.preserve_whitespace_tags)):
if '\n' in currentData:
currentData = '\n'
else:
currentData = ' '
self.currentData = []
if self.parse_only and len(self.tagStack) <= 1 and \
(not self.parse_only.text or \
not self.parse_only.search(currentData)):
return
o = containerClass(currentData)
self.object_was_parsed(o)
def object_was_parsed(self, o):
"""Add an object to the parse tree."""
o.setup(self.currentTag, self.previous_element)
if self.previous_element:
self.previous_element.next_element = o
self.previous_element = o
self.currentTag.contents.append(o)
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
"""Pops the tag stack up to and including the most recent
instance of the given tag. If inclusivePop is false, pops the tag
stack up to but *not* including the most recent instance of
the given tag."""
#print "Popping to %s" % name
if name == self.ROOT_TAG_NAME:
return
numPops = 0
mostRecentTag = None
for i in range(len(self.tagStack) - 1, 0, -1):
if (name == self.tagStack[i].name
and nsprefix == self.tagStack[i].prefix):
numPops = len(self.tagStack) - i
break
if not inclusivePop:
numPops = numPops - 1
for i in range(0, numPops):
mostRecentTag = self.popTag()
return mostRecentTag
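# Illustrative trace of the stack discipline above: parsing "<a><b></a>"
# pushes [document], <a>, <b>; handle_endtag('a') calls _popToTag('a'),
# which pops <b> and then <a>, implicitly closing the unclosed <b>.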
def handle_starttag(self, name, namespace, nsprefix, attrs):
"""Push a start tag on to the stack.
If this method returns None, the tag was rejected by the
SoupStrainer. You should proceed as if the tag had not occurred
in the document. For instance, if this was a self-closing tag,
don't call handle_endtag.
"""
# print "Start tag %s: %s" % (name, attrs)
self.endData()
if (self.parse_only and len(self.tagStack) <= 1
and (self.parse_only.text
or not self.parse_only.search_tag(name, attrs))):
return None
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
self.currentTag, self.previous_element)
if tag is None:
return tag
if self.previous_element:
self.previous_element.next_element = tag
self.previous_element = tag
self.pushTag(tag)
return tag
def handle_endtag(self, name, nsprefix=None):
#print "End tag: " + name
self.endData()
self._popToTag(name, nsprefix)
def handle_data(self, data):
self.currentData.append(data)
def decode(self, pretty_print=False,
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
formatter="minimal"):
"""Returns a string or Unicode representation of this document.
To get Unicode, pass None for encoding."""
if self.is_xml:
# Print the XML declaration
encoding_part = ''
if eventual_encoding != None:
encoding_part = ' encoding="%s"' % eventual_encoding
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
else:
prefix = u''
if not pretty_print:
indent_level = None
else:
indent_level = 0
return prefix + super(BeautifulSoup, self).decode(
indent_level, eventual_encoding, formatter)
class BeautifulStoneSoup(BeautifulSoup):
"""Deprecated interface to an XML parser."""
def __init__(self, *args, **kwargs):
kwargs['features'] = 'xml'
warnings.warn(
'The BeautifulStoneSoup class is deprecated. Instead of using '
'it, pass features="xml" into the BeautifulSoup constructor.')
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
pass
#By default, act as an HTML pretty-printer.
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
print soup.prettify()

lib/bs4/builder/__init__.py Normal file

@@ -0,0 +1,316 @@
from collections import defaultdict
import itertools
import sys
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
whitespace_re
)
__all__ = [
'HTMLTreeBuilder',
'SAXTreeBuilder',
'TreeBuilder',
'TreeBuilderRegistry',
]
# Some useful features for a TreeBuilder to have.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
class TreeBuilderRegistry(object):
def __init__(self):
self.builders_for_feature = defaultdict(list)
self.builders = []
def register(self, treebuilder_class):
"""Register a treebuilder based on its advertised features."""
for feature in treebuilder_class.features:
self.builders_for_feature[feature].insert(0, treebuilder_class)
self.builders.insert(0, treebuilder_class)
def lookup(self, *features):
if len(self.builders) == 0:
# There are no builders at all.
return None
if len(features) == 0:
# They didn't ask for any features. Give them the most
# recently registered builder.
return self.builders[0]
# Go down the list of features in order, and eliminate any builders
# that don't match every feature.
features = list(features)
features.reverse()
candidates = None
candidate_set = None
while len(features) > 0:
feature = features.pop()
we_have_the_feature = self.builders_for_feature.get(feature, [])
if len(we_have_the_feature) > 0:
if candidates is None:
candidates = we_have_the_feature
candidate_set = set(candidates)
else:
# Eliminate any candidates that don't have this feature.
candidate_set = candidate_set.intersection(
set(we_have_the_feature))
# The only valid candidates are the ones in candidate_set.
# Go through the original list of candidates and pick the first one
# that's in candidate_set.
if candidate_set is None:
return None
for candidate in candidates:
if candidate in candidate_set:
return candidate
return None
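# Sketch of the lookup rules above (builder names here are hypothetical):
#
#   >>> registry = TreeBuilderRegistry()
#   >>> registry.register(SomeHTMLBuilder)   # features = ['html', 'fast']
#   >>> registry.lookup('html') is SomeHTMLBuilder
#   True
#   >>> print registry.lookup('xml')         # no builder advertises 'xml'
#   None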
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
builder_registry = TreeBuilderRegistry()
class TreeBuilder(object):
"""Turn a document into a Beautiful Soup object tree."""
features = []
is_xml = False
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.
# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
def __init__(self):
self.soup = None
def reset(self):
pass
def can_be_empty_element(self, tag_name):
"""Might a tag with this name be an empty-element tag?
The final markup may or may not actually present this tag as
self-closing.
For instance: an HTMLTreeBuilder does not consider a <p> tag to be
an empty-element tag (it's not in
HTMLTreeBuilder.empty_element_tags). This means an empty <p> tag
will be presented as "<p></p>", not "<p />".
The default implementation has no opinion about which tags are
empty-element tags, so a tag will be presented as an
empty-element tag if and only if it has no contents.
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
be left alone.
"""
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags
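# Example: with HTMLTreeBuilder (below), can_be_empty_element('br') is
# True and can_be_empty_element('p') is False; with this base class,
# empty_element_tags is None and every tag name returns True.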
def feed(self, markup):
raise NotImplementedError()
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
return markup, None, None, False
def test_fragment_to_document(self, fragment):
"""Wrap an HTML fragment to make it look like a document.
Different parsers do this differently. For instance, lxml
introduces an empty <head> tag, and html5lib
doesn't. Abstracting this away lets us write simple tests
which run HTML fragments through the parser and compare the
results against other HTML fragments.
This method should not be used outside of tests.
"""
return fragment
def set_up_substitutions(self, tag):
return False
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
"""Replaces class="foo bar" with class=["foo", "bar"]
Modifies its input in place.
"""
if self.cdata_list_attributes:
universal = self.cdata_list_attributes.get('*', [])
tag_specific = self.cdata_list_attributes.get(
tag_name.lower(), [])
for cdata_list_attr in itertools.chain(universal, tag_specific):
if cdata_list_attr in dict(attrs):
# Basically, we have a "class" attribute whose
# value is a whitespace-separated list of CSS
# classes. Split it into a list.
value = attrs[cdata_list_attr]
if isinstance(value, basestring):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice
# for the same tag when rearranging the parse
# tree. On the second call the attribute value
# here is already a list. If this happens,
# leave the value alone rather than trying to
# split it again.
values = value
attrs[cdata_list_attr] = values
return attrs
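# Example: for tag_name='p' and attrs={'class': 'foo bar', 'id': 'x'},
# an HTMLTreeBuilder returns {'class': ['foo', 'bar'], 'id': 'x'},
# because 'class' is listed under '*' in its cdata_list_attributes.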
class SAXTreeBuilder(TreeBuilder):
"""A Beautiful Soup treebuilder that listens for SAX events."""
def feed(self, markup):
raise NotImplementedError()
def close(self):
pass
def startElement(self, name, attrs):
attrs = dict((key[1], value) for key, value in list(attrs.items()))
#print "Start %s, %r" % (name, attrs)
self.soup.handle_starttag(name, attrs)
def endElement(self, name):
#print "End %s" % name
self.soup.handle_endtag(name)
def startElementNS(self, nsTuple, nodeName, attrs):
# Throw away (ns, nodeName) for now.
self.startElement(nodeName, attrs)
def endElementNS(self, nsTuple, nodeName):
# Throw away (ns, nodeName) for now.
self.endElement(nodeName)
#handler.endElementNS((ns, node.nodeName), node.nodeName)
def startPrefixMapping(self, prefix, nodeValue):
# Ignore the prefix for now.
pass
def endPrefixMapping(self, prefix):
# Ignore the prefix for now.
# handler.endPrefixMapping(prefix)
pass
def characters(self, content):
self.soup.handle_data(content)
def startDocument(self):
pass
def endDocument(self):
pass
class HTMLTreeBuilder(TreeBuilder):
"""This TreeBuilder knows facts about HTML.
Such as which tags are empty-element tags.
"""
preserve_whitespace_tags = set(['pre', 'textarea'])
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
'spacer', 'link', 'frame', 'base'])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
# 'foo' and 'bar', not the single value 'foo bar'. When we
# encounter one of these attributes, we will parse its value into
# a list of values if possible. Upon output, the list will be
# converted back into a string.
cdata_list_attributes = {
"*" : ['class', 'accesskey', 'dropzone'],
"a" : ['rel', 'rev'],
"link" : ['rel', 'rev'],
"td" : ["headers"],
"th" : ["headers"],
"td" : ["headers"],
"form" : ["accept-charset"],
"object" : ["archive"],
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
"area" : ["rel"],
"icon" : ["sizes"],
"iframe" : ["sandbox"],
"output" : ["for"],
}
def set_up_substitutions(self, tag):
# We are only interested in <meta> tags
if tag.name != 'meta':
return False
http_equiv = tag.get('http-equiv')
content = tag.get('content')
charset = tag.get('charset')
# We are interested in <meta> tags that say what encoding the
# document was originally in. This means HTML 5-style <meta>
# tags that provide the "charset" attribute. It also means
# HTML 4-style <meta> tags that provide the "content"
# attribute and have "http-equiv" set to "content-type".
#
# In both cases we will replace the value of the appropriate
# attribute with a standin object that can take on any
# encoding.
meta_encoding = None
if charset is not None:
# HTML 5 style:
# <meta charset="utf8">
meta_encoding = charset
tag['charset'] = CharsetMetaAttributeValue(charset)
elif (content is not None and http_equiv is not None
and http_equiv.lower() == 'content-type'):
# HTML 4 style:
# <meta http-equiv="content-type" content="text/html; charset=utf8">
tag['content'] = ContentMetaAttributeValue(content)
return (meta_encoding is not None)
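# Example: a tag parsed from <meta charset="utf8"> gets its 'charset'
# value wrapped in a CharsetMetaAttributeValue, so that when the tree is
# later encoded to some other encoding, the attribute is rewritten to
# name that encoding instead of the original "utf8".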
def register_treebuilders_from(module):
"""Copy TreeBuilders from the given module into this module."""
# I'm fairly sure this is not the best way to do this.
this_module = sys.modules['bs4.builder']
for name in module.__all__:
obj = getattr(module, name)
if issubclass(obj, TreeBuilder):
setattr(this_module, name, obj)
this_module.__all__.append(name)
# Register the builder while we're at it.
this_module.builder_registry.register(obj)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
want to use HTMLParser as a last resort.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
from . import _html5lib
register_treebuilders_from(_html5lib)
except ImportError:
# They don't have html5lib installed.
pass
try:
from . import _lxml
register_treebuilders_from(_lxml)
except ImportError:
# They don't have lxml installed.
pass
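# Net effect of the registration order above, assuming lxml and html5lib
# are both importable: builder_registry.lookup('html') prefers lxml's
# builder, then html5lib's, and falls back to HTMLParserTreeBuilder when
# neither library is present.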

lib/bs4/builder/_html5lib.py Normal file

@@ -0,0 +1,222 @@
__all__ = [
'HTML5TreeBuilder',
]
import warnings
from bs4.builder import (
PERMISSIVE,
HTML,
HTML_5,
HTMLTreeBuilder,
)
from bs4.element import NamespacedAttribute
import html5lib
from html5lib.constants import namespaces
from bs4.element import (
Comment,
Doctype,
NavigableString,
Tag,
)
class HTML5TreeBuilder(HTMLTreeBuilder):
"""Use html5lib to build a tree."""
features = ['html5lib', PERMISSIVE, HTML_5, HTML]
def prepare_markup(self, markup, user_specified_encoding):
# Store the user-specified encoding for use later on.
self.user_specified_encoding = user_specified_encoding
return markup, None, None, False
# These methods are defined by Beautiful Soup.
def feed(self, markup):
if self.soup.parse_only is not None:
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
doc = parser.parse(markup, encoding=self.user_specified_encoding)
# Set the character encoding detected by the tokenizer.
if isinstance(markup, unicode):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
self.soup, namespaceHTMLElements)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><head></head><body>%s</body></html>' % fragment
class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
def __init__(self, soup, namespaceHTMLElements):
self.soup = soup
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
self.soup.reset()
return Element(self.soup, self.soup, None)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
self.soup.object_was_parsed(doctype)
def elementClass(self, name, namespace):
tag = self.soup.new_tag(name, namespace)
return Element(tag, self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
from bs4 import BeautifulSoup  # local import avoids a circular import at module load time
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
def appendChild(self, node):
# XXX This code is not covered by the BS4 tests.
self.soup.append(node.element)
def getDocument(self):
return self.soup
def getFragment(self):
return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
class AttrList(object):
def __init__(self, element):
self.element = element
self.attrs = dict(self.element.attrs)
def __iter__(self):
return list(self.attrs.items()).__iter__()
def __setitem__(self, name, value):
"set attr", name, value
self.element[name] = value
def items(self):
return list(self.attrs.items())
def keys(self):
return list(self.attrs.keys())
def __len__(self):
return len(self.attrs)
def __getitem__(self, name):
return self.attrs[name]
def __contains__(self, name):
return name in list(self.attrs.keys())
class Element(html5lib.treebuilders._base.Node):
def __init__(self, element, soup, namespace):
html5lib.treebuilders._base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
# Concatenate new text onto old text node
# XXX This has O(n^2) performance, for input like
# "a</a>a</a>a</a>..."
old_element = self.element.contents[-1]
new_element = self.soup.new_string(old_element + node.element)
old_element.replace_with(new_element)
else:
self.element.append(node.element)
node.parent = self
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes is not None and len(attributes) > 0:
converted_attributes = []
for name, value in list(attributes.items()):
if isinstance(name, tuple):
new_name = NamespacedAttribute(*name)
del attributes[name]
attributes[new_name] = value
self.soup.builder._replace_cdata_list_attribute_values(
self.name, attributes)
for name, value in attributes.items():
self.element[name] = value
# The attributes may contain variables that need substitution.
# Call set_up_substitutions manually.
#
# The Tag constructor called this method when the Tag was created,
# but we just set/changed the attributes, so call it again.
self.soup.builder.set_up_substitutions(self.element)
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
self.insertBefore(text, insertBefore)
else:
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
# (See comments in appendChild)
old_node = self.element.contents[index-1]
new_str = self.soup.new_string(old_node + node.element)
old_node.replace_with(new_str)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
node.element.extract()
def reparentChildren(self, newParent):
while self.element.contents:
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(
Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(
TextNode(child, self.soup))
def cloneNode(self):
tag = self.soup.new_tag(self.element.name, self.namespace)
node = Element(tag, self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
html5lib.treebuilders._base.Node.__init__(self, None)
self.element = element
self.soup = soup
def cloneNode(self):
raise NotImplementedError

lib/bs4/builder/_htmlparser.py Normal file

@@ -0,0 +1,244 @@
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
__all__ = [
'HTMLParserTreeBuilder',
]
from HTMLParser import (
HTMLParser,
HTMLParseError,
)
import sys
import warnings
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = (
major > 3
or (major == 3 and minor > 2)
or (major == 3 and minor == 2 and release >= 3))
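# Worked examples: sys.version_info of (3, 2, 3) or (3, 3, 0) yields
# True; (3, 2, 2) yields False; any Python 2.x yields False, so 'strict'
# is never passed to the constructor there.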
from bs4.element import (
CData,
Comment,
Declaration,
Doctype,
ProcessingInstruction,
)
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
HTML,
HTMLTreeBuilder,
STRICT,
)
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
def handle_starttag(self, name, attrs):
# XXX namespace
self.soup.handle_starttag(name, None, None, dict(attrs))
def handle_endtag(self, name):
self.soup.handle_endtag(name)
def handle_data(self, data):
self.soup.handle_data(data)
def handle_charref(self, name):
# XXX workaround for a bug in HTMLParser. Remove this once
# it's fixed.
if name.startswith('x'):
real_name = int(name.lstrip('x'), 16)
else:
real_name = int(name)
try:
data = unichr(real_name)
except (ValueError, OverflowError), e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
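# Example: the markup '&#xe9;' reaches this handler as name='xe9', so
# int('e9', 16) == 233 and unichr(233) == u'\xe9' (LATIN SMALL LETTER E
# WITH ACUTE); '&#233;' produces the same character via the decimal path.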
def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
self.handle_data(data)
def handle_comment(self, data):
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(Comment)
def handle_decl(self, data):
self.soup.endData()
if data.startswith("DOCTYPE "):
data = data[len("DOCTYPE "):]
self.soup.handle_data(data)
self.soup.endData(Doctype)
def unknown_decl(self, data):
if data.upper().startswith('CDATA['):
cls = CData
data = data[len('CDATA['):]
else:
cls = Declaration
self.soup.endData()
self.soup.handle_data(data)
self.soup.endData(cls)
def handle_pi(self, data):
self.soup.endData()
if data.endswith("?") and data.lower().startswith("xml"):
# "An XHTML processing instruction using the trailing '?'
# will cause the '?' to be included in data." - HTMLParser
# docs.
#
# Strip the question mark so we don't end up with two
# question marks.
data = data[:-1]
self.soup.handle_data(data)
self.soup.endData(ProcessingInstruction)
class HTMLParserTreeBuilder(HTMLTreeBuilder):
is_xml = False
features = [HTML, STRICT, HTMLPARSER]
def __init__(self, *args, **kwargs):
if CONSTRUCTOR_TAKES_STRICT:
kwargs['strict'] = False
self.parser_args = (args, kwargs)
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
args, kwargs = self.parser_args
parser = BeautifulSoupHTMLParser(*args, **kwargs)
parser.soup = self.soup
try:
parser.feed(markup)
except HTMLParseError, e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
import re
attrfind_tolerant = re.compile(
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
locatestarttagend = re.compile(r"""
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
(?:\s+ # whitespace before attribute name
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
(?:\s*=\s* # value indicator
(?:'[^']*' # LITA-enclosed value
|\"[^\"]*\" # LIT-enclosed value
|[^'\">\s]+ # bare value
)
)?
)
)*
\s* # trailing whitespace
""", re.VERBOSE)
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
from html.parser import tagfind, attrfind
def parse_starttag(self, i):
self.__starttag_text = None
endpos = self.check_for_whole_start_tag(i)
if endpos < 0:
return endpos
rawdata = self.rawdata
self.__starttag_text = rawdata[i:endpos]
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
match = tagfind.match(rawdata, i+1)
assert match, 'unexpected call to parse_starttag()'
k = match.end()
self.lasttag = tag = rawdata[i+1:k].lower()
while k < endpos:
if self.strict:
m = attrfind.match(rawdata, k)
else:
m = attrfind_tolerant.match(rawdata, k)
if not m:
break
attrname, rest, attrvalue = m.group(1, 2, 3)
if not rest:
attrvalue = None
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
attrvalue[:1] == '"' == attrvalue[-1:]:
attrvalue = attrvalue[1:-1]
if attrvalue:
attrvalue = self.unescape(attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = m.end()
end = rawdata[k:endpos].strip()
if end not in (">", "/>"):
lineno, offset = self.getpos()
if "\n" in self.__starttag_text:
lineno = lineno + self.__starttag_text.count("\n")
offset = len(self.__starttag_text) \
- self.__starttag_text.rfind("\n")
else:
offset = offset + len(self.__starttag_text)
if self.strict:
self.error("junk characters in start tag: %r"
% (rawdata[k:endpos][:20],))
self.handle_data(rawdata[i:endpos])
return endpos
if end.endswith('/>'):
# XHTML-style empty tag: <span attr="value" />
self.handle_startendtag(tag, attrs)
else:
self.handle_starttag(tag, attrs)
if tag in self.CDATA_CONTENT_ELEMENTS:
self.set_cdata_mode(tag)
return endpos
def set_cdata_mode(self, elem):
self.cdata_elem = elem.lower()
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
CONSTRUCTOR_TAKES_STRICT = True

lib/bs4/builder/_lxml.py Normal file

@@ -0,0 +1,179 @@
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
]
from StringIO import StringIO
import collections
from lxml import etree
from bs4.element import Comment, Doctype, NamespacedAttribute
from bs4.builder import (
FAST,
HTML,
HTMLTreeBuilder,
PERMISSIVE,
TreeBuilder,
XML)
from bs4.dammit import UnicodeDammit
LXML = 'lxml'
class LXMLTreeBuilderForXML(TreeBuilder):
DEFAULT_PARSER_CLASS = etree.XMLParser
is_xml = True
# Well, it's permissive by XML parser standards.
features = [LXML, XML, FAST, PERMISSIVE]
CHUNK_SIZE = 512
@property
def default_parser(self):
# This can either return a parser object or a class, which
# will be instantiated with default arguments.
return etree.XMLParser(target=self, strip_cdata=False, recover=True)
def __init__(self, parser=None, empty_element_tags=None):
if empty_element_tags is not None:
self.empty_element_tags = set(empty_element_tags)
if parser is None:
# Use the default parser.
parser = self.default_parser
if isinstance(parser, collections.Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False)
self.parser = parser
self.soup = None
self.nsmaps = None
def _getNsTag(self, tag):
# Split the namespace URL out of a fully-qualified lxml tag
# name. Copied from lxml's src/lxml/sax.py.
if tag[0] == '{':
return tuple(tag[1:].split('}', 1))
else:
return (None, tag)
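# Example: _getNsTag('{http://www.w3.org/1999/xhtml}body') returns
# ('http://www.w3.org/1999/xhtml', 'body'), while a prefixless 'body'
# returns (None, 'body').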
def prepare_markup(self, markup, user_specified_encoding=None,
document_declared_encoding=None):
"""
:return: A 4-tuple (markup, original encoding, encoding
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
if isinstance(markup, unicode):
return markup, None, None, False
try_encodings = [user_specified_encoding, document_declared_encoding]
dammit = UnicodeDammit(markup, try_encodings, is_html=True)
return (dammit.markup, dammit.original_encoding,
dammit.declared_html_encoding,
dammit.contains_replacement_characters)
def feed(self, markup):
if isinstance(markup, basestring):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
# or the parser won't be initialized.
data = markup.read(self.CHUNK_SIZE)
self.parser.feed(data)
while data != '':
# Now call feed() on the rest of the data, chunk by chunk.
data = markup.read(self.CHUNK_SIZE)
if data != '':
self.parser.feed(data)
self.parser.close()
def close(self):
self.nsmaps = None
def start(self, name, attrs, nsmap={}):
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(nsmap) == 0 and self.nsmaps != None:
# There are no new namespaces for this tag, but namespaces
# are in play, so we need a separate tag stack to know
# when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
if self.nsmaps is None:
self.nsmaps = []
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
self.nsmaps.append(inverted_nsmap)
# Also treat the namespace mapping as a set of attributes on the
# tag, so we can recreate it later.
attrs = attrs.copy()
for prefix, namespace in nsmap.items():
attribute = NamespacedAttribute(
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
attrs[attribute] = namespace
namespace, name = self._getNsTag(name)
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
def end(self, name):
self.soup.endData()
completed_tag = self.soup.tagStack[-1]
namespace, name = self._getNsTag(name)
nsprefix = None
if namespace is not None:
for inverted_nsmap in reversed(self.nsmaps):
if inverted_nsmap is not None and namespace in inverted_nsmap:
nsprefix = inverted_nsmap[namespace]
break
self.soup.handle_endtag(name, nsprefix)
if self.nsmaps != None:
# This tag, or one of its parents, introduced a namespace
# mapping, so pop it off the stack.
self.nsmaps.pop()
if len(self.nsmaps) == 0:
# Namespaces are no longer in play, so don't bother keeping
# track of the namespace stack.
self.nsmaps = None
def pi(self, target, data):
pass
def data(self, content):
self.soup.handle_data(content)
def doctype(self, name, pubid, system):
self.soup.endData()
doctype = Doctype.for_name_and_ids(name, pubid, system)
self.soup.object_was_parsed(doctype)
def comment(self, content):
"Handle comments as Comment objects."
self.soup.endData()
self.soup.handle_data(content)
self.soup.endData(Comment)
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
features = [LXML, HTML, FAST, PERMISSIVE]
is_xml = False
@property
def default_parser(self):
return etree.HTMLParser
def feed(self, markup):
self.parser.feed(markup)
self.parser.close()
def test_fragment_to_document(self, fragment):
"""See `TreeBuilder`."""
return u'<html><body>%s</body></html>' % fragment

lib/bs4/dammit.py Normal file

@@ -0,0 +1,792 @@
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit
This class forces XML data into a standard format (usually UTF-8 or
Unicode). It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It does not rewrite the XML or HTML to reflect a new
encoding; that's the tree builder's job.
"""
import codecs
from htmlentitydefs import codepoint2name
import re
import warnings
# Autodetects character encodings. Very useful.
# Download from http://chardet.feedparser.org/
# or 'apt-get install python-chardet'
# or 'easy_install chardet'
try:
import chardet
#import chardet.constants
#chardet.constants._debug = 1
except ImportError:
chardet = None
# Available from http://cjkpython.i18n.org/.
try:
import iconv_codec
except ImportError:
pass
xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
class EntitySubstitution(object):
"""Substitute XML or HTML entities for the corresponding characters."""
def _populate_class_variables():
lookup = {}
reverse_lookup = {}
characters_for_re = []
for codepoint, name in list(codepoint2name.items()):
character = unichr(codepoint)
if codepoint != 34:
# There's no point in turning the quotation mark into
# &quot;, unless it happens within an attribute value, which
# is handled elsewhere.
characters_for_re.append(character)
lookup[character] = name
# But we do want to turn &quot; into the quotation mark.
reverse_lookup[name] = character
re_definition = "[%s]" % "".join(characters_for_re)
return lookup, reverse_lookup, re.compile(re_definition)
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
CHARACTER_TO_XML_ENTITY = {
"'": "apos",
'"': "quot",
"&": "amp",
"<": "lt",
">": "gt",
}
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
")")
@classmethod
def _substitute_html_entity(cls, matchobj):
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
return "&%s;" % entity
@classmethod
def _substitute_xml_entity(cls, matchobj):
"""Used with a regular expression to substitute the
appropriate XML entity for an XML special character."""
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
return "&%s;" % entity
@classmethod
def quoted_attribute_value(self, value):
"""Make a value into a quoted XML attribute, possibly escaping it.
Most strings will be quoted using double quotes.
Bob's Bar -> "Bob's Bar"
If a string contains double quotes, it will be quoted using
single quotes.
Welcome to "my bar" -> 'Welcome to "my bar"'
If a string contains both single and double quotes, the
double quotes will be escaped, and the string will be quoted
using double quotes.
Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;
"""
quote_with = '"'
if '"' in value:
if "'" in value:
# The string contains both single and double
# quotes. Turn the double quotes into
# entities. We quote the double quotes rather than
# the single quotes because the entity name is
# "&quot;" whether this is HTML or XML. If we
# quoted the single quotes, we'd have to decide
# between &apos; and &squot;.
replace_with = "&quot;"
value = value.replace('"', replace_with)
else:
# There are double quotes but no single quotes.
# We can use single quotes to quote the attribute.
quote_with = "'"
return quote_with + value + quote_with
@classmethod
def substitute_xml(cls, value, make_quoted_attribute=False):
"""Substitute XML entities for special XML characters.
:param value: A string to be substituted. The less-than sign will
become &lt;, the greater-than sign will become &gt;, and any
ampersands that are not part of an entity definition will
become &amp;.
:param make_quoted_attribute: If True, then the string will be
quoted, as befits an attribute value.
"""
# Escape angle brackets, and ampersands that aren't part of
# entities.
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
cls._substitute_xml_entity, value)
if make_quoted_attribute:
value = cls.quoted_attribute_value(value)
return value
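# Example: substitute_xml('AT&T & <x>', make_quoted_attribute=True)
# returns '"AT&amp;T &amp; &lt;x&gt;"'; an existing entity such as
# '&amp;' is left untouched by the negative lookahead in
# BARE_AMPERSAND_OR_BRACKET.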
@classmethod
def substitute_html(cls, s):
"""Replace certain Unicode characters with named HTML entities.
This differs from data.encode(encoding, 'xmlcharrefreplace')
in that the goal is to make the result more readable (to those
with ASCII displays) rather than to recover from
errors. There's absolutely nothing wrong with a UTF-8 string
containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
character with "&eacute;" will make it more readable to some
people.
"""
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
cls._substitute_html_entity, s)
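# Example: substitute_html(u'caf\xe9') returns u'caf&eacute;', since
# U+00E9 appears in codepoint2name; plain ASCII text passes through
# unchanged.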
class UnicodeDammit:
"""A class for detecting the encoding of a *ML document and
converting it to a Unicode string. If the source encoding is
windows-1252, can replace MS smart quotes with their HTML or XML
equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {"macintosh": "mac-roman",
"x-sjis": "shift-jis"}
ENCODINGS_WITH_SMART_QUOTES = [
"windows-1252",
"iso-8859-1",
"iso-8859-2",
]
def __init__(self, markup, override_encodings=[],
smart_quotes_to=None, is_html=False):
self.declared_html_encoding = None
self.smart_quotes_to = smart_quotes_to
self.tried_encodings = []
self.contains_replacement_characters = False
if markup == '' or isinstance(markup, unicode):
self.markup = markup
self.unicode_markup = unicode(markup)
self.original_encoding = None
return
new_markup, document_encoding, sniffed_encoding = \
self._detectEncoding(markup, is_html)
self.markup = new_markup
u = None
if new_markup != markup:
# _detectEncoding modified the markup, then converted it to
# Unicode and then to UTF-8. So convert it from UTF-8.
u = self._convert_from("utf8")
self.original_encoding = sniffed_encoding
if not u:
for proposed_encoding in (
override_encodings + [document_encoding, sniffed_encoding]):
if proposed_encoding is not None:
u = self._convert_from(proposed_encoding)
if u:
break
# If no luck and we have auto-detection library, try that:
if not u and chardet and not isinstance(self.markup, unicode):
u = self._convert_from(chardet.detect(self.markup)['encoding'])
# As a last resort, try utf-8 and windows-1252:
if not u:
for proposed_encoding in ("utf-8", "windows-1252"):
u = self._convert_from(proposed_encoding)
if u:
break
# As an absolute last resort, try the encodings again with
# character replacement.
if not u:
for proposed_encoding in (
override_encodings + [
document_encoding, sniffed_encoding, "utf-8", "windows-1252"]):
if proposed_encoding != "ascii":
u = self._convert_from(proposed_encoding, "replace")
if u is not None:
warnings.warn(
UnicodeWarning(
"Some characters could not be decoded, and were "
"replaced with REPLACEMENT CHARACTER."))
self.contains_replacement_characters = True
break
# We could at this point force it to ASCII, but that would
# destroy so much data that I think giving up is better
self.unicode_markup = u
if not u:
self.original_encoding = None
def _sub_ms_char(self, match):
"""Changes a MS smart quote character to an XML or HTML
entity, or an ASCII character."""
orig = match.group(1)
if self.smart_quotes_to == 'ascii':
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
else:
sub = self.MS_CHARS.get(orig)
if type(sub) == tuple:
if self.smart_quotes_to == 'xml':
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
else:
sub = '&'.encode() + sub[0].encode() + ';'.encode()
else:
sub = sub.encode()
return sub
def _convert_from(self, proposed, errors="strict"):
proposed = self.find_codec(proposed)
if not proposed or (proposed, errors) in self.tried_encodings:
return None
self.tried_encodings.append((proposed, errors))
markup = self.markup
# Convert smart quotes to HTML if coming from an encoding
# that might have them.
if (self.smart_quotes_to is not None
and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
smart_quotes_re = b"([\x80-\x9f])"
smart_quotes_compiled = re.compile(smart_quotes_re)
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
try:
#print "Trying to convert document to %s (errors=%s)" % (
# proposed, errors)
u = self._to_unicode(markup, proposed, errors)
self.markup = u
self.original_encoding = proposed
except Exception as e:
#print "That didn't work!"
#print e
return None
#print "Correct encoding: %s" % proposed
return self.markup
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
# strip Byte Order Mark (if present)
if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16be'
data = data[2:]
elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
and (data[2:4] != '\x00\x00'):
encoding = 'utf-16le'
data = data[2:]
elif data[:3] == '\xef\xbb\xbf':
encoding = 'utf-8'
data = data[3:]
elif data[:4] == '\x00\x00\xfe\xff':
encoding = 'utf-32be'
data = data[4:]
elif data[:4] == '\xff\xfe\x00\x00':
encoding = 'utf-32le'
data = data[4:]
newdata = unicode(data, encoding, errors)
return newdata
def _detectEncoding(self, xml_data, is_html=False):
"""Given a document, tries to detect its XML encoding."""
xml_encoding = sniffed_xml_encoding = None
try:
if xml_data[:4] == b'\x4c\x6f\xa7\x94':
# EBCDIC
xml_data = self._ebcdic_to_ascii(xml_data)
elif xml_data[:4] == b'\x00\x3c\x00\x3f':
# UTF-16BE
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
and (xml_data[2:4] != b'\x00\x00'):
# UTF-16BE with BOM
sniffed_xml_encoding = 'utf-16be'
xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x3f\x00':
# UTF-16LE
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
(xml_data[2:4] != b'\x00\x00'):
# UTF-16LE with BOM
sniffed_xml_encoding = 'utf-16le'
xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\x00\x3c':
# UTF-32BE
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\x3c\x00\x00\x00':
# UTF-32LE
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
elif xml_data[:4] == b'\x00\x00\xfe\xff':
# UTF-32BE with BOM
sniffed_xml_encoding = 'utf-32be'
xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
elif xml_data[:4] == b'\xff\xfe\x00\x00':
# UTF-32LE with BOM
sniffed_xml_encoding = 'utf-32le'
xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
elif xml_data[:3] == b'\xef\xbb\xbf':
# UTF-8 with BOM
sniffed_xml_encoding = 'utf-8'
xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
else:
sniffed_xml_encoding = 'ascii'
pass
except:
xml_encoding_match = None
xml_encoding_match = xml_encoding_re.match(xml_data)
if not xml_encoding_match and is_html:
xml_encoding_match = html_meta_re.search(xml_data)
if xml_encoding_match is not None:
xml_encoding = xml_encoding_match.groups()[0].decode(
'ascii').lower()
if is_html:
self.declared_html_encoding = xml_encoding
if sniffed_xml_encoding and \
(xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
'iso-10646-ucs-4', 'ucs-4', 'csucs4',
'utf-16', 'utf-32', 'utf_16', 'utf_32',
'utf16', 'u16')):
xml_encoding = sniffed_xml_encoding
return xml_data, xml_encoding, sniffed_xml_encoding
def find_codec(self, charset):
return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
or (charset and self._codec(charset.replace("-", ""))) \
or (charset and self._codec(charset.replace("-", "_"))) \
or charset
def _codec(self, charset):
if not charset:
return charset
codec = None
try:
codecs.lookup(charset)
codec = charset
except (LookupError, ValueError):
pass
return codec
EBCDIC_TO_ASCII_MAP = None
def _ebcdic_to_ascii(self, s):
c = self.__class__
if not c.EBCDIC_TO_ASCII_MAP:
emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
201,202,106,107,108,109,110,111,112,113,114,203,204,205,
206,207,208,209,126,115,116,117,118,119,120,121,122,210,
211,212,213,214,215,216,217,218,219,220,221,222,223,224,
225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
250,251,252,253,254,255)
import string
c.EBCDIC_TO_ASCII_MAP = string.maketrans(
''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
return s.translate(c.EBCDIC_TO_ASCII_MAP)
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
MS_CHARS = {b'\x80': ('euro', '20AC'),
b'\x81': ' ',
b'\x82': ('sbquo', '201A'),
b'\x83': ('fnof', '192'),
b'\x84': ('bdquo', '201E'),
b'\x85': ('hellip', '2026'),
b'\x86': ('dagger', '2020'),
b'\x87': ('Dagger', '2021'),
b'\x88': ('circ', '2C6'),
b'\x89': ('permil', '2030'),
b'\x8A': ('Scaron', '160'),
b'\x8B': ('lsaquo', '2039'),
b'\x8C': ('OElig', '152'),
b'\x8D': '?',
b'\x8E': ('#x17D', '17D'),
b'\x8F': '?',
b'\x90': '?',
b'\x91': ('lsquo', '2018'),
b'\x92': ('rsquo', '2019'),
b'\x93': ('ldquo', '201C'),
b'\x94': ('rdquo', '201D'),
b'\x95': ('bull', '2022'),
b'\x96': ('ndash', '2013'),
b'\x97': ('mdash', '2014'),
b'\x98': ('tilde', '2DC'),
b'\x99': ('trade', '2122'),
b'\x9a': ('scaron', '161'),
b'\x9b': ('rsaquo', '203A'),
b'\x9c': ('oelig', '153'),
b'\x9d': '?',
b'\x9e': ('#x17E', '17E'),
b'\x9f': ('Yuml', '178'),}  # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
MS_CHARS_TO_ASCII = {
b'\x80' : 'EUR',
b'\x81' : ' ',
b'\x82' : ',',
b'\x83' : 'f',
b'\x84' : ',,',
b'\x85' : '...',
b'\x86' : '+',
b'\x87' : '++',
b'\x88' : '^',
b'\x89' : '%',
b'\x8a' : 'S',
b'\x8b' : '<',
b'\x8c' : 'OE',
b'\x8d' : '?',
b'\x8e' : 'Z',
b'\x8f' : '?',
b'\x90' : '?',
b'\x91' : "'",
b'\x92' : "'",
b'\x93' : '"',
b'\x94' : '"',
b'\x95' : '*',
b'\x96' : '-',
b'\x97' : '--',
b'\x98' : '~',
b'\x99' : '(TM)',
b'\x9a' : 's',
b'\x9b' : '>',
b'\x9c' : 'oe',
b'\x9d' : '?',
b'\x9e' : 'z',
b'\x9f' : 'Y',
b'\xa0' : ' ',
b'\xa1' : '!',
b'\xa2' : 'c',
b'\xa3' : 'GBP',
b'\xa4' : '$', #This approximation is especially parochial--this is the
#generic currency symbol.
b'\xa5' : 'YEN',
b'\xa6' : '|',
b'\xa7' : 'S',
b'\xa8' : '..',
b'\xa9' : '(c)',
b'\xaa' : '(th)',
b'\xab' : '<<',
b'\xac' : '!',
b'\xad' : ' ',
b'\xae' : '(R)',
b'\xaf' : '-',
b'\xb0' : 'o',
b'\xb1' : '+-',
b'\xb2' : '2',
b'\xb3' : '3',
b'\xb4' : ("'", 'acute'),
b'\xb5' : 'u',
b'\xb6' : 'P',
b'\xb7' : '*',
b'\xb8' : ',',
b'\xb9' : '1',
b'\xba' : '(th)',
b'\xbb' : '>>',
b'\xbc' : '1/4',
b'\xbd' : '1/2',
b'\xbe' : '3/4',
b'\xbf' : '?',
b'\xc0' : 'A',
b'\xc1' : 'A',
b'\xc2' : 'A',
b'\xc3' : 'A',
b'\xc4' : 'A',
b'\xc5' : 'A',
b'\xc6' : 'AE',
b'\xc7' : 'C',
b'\xc8' : 'E',
b'\xc9' : 'E',
b'\xca' : 'E',
b'\xcb' : 'E',
b'\xcc' : 'I',
b'\xcd' : 'I',
b'\xce' : 'I',
b'\xcf' : 'I',
b'\xd0' : 'D',
b'\xd1' : 'N',
b'\xd2' : 'O',
b'\xd3' : 'O',
b'\xd4' : 'O',
b'\xd5' : 'O',
b'\xd6' : 'O',
b'\xd7' : '*',
b'\xd8' : 'O',
b'\xd9' : 'U',
b'\xda' : 'U',
b'\xdb' : 'U',
b'\xdc' : 'U',
b'\xdd' : 'Y',
b'\xde' : 'b',
b'\xdf' : 'B',
b'\xe0' : 'a',
b'\xe1' : 'a',
b'\xe2' : 'a',
b'\xe3' : 'a',
b'\xe4' : 'a',
b'\xe5' : 'a',
b'\xe6' : 'ae',
b'\xe7' : 'c',
b'\xe8' : 'e',
b'\xe9' : 'e',
b'\xea' : 'e',
b'\xeb' : 'e',
b'\xec' : 'i',
b'\xed' : 'i',
b'\xee' : 'i',
b'\xef' : 'i',
b'\xf0' : 'o',
b'\xf1' : 'n',
b'\xf2' : 'o',
b'\xf3' : 'o',
b'\xf4' : 'o',
b'\xf5' : 'o',
b'\xf6' : 'o',
b'\xf7' : '/',
b'\xf8' : 'o',
b'\xf9' : 'u',
b'\xfa' : 'u',
b'\xfb' : 'u',
b'\xfc' : 'u',
b'\xfd' : 'y',
b'\xfe' : 'b',
b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252.
WINDOWS_1252_TO_UTF8 = {
0x80 : b'\xe2\x82\xac', # €
0x82 : b'\xe2\x80\x9a', # ‚
0x83 : b'\xc6\x92', # ƒ
0x84 : b'\xe2\x80\x9e', # „
0x85 : b'\xe2\x80\xa6', # …
0x86 : b'\xe2\x80\xa0', # †
0x87 : b'\xe2\x80\xa1', # ‡
0x88 : b'\xcb\x86', # ˆ
0x89 : b'\xe2\x80\xb0', # ‰
0x8a : b'\xc5\xa0', # Š
0x8b : b'\xe2\x80\xb9', # ‹
0x8c : b'\xc5\x92', # Œ
0x8e : b'\xc5\xbd', # Ž
0x91 : b'\xe2\x80\x98', # ‘
0x92 : b'\xe2\x80\x99', # ’
0x93 : b'\xe2\x80\x9c', # “
0x94 : b'\xe2\x80\x9d', # ”
0x95 : b'\xe2\x80\xa2', # •
0x96 : b'\xe2\x80\x93', # –
0x97 : b'\xe2\x80\x94', # —
0x98 : b'\xcb\x9c', # ˜
0x99 : b'\xe2\x84\xa2', # ™
0x9a : b'\xc5\xa1', # š
0x9b : b'\xe2\x80\xba', # ›
0x9c : b'\xc5\x93', # œ
0x9e : b'\xc5\xbe', # ž
0x9f : b'\xc5\xb8', # Ÿ
0xa0 : b'\xc2\xa0', #  
0xa1 : b'\xc2\xa1', # ¡
0xa2 : b'\xc2\xa2', # ¢
0xa3 : b'\xc2\xa3', # £
0xa4 : b'\xc2\xa4', # ¤
0xa5 : b'\xc2\xa5', # ¥
0xa6 : b'\xc2\xa6', # ¦
0xa7 : b'\xc2\xa7', # §
0xa8 : b'\xc2\xa8', # ¨
0xa9 : b'\xc2\xa9', # ©
0xaa : b'\xc2\xaa', # ª
0xab : b'\xc2\xab', # «
0xac : b'\xc2\xac', # ¬
0xad : b'\xc2\xad', # ­
0xae : b'\xc2\xae', # ®
0xaf : b'\xc2\xaf', # ¯
0xb0 : b'\xc2\xb0', # °
0xb1 : b'\xc2\xb1', # ±
0xb2 : b'\xc2\xb2', # ²
0xb3 : b'\xc2\xb3', # ³
0xb4 : b'\xc2\xb4', # ´
0xb5 : b'\xc2\xb5', # µ
0xb6 : b'\xc2\xb6', # ¶
0xb7 : b'\xc2\xb7', # ·
0xb8 : b'\xc2\xb8', # ¸
0xb9 : b'\xc2\xb9', # ¹
0xba : b'\xc2\xba', # º
0xbb : b'\xc2\xbb', # »
0xbc : b'\xc2\xbc', # ¼
0xbd : b'\xc2\xbd', # ½
0xbe : b'\xc2\xbe', # ¾
0xbf : b'\xc2\xbf', # ¿
0xc0 : b'\xc3\x80', # À
0xc1 : b'\xc3\x81', # Á
0xc2 : b'\xc3\x82', # Â
0xc3 : b'\xc3\x83', # Ã
0xc4 : b'\xc3\x84', # Ä
0xc5 : b'\xc3\x85', # Å
0xc6 : b'\xc3\x86', # Æ
0xc7 : b'\xc3\x87', # Ç
0xc8 : b'\xc3\x88', # È
0xc9 : b'\xc3\x89', # É
0xca : b'\xc3\x8a', # Ê
0xcb : b'\xc3\x8b', # Ë
0xcc : b'\xc3\x8c', # Ì
0xcd : b'\xc3\x8d', # Í
0xce : b'\xc3\x8e', # Î
0xcf : b'\xc3\x8f', # Ï
0xd0 : b'\xc3\x90', # Ð
0xd1 : b'\xc3\x91', # Ñ
0xd2 : b'\xc3\x92', # Ò
0xd3 : b'\xc3\x93', # Ó
0xd4 : b'\xc3\x94', # Ô
0xd5 : b'\xc3\x95', # Õ
0xd6 : b'\xc3\x96', # Ö
0xd7 : b'\xc3\x97', # ×
0xd8 : b'\xc3\x98', # Ø
0xd9 : b'\xc3\x99', # Ù
0xda : b'\xc3\x9a', # Ú
0xdb : b'\xc3\x9b', # Û
0xdc : b'\xc3\x9c', # Ü
0xdd : b'\xc3\x9d', # Ý
0xde : b'\xc3\x9e', # Þ
0xdf : b'\xc3\x9f', # ß
0xe0 : b'\xc3\xa0', # à
0xe1 : b'\xc3\xa1', # á
0xe2 : b'\xc3\xa2', # â
0xe3 : b'\xc3\xa3', # ã
0xe4 : b'\xc3\xa4', # ä
0xe5 : b'\xc3\xa5', # å
0xe6 : b'\xc3\xa6', # æ
0xe7 : b'\xc3\xa7', # ç
0xe8 : b'\xc3\xa8', # è
0xe9 : b'\xc3\xa9', # é
0xea : b'\xc3\xaa', # ê
0xeb : b'\xc3\xab', # ë
0xec : b'\xc3\xac', # ì
0xed : b'\xc3\xad', # í
0xee : b'\xc3\xae', # î
0xef : b'\xc3\xaf', # ï
0xf0 : b'\xc3\xb0', # ð
0xf1 : b'\xc3\xb1', # ñ
0xf2 : b'\xc3\xb2', # ò
0xf3 : b'\xc3\xb3', # ó
0xf4 : b'\xc3\xb4', # ô
0xf5 : b'\xc3\xb5', # õ
0xf6 : b'\xc3\xb6', # ö
0xf7 : b'\xc3\xb7', # ÷
0xf8 : b'\xc3\xb8', # ø
0xf9 : b'\xc3\xb9', # ù
0xfa : b'\xc3\xba', # ú
0xfb : b'\xc3\xbb', # û
0xfc : b'\xc3\xbc', # ü
0xfd : b'\xc3\xbd', # ý
0xfe : b'\xc3\xbe', # þ
}
MULTIBYTE_MARKERS_AND_SIZES = [
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
]
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
embedded_encoding="windows-1252"):
"""Fix characters from one encoding embedded in some other encoding.
Currently the only situation supported is Windows-1252 (or its
subset ISO-8859-1), embedded in UTF-8.
The input must be a bytestring. If you've already converted
the document to Unicode, you're too late.
The output is a bytestring in which `embedded_encoding`
characters have been converted to their `main_encoding`
equivalents.
"""
if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
raise NotImplementedError(
"Windows-1252 and ISO-8859-1 are the only currently supported "
"embedded encodings.")
if main_encoding.lower() not in ('utf8', 'utf-8'):
raise NotImplementedError(
"UTF-8 is the only currently supported main encoding.")
byte_chunks = []
chunk_start = 0
pos = 0
while pos < len(in_bytes):
byte = in_bytes[pos]
if not isinstance(byte, int):
# Python 2.x
byte = ord(byte)
if (byte >= cls.FIRST_MULTIBYTE_MARKER
and byte <= cls.LAST_MULTIBYTE_MARKER):
# This is the start of a UTF-8 multibyte character. Skip
# to the end.
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
if byte >= start and byte <= end:
pos += size
break
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
# We found a Windows-1252 character!
# Save the string up to this point as a chunk.
byte_chunks.append(in_bytes[chunk_start:pos])
# Now translate the Windows-1252 character into UTF-8
# and add it as another, one-byte chunk.
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
pos += 1
chunk_start = pos
else:
# Go on to the next character.
pos += 1
if chunk_start == 0:
# The string is unchanged.
return in_bytes
else:
# Store the final chunk.
byte_chunks.append(in_bytes[chunk_start:])
return b''.join(byte_chunks)
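# --- Illustrative usage (editor's sketch, not part of the library) ---
# detwingle() operates on bytestrings, so it must run *before* decoding.
# Mirroring the test suite: glue UTF-8 and Windows-1252 bytes together,
# then repair the result so it decodes cleanly as UTF-8.
#
#   from bs4.dammit import UnicodeDammit
#
#   utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
#   win1252 = u"\N{LEFT DOUBLE QUOTATION MARK}Hi!".encode("windows-1252")
#   doc = utf8 + win1252 + utf8       # mixed document; doc.decode("utf8") fails
#   fixed = UnicodeDammit.detwingle(doc)
#   fixed.decode("utf8")              # now succeeds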

1349
lib/bs4/element.py Normal file

File diff suppressed because it is too large Load Diff

532
lib/bs4/testing.py Normal file
View File

@ -0,0 +1,532 @@
"""Helper classes for tests."""
import copy
import functools
import unittest
from unittest import TestCase
from bs4 import BeautifulSoup
from bs4.element import (
CharsetMetaAttributeValue,
Comment,
ContentMetaAttributeValue,
Doctype,
SoupStrainer,
)
from bs4.builder import HTMLParserTreeBuilder
default_builder = HTMLParserTreeBuilder
class SoupTest(unittest.TestCase):
@property
def default_builder(self):
return default_builder()
def soup(self, markup, **kwargs):
"""Build a Beautiful Soup object from markup."""
builder = kwargs.pop('builder', self.default_builder)
return BeautifulSoup(markup, builder=builder, **kwargs)
def document_for(self, markup):
"""Turn an HTML fragment into a document.
The details depend on the builder.
"""
return self.default_builder.test_fragment_to_document(markup)
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
builder = self.default_builder
obj = BeautifulSoup(to_parse, builder=builder)
if compare_parsed_to is None:
compare_parsed_to = to_parse
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
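# Editor's sketch: assertSoupEquals() round-trips markup through the default
# builder and compares against document_for(compare_parsed_to), so a subclass
# can assert parser normalization in one line, e.g.:
#
#   self.assertSoupEquals("<p>", "<p></p>")   # unclosed tag gets closed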
class HTMLTreeBuilderSmokeTest(object):
"""A basic test of a treebuilder's competence.
Any HTML treebuilder, present or future, should be able to pass
these tests. With invalid markup, there's room for interpretation,
and different parsers can handle it differently. But with the
markup in these tests, there's not much room for interpretation.
"""
def assertDoctypeHandled(self, doctype_fragment):
"""Assert that a given doctype string is handled correctly."""
doctype_str, soup = self._document_with_doctype(doctype_fragment)
# Make sure a Doctype object was created.
doctype = soup.contents[0]
self.assertEqual(doctype.__class__, Doctype)
self.assertEqual(doctype, doctype_fragment)
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
# Make sure that the doctype was correctly associated with the
# parse tree and that the rest of the document parsed.
self.assertEqual(soup.p.contents[0], 'foo')
def _document_with_doctype(self, doctype_fragment):
"""Generate and parse a document with the given doctype."""
doctype = '<!DOCTYPE %s>' % doctype_fragment
markup = doctype + '\n<p>foo</p>'
soup = self.soup(markup)
return doctype, soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b""),
markup.replace(b"\n", b""))
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
"""A <p> tag is never designated as an empty-element tag.
Even if the markup shows it as an empty-element tag, it
shouldn't be presented that way.
"""
soup = self.soup("<p/>")
self.assertFalse(soup.p.is_empty_element)
self.assertEqual(str(soup.p), "<p></p>")
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assertSoupEquals("<p>", "<p></p>")
self.assertSoupEquals("<b>", "<b></b>")
self.assertSoupEquals("<br>", "<br/>")
def test_br_is_always_empty_element_tag(self):
"""A <br> tag is designated as an empty-element tag.
Some parsers treat <br></br> as one <br/> tag, some parsers as
two tags, but it should always be an empty-element tag.
"""
soup = self.soup("<br></br>")
self.assertTrue(soup.br.is_empty_element)
self.assertEqual(str(soup.br), "<br/>")
def test_nested_formatting_elements(self):
self.assertSoupEquals("<em><em></em></em>")
def test_comment(self):
# Comments are represented as Comment objects.
markup = "<p>foo<!--foobar-->baz</p>"
self.assertSoupEquals(markup)
soup = self.soup(markup)
comment = soup.find(text="foobar")
self.assertEqual(comment.__class__, Comment)
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in <pre> and <textarea> tags."""
self.assertSoupEquals("<pre> </pre>")
self.assertSoupEquals("<textarea> woo </textarea>")
def test_nested_inline_elements(self):
"""Inline elements can be nested indefinitely."""
b_tag = "<b>Inside a B tag</b>"
self.assertSoupEquals(b_tag)
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
self.assertSoupEquals(nested_b_tag)
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
self.assertSoupEquals(double_nested_b_tag)
def test_nested_block_level_elements(self):
"""Block elements can be nested."""
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
blockquote = soup.blockquote
self.assertEqual(blockquote.p.b.string, 'Foo')
self.assertEqual(blockquote.b.string, 'Foo')
def test_correctly_nested_tables(self):
"""One table can go inside another one."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tr><td>Here\'s another table:'
'<table id="2"><tr><td>foo</td></tr></table>'
'</td></tr></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
def test_deeply_nested_multivalued_attribute(self):
# html5lib can set the attributes of the same tag many times
# as it rearranges the tree. This has caused problems with
# multivalued attributes.
markup = '<table><div><div class="css"></div></div></table>'
soup = self.soup(markup)
self.assertEqual(["css"], soup.div.div['class'])
def test_angle_brackets_in_attribute_values_are_escaped(self):
self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
def test_entities_in_attributes_converted_to_unicode(self):
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
def test_entities_in_text_converted_to_unicode(self):
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
def test_quot_entity_converted_to_quotation_mark(self):
self.assertSoupEquals("<p>I said &quot;good day!&quot;</p>",
'<p>I said "good day!"</p>')
def test_out_of_range_entity(self):
expect = u"\N{REPLACEMENT CHARACTER}"
self.assertSoupEquals("&#10000000000000;", expect)
self.assertSoupEquals("&#x10000000000000;", expect)
self.assertSoupEquals("&#1000000000;", expect)
def test_basic_namespaces(self):
"""Parsers don't need to *understand* namespaces, but at the
very least they should not choke on namespaces or lose
data."""
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
soup = self.soup(markup)
self.assertEqual(markup, soup.encode())
html = soup.html
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
self.assertEqual(
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
self.assertEqual(
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
def test_multivalued_attribute_value_becomes_list(self):
markup = b'<a class="foo bar">'
soup = self.soup(markup)
self.assertEqual(['foo', 'bar'], soup.a['class'])
#
# Generally speaking, tests below this point are more tests of
# Beautiful Soup than tests of the tree builders. But parsers are
# weird, so we run these tests separately for every tree builder
# to detect any differences between them.
#
def test_soupstrainer(self):
"""Parsers should be able to work with SoupStrainers."""
strainer = SoupStrainer("b")
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
parse_only=strainer)
self.assertEqual(soup.decode(), "<b>bold</b>")
def test_single_quote_attribute_values_become_double_quotes(self):
self.assertSoupEquals("<foo attr='bar'></foo>",
'<foo attr="bar"></foo>')
def test_attribute_values_with_nested_quotes_are_left_alone(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
self.assertSoupEquals(text)
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
text = """<foo attr='bar "brawls" happen'>a</foo>"""
soup = self.soup(text)
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
self.assertSoupEquals(
soup.foo.decode(),
"""<foo attr="Brawls happen at &quot;Bob\'s Bar&quot;">a</foo>""")
def test_ampersand_in_attribute_value_gets_escaped(self):
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
'<this is="really messed up &amp; stuff"></this>')
self.assertSoupEquals(
'<a href="http://example.org?a=1&b=2;3">foo</a>',
'<a href="http://example.org?a=1&amp;b=2;3">foo</a>')
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
self.assertSoupEquals('<a href="http://example.org?a=1&amp;b=2;3"></a>')
def test_entities_in_strings_converted_during_parsing(self):
# Both XML and HTML entities are converted to Unicode characters
# during parsing.
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>"
self.assertSoupEquals(text, expected)
def test_smart_quotes_converted_on_the_way_in(self):
# Microsoft smart quotes are converted to Unicode characters during
# parsing.
quote = b"<p>\x91Foo\x92</p>"
soup = self.soup(quote)
self.assertEqual(
soup.p.string,
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
def test_non_breaking_spaces_converted_on_the_way_in(self):
soup = self.soup("<a>&nbsp;&nbsp;</a>")
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
def test_entities_converted_on_the_way_out(self):
text = "<p>&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;</p>"
expected = u"<p>&lt;&lt;sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</p>".encode("utf-8")
soup = self.soup(text)
self.assertEqual(soup.p.encode("utf-8"), expected)
def test_real_iso_latin_document(self):
# Smoke test of interrelated functionality, using an
# easy-to-understand document.
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
# That's because we're going to encode it into ISO-Latin-1, and use
# that to test.
iso_latin_html = unicode_html.encode("iso-8859-1")
# Parse the ISO-Latin-1 HTML.
soup = self.soup(iso_latin_html)
# Encode it to UTF-8.
result = soup.encode("utf-8")
# What do we expect the result to look like? Well, it would
# look like unicode_html, except that the META tag would say
# UTF-8 instead of ISO-Latin-1.
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
# And, of course, it would be in UTF-8, not Unicode.
expected = expected.encode("utf-8")
# Ta-da!
self.assertEqual(result, expected)
def test_real_shift_jis_document(self):
# Smoke test to make sure the parser can handle a document in
# Shift-JIS encoding, without choking.
shift_jis_html = (
b'<html><head></head><body><pre>'
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
b'</pre></body></html>')
unicode_html = shift_jis_html.decode("shift-jis")
soup = self.soup(unicode_html)
# Make sure the parse tree is correctly encoded to various
# encodings.
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
def test_real_hebrew_document(self):
# A real-world test to make sure we can convert ISO-8859-8 (a
# Hebrew encoding) to UTF-8.
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
soup = self.soup(
hebrew_document, from_encoding="iso8859-8")
self.assertEqual(soup.original_encoding, 'iso8859-8')
self.assertEqual(
soup.encode('utf-8'),
hebrew_document.decode("iso8859-8").encode("utf-8"))
def test_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta content="text/html; charset=x-sjis" '
'http-equiv="Content-type"/>')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
content = parsed_meta['content']
self.assertEqual('text/html; charset=x-sjis', content)
# But that value is actually a ContentMetaAttributeValue object.
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
# For the rest of the story, see TestSubstitutions in
# test_tree.py.
def test_html5_style_meta_tag_reflects_current_encoding(self):
# Here's the <meta> tag saying that a document is
# encoded in Shift-JIS.
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
# Here's a document incorporating that meta tag.
shift_jis_html = (
'<html><head>\n%s\n'
'<meta http-equiv="Content-language" content="ja"/>'
'</head><body>Shift-JIS markup goes here.') % meta_tag
soup = self.soup(shift_jis_html)
# Parse the document, and the charset is seemingly unaffected.
parsed_meta = soup.find('meta', id="encoding")
charset = parsed_meta['charset']
self.assertEqual('x-sjis', charset)
# But that value is actually a CharsetMetaAttributeValue object.
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
# And it will take on a value that reflects its current
# encoding.
self.assertEqual('utf8', charset.encode("utf8"))
def test_tag_with_no_attributes_can_have_attributes_added(self):
data = self.soup("<a>text</a>")
data.a['foo'] = 'bar'
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
class XMLTreeBuilderSmokeTest(object):
def test_docstring_generated(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
def test_real_xhtml_document(self):
"""A real XHTML document should come out *exactly* the same as it went in."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8"), markup)
def test_popping_namespaced_tag(self):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
unicode(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
self.assertEqual(
soup.encode("latin1"),
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
def test_large_xml_document(self):
"""A large XML document should come out the same as it went in."""
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
+ b'0' * (2**12)
+ b'</root>')
soup = self.soup(markup)
self.assertEqual(soup.encode("utf-8"), markup)
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
self.assertSoupEquals("<p>", "<p/>")
self.assertSoupEquals("<p>foo</p>")
def test_namespaces_are_preserved(self):
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
soup = self.soup(markup)
root = soup.root
self.assertEqual("http://example.com/", root['xmlns:a'])
self.assertEqual("http://example.net/", root['xmlns:b'])
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
self.assertEqual(unicode(soup.p), markup)
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
"""Smoke test for a tree builder that supports HTML5."""
def test_real_xhtml_document(self):
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
# XHTML documents in any particular way.
pass
def test_html_tags_have_namespace(self):
markup = "<a>"
soup = self.soup(markup)
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
def test_svg_tags_have_namespace(self):
markup = '<svg><circle/></svg>'
soup = self.soup(markup)
namespace = "http://www.w3.org/2000/svg"
self.assertEqual(namespace, soup.svg.namespace)
self.assertEqual(namespace, soup.circle.namespace)
def test_mathml_tags_have_namespace(self):
markup = '<math><msqrt>5</msqrt></math>'
soup = self.soup(markup)
namespace = 'http://www.w3.org/1998/Math/MathML'
self.assertEqual(namespace, soup.math.namespace)
self.assertEqual(namespace, soup.msqrt.namespace)
def skipIf(condition, reason):
def nothing(test, *args, **kwargs):
return None
def decorator(test_item):
if condition:
return nothing
else:
return test_item
return decorator
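# Usage sketch: apply skipIf at class level to disable an entire suite when
# an optional parser is missing, as the html5lib and lxml test modules do:
#
#   @skipIf(
#       not HTML5LIB_PRESENT,
#       "html5lib seems not to be present, not testing its tree builder.")
#   class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
#       ...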

View File

@ -0,0 +1 @@
"The beautifulsoup tests."

View File

@ -0,0 +1,141 @@
"""Tests of the builder registry."""
import unittest
from bs4 import BeautifulSoup
from bs4.builder import (
builder_registry as registry,
HTMLParserTreeBuilder,
TreeBuilderRegistry,
)
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError:
HTML5LIB_PRESENT = False
try:
from bs4.builder import (
LXMLTreeBuilderForXML,
LXMLTreeBuilder,
)
LXML_PRESENT = True
except ImportError:
LXML_PRESENT = False
class BuiltInRegistryTest(unittest.TestCase):
"""Test the built-in registry with the default builders registered."""
def test_combination(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('fast', 'html'),
LXMLTreeBuilder)
if LXML_PRESENT:
self.assertEqual(registry.lookup('permissive', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('strict', 'html'),
HTMLParserTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib', 'html'),
HTML5TreeBuilder)
def test_lookup_by_markup_type(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
else:
self.assertEqual(registry.lookup('xml'), None)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
else:
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
def test_named_library(self):
if LXML_PRESENT:
self.assertEqual(registry.lookup('lxml', 'xml'),
LXMLTreeBuilderForXML)
self.assertEqual(registry.lookup('lxml', 'html'),
LXMLTreeBuilder)
if HTML5LIB_PRESENT:
self.assertEqual(registry.lookup('html5lib'),
HTML5TreeBuilder)
self.assertEqual(registry.lookup('html.parser'),
HTMLParserTreeBuilder)
def test_beautifulsoup_constructor_does_lookup(self):
# You can pass in a string.
BeautifulSoup("", features="html")
# Or a list of strings.
BeautifulSoup("", features=["html", "fast"])
# You'll get an exception if BS can't find an appropriate
# builder.
self.assertRaises(ValueError, BeautifulSoup,
"", features="no-such-feature")
class RegistryTest(unittest.TestCase):
"""Test the TreeBuilderRegistry class in general."""
def setUp(self):
self.registry = TreeBuilderRegistry()
def builder_for_features(self, *feature_list):
cls = type('Builder_' + '_'.join(feature_list),
(object,), {'features' : feature_list})
self.registry.register(cls)
return cls
def test_register_with_no_features(self):
builder = self.builder_for_features()
# Since the builder advertises no features, you can't find it
# by looking up features.
self.assertEqual(self.registry.lookup('foo'), None)
# But you can find it by doing a lookup with no features, if
# this happens to be the only registered builder.
self.assertEqual(self.registry.lookup(), builder)
def test_register_with_features_makes_lookup_succeed(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('foo'), builder)
self.assertEqual(self.registry.lookup('bar'), builder)
def test_lookup_fails_when_no_builder_implements_feature(self):
builder = self.builder_for_features('foo', 'bar')
self.assertEqual(self.registry.lookup('baz'), None)
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
builder1 = self.builder_for_features('foo')
builder2 = self.builder_for_features('bar')
self.assertEqual(self.registry.lookup(), builder2)
def test_lookup_fails_when_no_tree_builders_registered(self):
self.assertEqual(self.registry.lookup(), None)
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
has_one = self.builder_for_features('foo')
has_the_other = self.builder_for_features('bar')
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
lacks_one = self.builder_for_features('bar')
has_the_other = self.builder_for_features('foo')
# There are two builders featuring 'foo' and 'bar', but
# the one that also features 'quux' was registered later.
self.assertEqual(self.registry.lookup('foo', 'bar'),
has_both_late)
# There is only one builder featuring 'foo', 'bar', and 'baz'.
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
has_both_early)
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
builder1 = self.builder_for_features('foo', 'bar')
builder2 = self.builder_for_features('foo', 'baz')
self.assertEqual(self.registry.lookup('bar', 'baz'), None)

View File

@ -0,0 +1,36 @@
"Test harness for doctests."
# pylint: disable-msg=E0611,W0142
__metaclass__ = type
__all__ = [
'additional_tests',
]
import atexit
import doctest
import os
#from pkg_resources import (
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
import unittest
DOCTEST_FLAGS = (
doctest.ELLIPSIS |
doctest.NORMALIZE_WHITESPACE |
doctest.REPORT_NDIFF)
# def additional_tests():
# "Run the doc tests (README.txt and docs/*, if any exist)"
# doctest_files = [
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
# if resource_exists('bs4', 'docs'):
# for name in resource_listdir('bs4', 'docs'):
# if name.endswith('.txt'):
# doctest_files.append(
# os.path.abspath(
# resource_filename('bs4', 'docs/%s' % name)))
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
# atexit.register(cleanup_resources)
# return unittest.TestSuite((
# doctest.DocFileSuite(*doctest_files, **kwargs)))

View File

@ -0,0 +1,58 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
except ImportError, e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not HTML5LIB_PRESENT,
"html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
"""See ``HTML5TreeBuilderSmokeTest``."""
@property
def default_builder(self):
return HTML5TreeBuilder()
def test_soupstrainer(self):
# The html5lib tree builder does not support SoupStrainers.
strainer = SoupStrainer("b")
markup = "<p>A <b>bold</b> statement.</p>"
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(
soup.decode(), self.document_for(markup))
self.assertTrue(
"the html5lib tree builder doesn't support parse_only" in
str(w[0].message))
def test_correctly_nested_tables(self):
"""html5lib inserts <tbody> tags where other parsers don't."""
markup = ('<table id="1">'
'<tr>'
"<td>Here's another table:"
'<table id="2">'
'<tr><td>foo</td></tr>'
'</table></td>')
self.assertSoupEquals(
markup,
'<table id="1"><tbody><tr><td>Here\'s another table:'
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
'</td></tr></tbody></table>')
self.assertSoupEquals(
"<table><thead><tr><td>Foo</td></tr></thead>"
"<tbody><tr><td>Bar</td></tr></tbody>"
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")

View File

@ -0,0 +1,19 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
@property
def default_builder(self):
return HTMLParserTreeBuilder()
def test_namespaced_system_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass
def test_namespaced_public_doctype(self):
# html.parser can't handle namespaced doctypes, so skip this one.
pass

View File

@ -0,0 +1,75 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
import re
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import Comment, Doctype, SoupStrainer
from bs4.testing import skipIf
from bs4.tests import test_htmlparser
from bs4.testing import (
HTMLTreeBuilderSmokeTest,
XMLTreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its tree builder.")
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilder()
def test_out_of_range_entity(self):
self.assertSoupEquals(
"<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
self.assertSoupEquals(
"<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
def test_beautifulstonesoup_is_xml_parser(self):
# Make sure that the deprecated BSS class uses an xml builder
# if one is installed.
with warnings.catch_warnings(record=False) as w:
soup = BeautifulStoneSoup("<b />")
self.assertEqual(u"<b/>", unicode(soup.b))
def test_real_xhtml_document(self):
"""lxml strips the XML definition from an XHTML doc, which is fine."""
markup = b"""<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
<html xmlns="http://www.w3.org/1999/xhtml">
<head><title>Hello.</title></head>
<body>Goodbye.</body>
</html>"""
soup = self.soup(markup)
self.assertEqual(
soup.encode("utf-8").replace(b"\n", b''),
markup.replace(b'\n', b'').replace(
b'<?xml version="1.0" encoding="utf-8"?>', b''))
@skipIf(
not LXML_PRESENT,
"lxml seems not to be present, not testing its XML tree builder.")
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
"""See ``HTMLTreeBuilderSmokeTest``."""
@property
def default_builder(self):
return LXMLTreeBuilderForXML()

368
lib/bs4/tests/test_soup.py Normal file
View File

@ -0,0 +1,368 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
import unittest
from bs4 import (
BeautifulSoup,
BeautifulStoneSoup,
)
from bs4.element import (
CharsetMetaAttributeValue,
ContentMetaAttributeValue,
SoupStrainer,
NamespacedAttribute,
)
import bs4.dammit
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.testing import (
SoupTest,
skipIf,
)
import warnings
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
except ImportError, e:
LXML_PRESENT = False
class TestDeprecatedConstructorArguments(SoupTest):
def test_parseOnlyThese_renamed_to_parse_only(self):
with warnings.catch_warnings(record=True) as w:
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
msg = str(w[0].message)
self.assertTrue("parseOnlyThese" in msg)
self.assertTrue("parse_only" in msg)
self.assertEqual(b"<b></b>", soup.encode())
def test_fromEncoding_renamed_to_from_encoding(self):
with warnings.catch_warnings(record=True) as w:
utf8 = b"\xc3\xa9"
soup = self.soup(utf8, fromEncoding="utf8")
msg = str(w[0].message)
self.assertTrue("fromEncoding" in msg)
self.assertTrue("from_encoding" in msg)
self.assertEqual("utf8", soup.original_encoding)
def test_unrecognized_keyword_argument(self):
self.assertRaises(
TypeError, self.soup, "<a>", no_such_argument=True)
@skipIf(
not LXML_PRESENT,
"lxml not present, not testing BeautifulStoneSoup.")
def test_beautifulstonesoup(self):
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<markup>")
self.assertTrue(isinstance(soup, BeautifulSoup))
self.assertTrue("BeautifulStoneSoup class is deprecated")
class TestSelectiveParsing(SoupTest):
def test_parse_with_soupstrainer(self):
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
strainer = SoupStrainer("b")
soup = self.soup(markup, parse_only=strainer)
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
class TestEntitySubstitution(unittest.TestCase):
"""Standalone tests of the EntitySubstitution class."""
def setUp(self):
self.sub = EntitySubstitution
def test_simple_html_substitution(self):
# Unicode characters corresponding to named HTML entities
# are substituted, and no others.
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
self.assertEqual(self.sub.substitute_html(s),
u"foo&forall;\N{SNOWMAN}&otilde;bar")
def test_smart_quote_substitution(self):
# MS smart quotes are a common source of frustration, so we
# give them a special test.
quotes = b"\x91\x92foo\x93\x94"
dammit = UnicodeDammit(quotes)
self.assertEqual(self.sub.substitute_html(dammit.markup),
"&lsquo;&rsquo;foo&ldquo;&rdquo;")
def test_xml_conversion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, False), s)
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
self.assertEqual(self.sub.substitute_xml("Welcome", True),
'"Welcome"')
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
'"Bob\'s Bar"')
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
s = 'Welcome to "my bar"'
self.assertEqual(self.sub.substitute_xml(s, True),
"'Welcome to \"my bar\"'")
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
s = 'Welcome to "Bob\'s Bar"'
self.assertEqual(
self.sub.substitute_xml(s, True),
'"Welcome to &quot;Bob\'s Bar&quot;"')
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
quoted = 'Welcome to "Bob\'s Bar"'
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
def test_xml_quoting_handles_angle_brackets(self):
self.assertEqual(
self.sub.substitute_xml("foo<bar>"),
"foo&lt;bar&gt;")
def test_xml_quoting_handles_ampersands(self):
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
self.assertEqual(
self.sub.substitute_xml("&Aacute;T&T"),
"&Aacute;T&amp;T")
def test_quotes_not_html_substituted(self):
"""There's no need to do this except inside attribute values."""
text = 'Bob\'s "bar"'
self.assertEqual(self.sub.substitute_html(text), text)
class TestEncodingConversion(SoupTest):
# Test Beautiful Soup's ability to decode and encode from various
# encodings.
def setUp(self):
super(TestEncodingConversion, self).setUp()
self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
self.utf8_data = self.unicode_data.encode("utf-8")
# Just so you know what it looks like.
self.assertEqual(
self.utf8_data,
b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
def test_ascii_in_unicode_out(self):
# ASCII input is converted to Unicode. The original_encoding
# attribute is set.
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
self.assertTrue(isinstance(unicode_output, unicode))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding, "ascii")
def test_unicode_in_unicode_out(self):
# Unicode input is left alone. The original_encoding attribute
# is not set.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self):
# UTF-8 input is converted to Unicode. The original_encoding
# attribute is set.
soup_from_utf8 = self.soup(self.utf8_data)
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_utf8_out(self):
# The internal data structures can be encoded as UTF-8.
soup_from_unicode = self.soup(self.unicode_data)
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
class TestUnicodeDammit(unittest.TestCase):
"""Standalone tests of Unicode, Dammit."""
def test_smart_quotes_to_unicode(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup)
self.assertEqual(
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def test_smart_quotes_to_xml_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
self.assertEqual(
dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
def test_smart_quotes_to_html_entities(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="html")
self.assertEqual(
dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def test_smart_quotes_to_ascii(self):
markup = b"<foo>\x91\x92\x93\x94</foo>"
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
self.assertEqual(
dammit.unicode_markup, """<foo>''""</foo>""")
def test_detect_utf8(self):
utf8 = b"\xc3\xa9"
dammit = UnicodeDammit(utf8)
self.assertEqual(dammit.unicode_markup, u'\xe9')
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_convert_hebrew(self):
hebrew = b"\xed\xe5\xec\xf9"
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding, 'iso-8859-8')
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self):
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
dammit = UnicodeDammit(utf_8)
self.assertEqual(dammit.original_encoding, 'utf-8')
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_ignore_invalid_codecs(self):
utf8_data = u"Räksmörgås".encode("utf-8")
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
dammit = UnicodeDammit(utf8_data, [bad_encoding])
self.assertEqual(dammit.original_encoding, 'utf-8')
def test_detect_html5_style_meta_tag(self):
for data in (
b'<html><meta charset="euc-jp" /></html>',
b"<html><meta charset='euc-jp' /></html>",
b"<html><meta charset=euc-jp /></html>",
b"<html><meta charset=euc-jp/></html>"):
dammit = UnicodeDammit(data, is_html=True)
self.assertEqual(
"euc-jp", dammit.original_encoding)
def test_last_ditch_entity_replacement(self):
# This is a UTF-8 document that contains bytestrings
# completely incompatible with UTF-8 (ie. encoded with some other
# encoding).
#
# Since there is no consistent encoding for the document,
# Unicode, Dammit will eventually encode the document as UTF-8
# and encode the incompatible characters as REPLACEMENT
# CHARACTER.
#
# If chardet is installed, it will detect that the document
# can be converted into ISO-8859-1 without errors. This happens
# to be the wrong encoding, but it is a consistent encoding, so the
# code we're testing here won't run.
#
# So we temporarily disable chardet if it's present.
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
<html><b>\330\250\330\252\330\261</b>
<i>\310\322\321\220\312\321\355\344</i></html>"""
chardet = bs4.dammit.chardet
try:
bs4.dammit.chardet = None
with warnings.catch_warnings(record=True) as w:
dammit = UnicodeDammit(doc)
self.assertEqual(True, dammit.contains_replacement_characters)
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
soup = BeautifulSoup(doc, "html.parser")
self.assertTrue(soup.contains_replacement_characters)
msg = w[0].message
self.assertTrue(isinstance(msg, UnicodeWarning))
self.assertTrue("Some characters could not be decoded" in str(msg))
finally:
bs4.dammit.chardet = chardet
def test_sniffed_xml_encoding(self):
# A document written in UTF-16LE will be converted by a different
# code path that sniffs the byte order markers.
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
dammit = UnicodeDammit(data)
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
self.assertEqual("utf-16le", dammit.original_encoding)
def test_detwingle(self):
# Here's a UTF8 document.
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
# Here's a Windows-1252 document.
windows_1252 = (
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
# Through some unholy alchemy, they've been stuck together.
doc = utf8 + windows_1252 + utf8
# The document can't be turned into UTF-8:
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
# Unicode, Dammit thinks the whole document is Windows-1252,
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
# But if we run it through UnicodeDammit.detwingle(), it's fixed:
fixed = UnicodeDammit.detwingle(doc)
self.assertEqual(
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
def test_detwingle_ignores_multibyte_characters(self):
# Each of these characters has a UTF-8 representation ending
# in \x93. \x93 is a smart quote if interpreted as
# Windows-1252. But our code knows to skip over multibyte
# UTF-8 characters, so they'll survive the process unscathed.
for tricky_unicode_char in (
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
):
input = tricky_unicode_char.encode("utf8")
self.assertTrue(input.endswith(b'\x93'))
output = UnicodeDammit.detwingle(input)
self.assertEqual(output, input)
class TestNamespacedAttribute(SoupTest):
def test_name_may_be_none(self):
a = NamespacedAttribute("xmlns", None)
self.assertEqual(a, "xmlns")
def test_attribute_is_equivalent_to_colon_separated_string(self):
a = NamespacedAttribute("a", "b")
self.assertEqual("a:b", a)
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
a = NamespacedAttribute("a", "b", "c")
b = NamespacedAttribute("a", "b", "c")
self.assertEqual(a, b)
# The actual namespace is not considered.
c = NamespacedAttribute("a", "b", None)
self.assertEqual(a, c)
# But name and prefix are important.
d = NamespacedAttribute("a", "z", "c")
self.assertNotEqual(a, d)
e = NamespacedAttribute("z", "b", "c")
self.assertNotEqual(a, e)
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
def test_charset_meta_attribute_value(self):
value = CharsetMetaAttributeValue("euc-jp")
self.assertEqual("euc-jp", value)
self.assertEqual("euc-jp", value.original_value)
self.assertEqual("utf8", value.encode("utf8"))
def test_content_meta_attribute_value(self):
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
self.assertEqual("text/html; charset=euc-jp", value)
self.assertEqual("text/html; charset=euc-jp", value.original_value)
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))

1695
lib/bs4/tests/test_tree.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,29 @@
Metadata-Version: 1.0
Name: feedparser
Version: 5.1.1
Summary: Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds
Home-page: http://code.google.com/p/feedparser/
Author: Kurt McKee
Author-email: contactme@kurtmckee.org
License: UNKNOWN
Download-URL: http://code.google.com/p/feedparser/
Description: UNKNOWN
Keywords: atom,cdf,feed,parser,rdf,rss
Platform: POSIX
Platform: Windows
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 2
Classifier: Programming Language :: Python :: 2.4
Classifier: Programming Language :: Python :: 2.5
Classifier: Programming Language :: Python :: 2.6
Classifier: Programming Language :: Python :: 2.7
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.0
Classifier: Programming Language :: Python :: 3.1
Classifier: Programming Language :: Python :: 3.2
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing :: Markup :: XML

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1 @@

View File

@ -0,0 +1 @@
feedparser

4009
lib/feedparser/feedparser.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,806 @@
#!/usr/bin/env python
__author__ = "Mark Pilgrim <http://diveintomark.org/>"
__license__ = """
Copyright (c) 2010-2012 Kurt McKee <contactme@kurtmckee.org>
Copyright (c) 2004-2008 Mark Pilgrim
All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS'
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE."""
import codecs
import datetime
import glob
import operator
import os
import posixpath
import pprint
import re
import struct
import sys
import threading
import time
import unittest
import urllib
import warnings
import zlib
import BaseHTTPServer
import SimpleHTTPServer
import feedparser
if not feedparser._XML_AVAILABLE:
sys.stderr.write('No XML parsers available, unit testing can not proceed\n')
sys.exit(1)
_UTF32_AVAILABLE = feedparser._UTF32_AVAILABLE
_s2bytes = feedparser._s2bytes
_l2bytes = feedparser._l2bytes
#---------- custom HTTP server (used to serve test feeds) ----------
_PORT = 8097 # not really configurable, must match hardcoded port in tests
_HOST = '127.0.0.1' # also not really configurable
class FeedParserTestRequestHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
headers_re = re.compile(_s2bytes(r"^Header:\s+([^:]+):(.+)$"), re.MULTILINE)
def send_head(self):
"""Send custom headers defined in test case
Example:
<!--
Header: Content-type: application/atom+xml
Header: X-Foo: bar
-->
"""
# Short-circuit the HTTP status test `test_redirect_to_304()`
if self.path == '/-/return-304.xml':
self.send_response(304)
self.send_header('Content-type', 'text/xml')
self.end_headers()
return feedparser._StringIO(u''.encode('utf-8'))
path = self.translate_path(self.path)
# the compression tests' filenames determine the header sent
if self.path.startswith('/tests/compression'):
if self.path.endswith('gz'):
headers = {'Content-Encoding': 'gzip'}
else:
headers = {'Content-Encoding': 'deflate'}
else:
headers = dict([(k.decode('utf-8'), v.decode('utf-8').strip()) for k, v in self.headers_re.findall(open(path, 'rb').read())])
f = open(path, 'rb')
if (self.headers.get('if-modified-since') == headers.get('Last-Modified', 'nom')) \
or (self.headers.get('if-none-match') == headers.get('ETag', 'nomatch')):
status = 304
else:
status = 200
headers.setdefault('Status', status)
self.send_response(int(headers['Status']))
headers.setdefault('Content-type', self.guess_type(path))
self.send_header("Content-type", headers['Content-type'])
self.send_header("Content-Length", str(os.stat(f.name)[6]))
for k, v in headers.items():
if k not in ('Status', 'Content-type'):
self.send_header(k, v)
self.end_headers()
return f
def log_request(self, *args):
pass
class FeedParserTestServer(threading.Thread):
"""HTTP Server that runs in a thread and handles a predetermined number of requests"""
def __init__(self, requests):
threading.Thread.__init__(self)
self.requests = requests
self.ready = threading.Event()
def run(self):
self.httpd = BaseHTTPServer.HTTPServer((_HOST, _PORT), FeedParserTestRequestHandler)
self.ready.set()
while self.requests:
self.httpd.handle_request()
self.requests -= 1
self.ready.clear()
#---------- dummy test case class (test methods are added dynamically) ----------
unicode1_re = re.compile(_s2bytes(" u'"))
unicode2_re = re.compile(_s2bytes(' u"'))
# _bytes is only used in everythingIsUnicode().
# In Python 2 it's str, and in Python 3 it's bytes.
_bytes = type(_s2bytes(''))
def everythingIsUnicode(d):
"""Takes a dictionary, recursively verifies that every value is unicode"""
for k, v in d.iteritems():
if isinstance(v, dict) and k != 'headers':
if not everythingIsUnicode(v):
return False
elif isinstance(v, list):
for i in v:
if isinstance(i, dict) and not everythingIsUnicode(i):
return False
elif isinstance(i, _bytes):
return False
elif isinstance(v, _bytes):
return False
return True
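# Quick sketch of the contract (exercised by TestEverythingIsUnicode below):
#   everythingIsUnicode({'a': u'a', 'b': [u'b']})   -> True
#   everythingIsUnicode({'a': _s2bytes('a')})       -> False
# Nested dicts stored under a 'headers' key are deliberately not recursed into.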
def failUnlessEval(self, xmlfile, evalString, msg=None):
"""Fail unless eval(evalString, env)"""
env = feedparser.parse(xmlfile)
try:
if not eval(evalString, globals(), env):
failure=(msg or 'not eval(%s) \nWITH env(%s)' % (evalString, pprint.pformat(env)))
raise self.failureException, failure
if not everythingIsUnicode(env):
raise self.failureException, "not everything is unicode \nWITH env(%s)" % (pprint.pformat(env), )
except SyntaxError:
# Python 3 doesn't have the `u""` syntax, so evalString needs to be modified,
# which will require the failure message to be updated
evalString = re.sub(unicode1_re, _s2bytes(" '"), evalString)
evalString = re.sub(unicode2_re, _s2bytes(' "'), evalString)
if not eval(evalString, globals(), env):
failure=(msg or 'not eval(%s) \nWITH env(%s)' % (evalString, pprint.pformat(env)))
raise self.failureException, failure
class BaseTestCase(unittest.TestCase):
failUnlessEval = failUnlessEval
class TestCase(BaseTestCase):
pass
class TestTemporaryFallbackBehavior(unittest.TestCase):
"These tests are temporarily here because of issues 310 and 328"
def test_issue_328_fallback_behavior(self):
warnings.filterwarnings('error')
d = feedparser.FeedParserDict()
d['published'] = u'pub string'
d['published_parsed'] = u'pub tuple'
d['updated'] = u'upd string'
d['updated_parsed'] = u'upd tuple'
# Ensure that `updated` doesn't map to `published` when it exists
self.assertTrue('published' in d)
self.assertTrue('published_parsed' in d)
self.assertTrue('updated' in d)
self.assertTrue('updated_parsed' in d)
self.assertEqual(d['published'], 'pub string')
self.assertEqual(d['published_parsed'], 'pub tuple')
self.assertEqual(d['updated'], 'upd string')
self.assertEqual(d['updated_parsed'], 'upd tuple')
d = feedparser.FeedParserDict()
d['published'] = u'pub string'
d['published_parsed'] = u'pub tuple'
# Ensure that `updated` doesn't actually exist
self.assertTrue('updated' not in d)
self.assertTrue('updated_parsed' not in d)
# Ensure that accessing `updated` throws a DeprecationWarning
try:
d['updated']
except DeprecationWarning:
# Expected behavior
pass
else:
# Wrong behavior
self.assertEqual(True, False)
try:
d['updated_parsed']
except DeprecationWarning:
# Expected behavior
pass
else:
# Wrong behavior
self.assertEqual(True, False)
# Ensure that `updated` maps to `published`
warnings.filterwarnings('ignore')
self.assertEqual(d['updated'], u'pub string')
self.assertEqual(d['updated_parsed'], u'pub tuple')
warnings.resetwarnings()
class TestEverythingIsUnicode(unittest.TestCase):
"Ensure that `everythingIsUnicode()` is working appropriately"
def test_everything_is_unicode(self):
self.assertTrue(everythingIsUnicode(
{'a': u'a', 'b': [u'b', {'c': u'c'}], 'd': {'e': u'e'}}
))
def test_not_everything_is_unicode(self):
self.assertFalse(everythingIsUnicode({'a': _s2bytes('a')}))
self.assertFalse(everythingIsUnicode({'a': [_s2bytes('a')]}))
self.assertFalse(everythingIsUnicode({'a': {'b': _s2bytes('b')}}))
self.assertFalse(everythingIsUnicode({'a': [{'b': _s2bytes('b')}]}))
class TestLooseParser(BaseTestCase):
"Test the sgmllib-based parser by manipulating feedparser " \
"into believing no XML parsers are installed"
def __init__(self, arg):
unittest.TestCase.__init__(self, arg)
self._xml_available = feedparser._XML_AVAILABLE
def setUp(self):
feedparser._XML_AVAILABLE = 0
def tearDown(self):
feedparser._XML_AVAILABLE = self._xml_available
class TestStrictParser(BaseTestCase):
pass
class TestMicroformats(BaseTestCase):
pass
class TestEncodings(BaseTestCase):
pass
class TestFeedParserDict(unittest.TestCase):
"Ensure that FeedParserDict returns values as expected and won't crash"
def setUp(self):
self.d = feedparser.FeedParserDict()
def _check_key(self, k):
self.assertTrue(k in self.d)
self.assertTrue(hasattr(self.d, k))
self.assertEqual(self.d[k], 1)
self.assertEqual(getattr(self.d, k), 1)
def _check_no_key(self, k):
self.assertTrue(k not in self.d)
self.assertTrue(not hasattr(self.d, k))
def test_empty(self):
keys = (
'a','entries', 'id', 'guid', 'summary', 'subtitle', 'description',
'category', 'enclosures', 'license', 'categories',
)
for k in keys:
self._check_no_key(k)
self.assertTrue('items' not in self.d)
self.assertTrue(hasattr(self.d, 'items')) # dict.items() exists
def test_neutral(self):
self.d['a'] = 1
self._check_key('a')
def test_single_mapping_target_1(self):
self.d['id'] = 1
self._check_key('id')
self._check_key('guid')
def test_single_mapping_target_2(self):
self.d['guid'] = 1
self._check_key('id')
self._check_key('guid')
def test_multiple_mapping_target_1(self):
self.d['summary'] = 1
self._check_key('summary')
self._check_key('description')
def test_multiple_mapping_target_2(self):
self.d['subtitle'] = 1
self._check_key('subtitle')
self._check_key('description')
def test_multiple_mapping_mapped_key(self):
self.d['description'] = 1
self._check_key('summary')
self._check_key('description')
def test_license(self):
self.d['links'] = []
try:
self.d['license']
self.assertTrue(False)
except KeyError:
pass
self.d['links'].append({'rel': 'license'})
try:
self.d['license']
self.assertTrue(False)
except KeyError:
pass
self.d['links'].append({'rel': 'license', 'href': 'http://dom.test/'})
self.assertEqual(self.d['license'], 'http://dom.test/')
def test_category(self):
self.d['tags'] = []
try:
self.d['category']
self.assertTrue(False)
except KeyError:
pass
self.d['tags'] = [{}]
try:
self.d['category']
self.assertTrue(False)
except KeyError:
pass
self.d['tags'] = [{'term': 'cat'}]
self.assertEqual(self.d['category'], 'cat')
self.d['tags'].append({'term': 'dog'})
self.assertEqual(self.d['category'], 'cat')
class TestOpenResource(unittest.TestCase):
"Ensure that `_open_resource()` interprets its arguments as URIs, " \
"file-like objects, or in-memory feeds as expected"
def test_fileobj(self):
r = feedparser._open_resource(sys.stdin, '', '', '', '', [], {})
self.assertTrue(r is sys.stdin)
def test_feed(self):
f = feedparser.parse(u'feed://localhost:8097/tests/http/target.xml')
self.assertEqual(f.href, u'http://localhost:8097/tests/http/target.xml')
def test_feed_http(self):
f = feedparser.parse(u'feed:http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.href, u'http://localhost:8097/tests/http/target.xml')
def test_bytes(self):
s = '<feed><item><title>text</title></item></feed>'.encode('utf-8')
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s, r.read())
def test_string(self):
s = '<feed><item><title>text</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
def test_unicode_1(self):
s = u'<feed><item><title>text</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
def test_unicode_2(self):
s = u'<feed><item><title>t\u00e9xt</title></item></feed>'
r = feedparser._open_resource(s, '', '', '', '', [], {})
self.assertEqual(s.encode('utf-8'), r.read())
class TestMakeSafeAbsoluteURI(unittest.TestCase):
"Exercise the URI joining and sanitization code"
base = u'http://d.test/d/f.ext'
def _mktest(rel, expect, doc):
def fn(self):
value = feedparser._makeSafeAbsoluteURI(self.base, rel)
self.assertEqual(value, expect)
fn.__doc__ = doc
return fn
# make the test cases; the call signature is:
# (relative_url, expected_return_value, test_doc_string)
test_abs = _mktest(u'https://s.test/', u'https://s.test/', 'absolute uri')
test_rel = _mktest(u'/new', u'http://d.test/new', 'relative uri')
test_bad = _mktest(u'x://bad.test/', u'', 'unacceptable uri protocol')
def test_catch_ValueError(self):
'catch ValueError in Python 2.7 and up'
uri = u'http://bad]test/'
value1 = feedparser._makeSafeAbsoluteURI(uri)
value2 = feedparser._makeSafeAbsoluteURI(self.base, uri)
swap = feedparser.ACCEPTABLE_URI_SCHEMES
feedparser.ACCEPTABLE_URI_SCHEMES = ()
value3 = feedparser._makeSafeAbsoluteURI(self.base, uri)
feedparser.ACCEPTABLE_URI_SCHEMES = swap
# Only Python 2.7 and up throw a ValueError, otherwise uri is returned
self.assertTrue(value1 in (uri, u''))
self.assertTrue(value2 in (uri, u''))
self.assertTrue(value3 in (uri, u''))
class TestConvertToIdn(unittest.TestCase):
"Test IDN support (unavailable in Jython as of Jython 2.5.2)"
# this is the greek test domain
hostname = u'\u03c0\u03b1\u03c1\u03ac\u03b4\u03b5\u03b9\u03b3\u03bc\u03b1'
hostname += u'.\u03b4\u03bf\u03ba\u03b9\u03bc\u03ae'
def test_control(self):
r = feedparser._convert_to_idn(u'http://example.test/')
self.assertEqual(r, u'http://example.test/')
def test_idn(self):
r = feedparser._convert_to_idn(u'http://%s/' % (self.hostname,))
self.assertEqual(r, u'http://xn--hxajbheg2az3al.xn--jxalpdlp/')
def test_port(self):
r = feedparser._convert_to_idn(u'http://%s:8080/' % (self.hostname,))
self.assertEqual(r, u'http://xn--hxajbheg2az3al.xn--jxalpdlp:8080/')
class TestCompression(unittest.TestCase):
"Test the gzip and deflate support in the HTTP code"
def test_gzip_good(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip.gz')
self.assertEqual(f.version, 'atom10')
def test_gzip_not_gzipped(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip-not-gzipped.gz')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, IOError))
def test_gzip_struct_error(self):
f = feedparser.parse('http://localhost:8097/tests/compression/gzip-struct-error.gz')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, struct.error))
def test_zlib_good(self):
f = feedparser.parse('http://localhost:8097/tests/compression/deflate.z')
self.assertEqual(f.version, 'atom10')
def test_zlib_bad(self):
f = feedparser.parse('http://localhost:8097/tests/compression/deflate-error.z')
self.assertEqual(f.bozo, 1)
self.assertTrue(isinstance(f.bozo_exception, zlib.error))
class TestHTTPStatus(unittest.TestCase):
"Test HTTP redirection and other status codes"
def test_301(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_301.xml')
self.assertEqual(f.status, 301)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_302(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_302.xml')
self.assertEqual(f.status, 302)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_303(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_303.xml')
self.assertEqual(f.status, 303)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_307(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_307.xml')
self.assertEqual(f.status, 307)
self.assertEqual(f.href, 'http://localhost:8097/tests/http/target.xml')
self.assertEqual(f.entries[0].title, 'target')
def test_304(self):
# first retrieve the url
u = 'http://localhost:8097/tests/http/http_status_304.xml'
f = feedparser.parse(u)
self.assertEqual(f.status, 200)
self.assertEqual(f.entries[0].title, 'title 304')
# extract the etag and last-modified headers
e = [v for k, v in f.headers.items() if k.lower() == 'etag'][0]
mh = [v for k, v in f.headers.items() if k.lower() == 'last-modified'][0]
ms = f.updated
mt = f.updated_parsed
md = datetime.datetime(*mt[0:7])
self.assertTrue(isinstance(mh, basestring))
self.assertTrue(isinstance(ms, basestring))
self.assertTrue(isinstance(mt, time.struct_time))
self.assertTrue(isinstance(md, datetime.datetime))
# test that sending back the etag results in a 304
f = feedparser.parse(u, etag=e)
self.assertEqual(f.status, 304)
# test that sending back last-modified (string) results in a 304
f = feedparser.parse(u, modified=ms)
self.assertEqual(f.status, 304)
# test that sending back last-modified (9-tuple) results in a 304
f = feedparser.parse(u, modified=mt)
self.assertEqual(f.status, 304)
# test that sending back last-modified (datetime) results in a 304
f = feedparser.parse(u, modified=md)
self.assertEqual(f.status, 304)
def test_404(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_404.xml')
self.assertEqual(f.status, 404)
def test_9001(self):
f = feedparser.parse('http://localhost:8097/tests/http/http_status_9001.xml')
self.assertEqual(f.bozo, 1)
def test_redirect_to_304(self):
# ensure that an http redirect to an http 304 doesn't
# trigger a bozo_exception
u = 'http://localhost:8097/tests/http/http_redirect_to_304.xml'
f = feedparser.parse(u)
self.assertEqual(f.bozo, 0)
self.assertEqual(f.status, 302)
class TestDateParsers(unittest.TestCase):
"Test the various date parsers; most of the test cases are constructed " \
"dynamically based on the contents of the `date_tests` dict, below"
def test_None(self):
self.assertTrue(feedparser._parse_date(None) is None)
def _check_date(self, func, dtstring, dttuple):
try:
tup = func(dtstring)
except (OverflowError, ValueError):
tup = None
self.assertEqual(tup, dttuple)
self.assertEqual(tup, feedparser._parse_date(dtstring))
def test_year_10000_date(self):
# On some systems this date string will trigger an OverflowError.
# On Jython and x64 systems, however, it's interpreted just fine.
try:
date = feedparser._parse_date_rfc822(u'Sun, 31 Dec 9999 23:59:59 -9999')
except OverflowError:
date = None
self.assertTrue(date in (None, (10000, 1, 5, 4, 38, 59, 2, 5, 0)))
date_tests = {
feedparser._parse_date_greek: (
(u'', None), # empty string
(u'\u039a\u03c5\u03c1, 11 \u0399\u03bf\u03cd\u03bb 2004 12:00:00 EST', (2004, 7, 11, 17, 0, 0, 6, 193, 0)),
),
feedparser._parse_date_hungarian: (
(u'', None), # empty string
(u'2004-j\u00falius-13T9:15-05:00', (2004, 7, 13, 14, 15, 0, 1, 195, 0)),
),
feedparser._parse_date_iso8601: (
(u'', None), # empty string
(u'-0312', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/month only variant
(u'031231', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # 2-digit year/month/day only, no hyphens
(u'03-12-31', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # 2-digit year/month/day only
(u'-03-12', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/month only
(u'03335', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # 2-digit year/ordinal, no hyphens
(u'2003-12-31T10:14:55.1234Z', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # fractional seconds
# Special case for Google's extra zero in the month
(u'2003-012-31T10:14:55+00:00', (2003, 12, 31, 10, 14, 55, 2, 365, 0)),
),
feedparser._parse_date_nate: (
(u'', None), # empty string
(u'2004-05-25 \uc624\ud6c4 11:23:17', (2004, 5, 25, 14, 23, 17, 1, 146, 0)),
),
feedparser._parse_date_onblog: (
(u'', None), # empty string
(u'2004\ub144 05\uc6d4 28\uc77c 01:31:15', (2004, 5, 27, 16, 31, 15, 3, 148, 0)),
),
feedparser._parse_date_perforce: (
(u'', None), # empty string
(u'Fri, 2006/09/15 08:19:53 EDT', (2006, 9, 15, 12, 19, 53, 4, 258, 0)),
),
feedparser._parse_date_rfc822: (
(u'', None), # empty string
(u'Thu, 01 Jan 0100 00:00:01 +0100', (99, 12, 31, 23, 0, 1, 3, 365, 0)), # ancient date
(u'Thu, 01 Jan 04 19:48:21 GMT', (2004, 1, 1, 19, 48, 21, 3, 1, 0)), # 2-digit year
(u'Thu, 01 Jan 2004 19:48:21 GMT', (2004, 1, 1, 19, 48, 21, 3, 1, 0)), # 4-digit year
(u'Wed, 19 Aug 2009 18:28:00 Etc/GMT', (2009, 8, 19, 18, 28, 0, 2, 231, 0)), # etc/gmt timezone
(u'Wed, 19 Feb 2012 22:40:00 GMT-01:01', (2012, 2, 19, 23, 41, 0, 6, 50, 0)), # gmt+hh:mm timezone
(u'Mon, 13 Feb, 2012 06:28:00 UTC', (2012, 2, 13, 6, 28, 0, 0, 44, 0)), # extraneous comma
(u'Thu, 01 Jan 2004 00:00 GMT', (2004, 1, 1, 0, 0, 0, 3, 1, 0)), # no seconds
(u'Thu, 01 Jan 2004', (2004, 1, 1, 0, 0, 0, 3, 1, 0)), # no time
# Additional tests to handle Disney's long month names and invalid timezones
(u'Mon, 26 January 2004 16:31:00 AT', (2004, 1, 26, 20, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 ET', (2004, 1, 26, 21, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 CT', (2004, 1, 26, 22, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 MT', (2004, 1, 26, 23, 31, 0, 0, 26, 0)),
(u'Mon, 26 January 2004 16:31:00 PT', (2004, 1, 27, 0, 31, 0, 1, 27, 0)),
),
feedparser._parse_date_asctime: (
(u'Sun Jan 4 16:29:06 2004', (2004, 1, 4, 16, 29, 6, 6, 4, 0)),
),
feedparser._parse_date_w3dtf: (
(u'', None), # empty string
(u'2003-12-31T10:14:55Z', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # UTC
(u'2003-12-31T10:14:55-08:00', (2003, 12, 31, 18, 14, 55, 2, 365, 0)), # San Francisco timezone
(u'2003-12-31T18:14:55+08:00', (2003, 12, 31, 10, 14, 55, 2, 365, 0)), # Tokyo timezone
(u'2007-04-23T23:25:47.538+10:00', (2007, 4, 23, 13, 25, 47, 0, 113, 0)), # fractional seconds
(u'2003-12-31', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # year/month/day only
(u'20031231', (2003, 12, 31, 0, 0, 0, 2, 365, 0)), # year/month/day only, no hyphens
(u'2003-12', (2003, 12, 1, 0, 0, 0, 0, 335, 0)), # year/month only
(u'2003', (2003, 1, 1, 0, 0, 0, 2, 1, 0)), # year only
# MSSQL-style dates
(u'2004-07-08 23:56:58 -00:20', (2004, 7, 9, 0, 16, 58, 4, 191, 0)), # with timezone
(u'2004-07-08 23:56:58', (2004, 7, 8, 23, 56, 58, 3, 190, 0)), # without timezone
(u'2004-07-08 23:56:58.0', (2004, 7, 8, 23, 56, 58, 3, 190, 0)), # with fractional second
# Special cases for out-of-range times
(u'2003-12-31T25:14:55Z', (2004, 1, 1, 1, 14, 55, 3, 1, 0)), # invalid (25 hours)
(u'2003-12-31T10:61:55Z', (2003, 12, 31, 11, 1, 55, 2, 365, 0)), # invalid (61 minutes)
(u'2003-12-31T10:14:61Z', (2003, 12, 31, 10, 15, 1, 2, 365, 0)), # invalid (61 seconds)
# Special cases for rollovers in leap years
(u'2004-02-28T18:14:55-08:00', (2004, 2, 29, 2, 14, 55, 6, 60, 0)), # feb 28 in leap year
(u'2003-02-28T18:14:55-08:00', (2003, 3, 1, 2, 14, 55, 5, 60, 0)), # feb 28 in non-leap year
(u'2000-02-28T18:14:55-08:00', (2000, 2, 29, 2, 14, 55, 1, 60, 0)), # feb 28 in leap year on century divisible by 400
)
}
def make_date_test(f, s, t):
return lambda self: self._check_date(f, s, t)
for func, items in date_tests.iteritems():
for i, (dtstring, dttuple) in enumerate(items):
uniqfunc = make_date_test(func, dtstring, dttuple)
setattr(TestDateParsers, 'test_%s_%02i' % (func.__name__, i), uniqfunc)
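# For instance (illustrative): the second tuple in the `_parse_date_rfc822`
# list above becomes a method named `test__parse_date_rfc822_01` on
# TestDateParsers (the double underscore comes from the function's __name__).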
class TestHTMLGuessing(unittest.TestCase):
"Exercise the HTML sniffing code"
def _mktest(text, expect, doc):
def fn(self):
value = bool(feedparser._FeedParserMixin.lookslikehtml(text))
self.assertEqual(value, expect)
fn.__doc__ = doc
return fn
test_text_1 = _mktest(u'plain text', False, u'plain text')
test_text_2 = _mktest(u'2 < 3', False, u'plain text with angle bracket')
test_html_1 = _mktest(u'<a href="">a</a>', True, u'anchor tag')
test_html_2 = _mktest(u'<i>i</i>', True, u'italics tag')
test_html_3 = _mktest(u'<b>b</b>', True, u'bold tag')
test_html_4 = _mktest(u'<code>', False, u'allowed tag, no end tag')
test_html_5 = _mktest(u'<rss> .. </rss>', False, u'disallowed tag')
test_entity_1 = _mktest(u'AT&T', False, u'corporation name')
test_entity_2 = _mktest(u'&copy;', True, u'named entity reference')
test_entity_3 = _mktest(u'&#169;', True, u'numeric entity reference')
test_entity_4 = _mktest(u'&#xA9;', True, u'hex numeric entity reference')
#---------- additional api unit tests, not backed by files ----------
class TestBuildRequest(unittest.TestCase):
"Test that HTTP request objects are created as expected"
def test_extra_headers(self):
"""You can pass in extra headers and they go into the request object."""
request = feedparser._build_urllib2_request(
'http://example.com/feed',
'agent-name',
None, None, None, None,
{'Cache-Control': 'max-age=0'})
# nb, urllib2 folds the case of the headers
self.assertEqual(
request.get_header('Cache-control'), 'max-age=0')
#---------- parse test files and create test methods ----------
def convert_to_utf8(data):
"Identify data's encoding using its byte order mark" \
"and convert it to its utf-8 equivalent"
if data[:4] == _l2bytes([0x4c, 0x6f, 0xa7, 0x94]):
return data.decode('cp037').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x00, 0xfe, 0xff]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32be').encode('utf-8')
elif data[:4] == _l2bytes([0xff, 0xfe, 0x00, 0x00]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32le').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x00, 0x00, 0x3c]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32be').encode('utf-8')
elif data[:4] == _l2bytes([0x3c, 0x00, 0x00, 0x00]):
if not _UTF32_AVAILABLE:
return None
return data.decode('utf-32le').encode('utf-8')
elif data[:4] == _l2bytes([0x00, 0x3c, 0x00, 0x3f]):
return data.decode('utf-16be').encode('utf-8')
elif data[:4] == _l2bytes([0x3c, 0x00, 0x3f, 0x00]):
return data.decode('utf-16le').encode('utf-8')
elif (data[:2] == _l2bytes([0xfe, 0xff])) and (data[2:4] != _l2bytes([0x00, 0x00])):
return data[2:].decode('utf-16be').encode('utf-8')
elif (data[:2] == _l2bytes([0xff, 0xfe])) and (data[2:4] != _l2bytes([0x00, 0x00])):
return data[2:].decode('utf-16le').encode('utf-8')
elif data[:3] == _l2bytes([0xef, 0xbb, 0xbf]):
return data[3:]
# no byte order mark was found
return data
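# A quick sketch of the behavior (illustrative, not part of the suite): a
# UTF-16LE document carrying a byte order mark is transcoded to UTF-8 and
# the BOM is dropped:
#
#     >>> import codecs
#     >>> convert_to_utf8(codecs.BOM_UTF16_LE + u'<rss/>'.encode('utf-16le'))
#     '<rss/>'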
skip_re = re.compile(_s2bytes(r"SkipUnless:\s*(.*?)\n"))
desc_re = re.compile(_s2bytes(r"Description:\s*(.*?)\s*Expect:\s*(.*)\s*-->"))
def getDescription(xmlfile, data):
"""Extract test data
Each test case is an XML file which contains not only a test feed
but also the description of the test and the condition that we
would expect the parser to create when it parses the feed. Example:
<!--
Description: feed title
Expect: feed['title'] == u'Example feed'
-->
"""
skip_results = skip_re.search(data)
if skip_results:
skipUnless = skip_results.group(1).strip()
else:
skipUnless = '1'
search_results = desc_re.search(data)
if not search_results:
raise RuntimeError("can't parse %s" % xmlfile)
description, evalString = map(lambda s: s.strip(), list(search_results.groups()))
description = xmlfile + ": " + unicode(description, 'utf8')
return description, evalString, skipUnless
def buildTestCase(xmlfile, description, evalString):
func = lambda self, xmlfile=xmlfile, evalString=evalString: \
self.failUnlessEval(xmlfile, evalString)
func.__doc__ = description
return func
def runtests():
"Read the files in the tests/ directory, dynamically add tests to the " \
"TestCases above, spawn the HTTP server, and run the test suite"
if sys.argv[1:]:
allfiles = [f for arg in sys.argv[1:] for f in glob.glob(arg) if f.endswith('.xml')]
sys.argv = [sys.argv[0]] #+ sys.argv[2:]
else:
allfiles = glob.glob(os.path.join('.', 'tests', '**', '**', '*.xml'))
wellformedfiles = glob.glob(os.path.join('.', 'tests', 'wellformed', '**', '*.xml'))
illformedfiles = glob.glob(os.path.join('.', 'tests', 'illformed', '*.xml'))
encodingfiles = glob.glob(os.path.join('.', 'tests', 'encoding', '*.xml'))
entitiesfiles = glob.glob(os.path.join('.', 'tests', 'entities', '*.xml'))
microformatfiles = glob.glob(os.path.join('.', 'tests', 'microformats', '**', '*.xml'))
httpd = None
# there are several compression test cases that must be accounted for
# as well as a number of http status tests that redirect to a target
# and a few `_open_resource`-related tests
httpcount = 5 + 17 + 2
httpcount += len([f for f in allfiles if 'http' in f])
httpcount += len([f for f in wellformedfiles if 'http' in f])
httpcount += len([f for f in illformedfiles if 'http' in f])
httpcount += len([f for f in encodingfiles if 'http' in f])
try:
for c, xmlfile in enumerate(allfiles + encodingfiles + illformedfiles + entitiesfiles):
addTo = TestCase
if xmlfile in encodingfiles:
addTo = TestEncodings
elif xmlfile in entitiesfiles:
addTo = (TestStrictParser, TestLooseParser)
elif xmlfile in microformatfiles:
addTo = TestMicroformats
elif xmlfile in wellformedfiles:
addTo = (TestStrictParser, TestLooseParser)
data = open(xmlfile, 'rb').read()
if 'encoding' in xmlfile:
data = convert_to_utf8(data)
if data is None:
# convert_to_utf8 found a byte order mark for utf_32
# but it's not supported in this installation of Python
if 'http' in xmlfile:
httpcount -= 1 + (xmlfile in wellformedfiles)
continue
description, evalString, skipUnless = getDescription(xmlfile, data)
testName = 'test_%06d' % c
ishttp = 'http' in xmlfile
try:
if not eval(skipUnless): raise NotImplementedError
except (ImportError, LookupError, NotImplementedError, AttributeError):
if ishttp:
httpcount -= 1 + (xmlfile in wellformedfiles)
continue
if ishttp:
xmlfile = 'http://%s:%s/%s' % (_HOST, _PORT, posixpath.normpath(xmlfile.replace('\\', '/')))
testFunc = buildTestCase(xmlfile, description, evalString)
if isinstance(addTo, tuple):
setattr(addTo[0], testName, testFunc)
setattr(addTo[1], testName, testFunc)
else:
setattr(addTo, testName, testFunc)
if feedparser.TIDY_MARKUP and feedparser._mxtidy:
sys.stderr.write('\nWarning: feedparser.TIDY_MARKUP invalidates tests, turning it off temporarily\n\n')
feedparser.TIDY_MARKUP = 0
if httpcount:
httpd = FeedParserTestServer(httpcount)
httpd.daemon = True
httpd.start()
httpd.ready.wait()
testsuite = unittest.TestSuite()
testloader = unittest.TestLoader()
testsuite.addTest(testloader.loadTestsFromTestCase(TestCase))
testsuite.addTest(testloader.loadTestsFromTestCase(TestStrictParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestLooseParser))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEncodings))
testsuite.addTest(testloader.loadTestsFromTestCase(TestDateParsers))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTMLGuessing))
testsuite.addTest(testloader.loadTestsFromTestCase(TestHTTPStatus))
testsuite.addTest(testloader.loadTestsFromTestCase(TestCompression))
testsuite.addTest(testloader.loadTestsFromTestCase(TestConvertToIdn))
testsuite.addTest(testloader.loadTestsFromTestCase(TestMicroformats))
testsuite.addTest(testloader.loadTestsFromTestCase(TestOpenResource))
testsuite.addTest(testloader.loadTestsFromTestCase(TestFeedParserDict))
testsuite.addTest(testloader.loadTestsFromTestCase(TestMakeSafeAbsoluteURI))
testsuite.addTest(testloader.loadTestsFromTestCase(TestEverythingIsUnicode))
testsuite.addTest(testloader.loadTestsFromTestCase(TestTemporaryFallbackBehavior))
testresults = unittest.TextTestRunner(verbosity=1).run(testsuite)
# Return 0 if successful, 1 if there was a failure
sys.exit(not testresults.wasSuccessful())
finally:
if httpd:
if httpd.requests:
# Should never get here unless something went horribly wrong, like the
# user hitting Ctrl-C. Tell our HTTP server that it's done, then do
# one more request to flush it. This rarely works; the combination of
# threading, self-terminating HTTP servers, and unittest is really
# quite flaky. Just what you want in a testing framework, no?
httpd.requests = 0
if httpd.ready:
urllib.urlopen('http://127.0.0.1:8097/tests/wellformed/rss/aaa_wellformed.xml').read()
httpd.join(0)
if __name__ == "__main__":
runtests()

lib/feedparser/sgmllib3.py Normal file

@ -0,0 +1,547 @@
"""A parser for SGML, using the derived class as a static DTD."""
# XXX This only supports those SGML features used by HTML.
# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special). RCDATA is
# not supported at all.
import _markupbase
import re
__all__ = ["SGMLParser", "SGMLParseError"]
# Regular expressions used for parsing
interesting = re.compile('[&<]')
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
'<([a-zA-Z][^<>]*|'
'/([a-zA-Z][^<>]*)?|'
'![^<>]*)?')
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#([0-9]+)[^0-9]')
starttagopen = re.compile('<[>a-zA-Z]')
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
piclose = re.compile('>')
endbracket = re.compile('[<>]')
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
attrfind = re.compile(
r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
class SGMLParseError(RuntimeError):
"""Exception raised for all parse errors."""
pass
# SGML parser base class -- find tags and call handler functions.
# Usage: p = SGMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods
# with special names to handle tags: start_foo and end_foo to handle
# <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
# (Tags are converted to lower case for this purpose.) The data
# between tags is passed to the parser by calling self.handle_data()
# with some data as argument (the data may be split up in arbitrary
# chunks). Entity references are passed by calling
# self.handle_entityref() with the entity reference as argument.
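#
# A minimal sketch of a derived class (illustrative only; `BoldMarker` is a
# hypothetical example, not part of this module):
#
#     class BoldMarker(SGMLParser):
#         def start_b(self, attrs):       # called for <b>
#             self.handle_data('*')
#         def end_b(self):                # called for </b>
#             self.handle_data('*')
#         def handle_data(self, data):
#             print(data, end='')
#
#     p = BoldMarker()
#     p.feed('hello <b>world</b>')
#     p.close()                           # prints: hello *world*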
class SGMLParser(_markupbase.ParserBase):
# Definition of entities -- derived classes may override
entity_or_charref = re.compile('&(?:'
'([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
')(;?)')
def __init__(self, verbose=0):
"""Initialize and reset this instance."""
self.verbose = verbose
self.reset()
def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.__starttag_text = None
self.rawdata = ''
self.stack = []
self.lasttag = '???'
self.nomoretags = 0
self.literal = 0
_markupbase.ParserBase.reset(self)
def setnomoretags(self):
"""Enter literal mode (CDATA) till EOF.
Intended for derived classes only.
"""
self.nomoretags = self.literal = 1
def setliteral(self, *args):
"""Enter literal mode (CDATA).
Intended for derived classes only.
"""
self.literal = 1
def feed(self, data):
"""Feed some data to the parser.
Call this as often as you want, with as little or as much text
as you want (may include '\n'). (This just saves the text,
all the processing is done by goahead().)
"""
self.rawdata = self.rawdata + data
self.goahead(0)
def close(self):
"""Handle the remaining data."""
self.goahead(1)
def error(self, message):
raise SGMLParseError(message)
# Internal -- handle data as far as reasonable. May leave state
# and data to be processed by a subsequent call. If 'end' is
# true, force handling all data as if followed by EOF marker.
def goahead(self, end):
rawdata = self.rawdata
i = 0
n = len(rawdata)
while i < n:
if self.nomoretags:
self.handle_data(rawdata[i:n])
i = n
break
match = interesting.search(rawdata, i)
if match: j = match.start()
else: j = n
if i < j:
self.handle_data(rawdata[i:j])
i = j
if i == n: break
if rawdata[i] == '<':
if starttagopen.match(rawdata, i):
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
k = self.parse_starttag(i)
if k < 0: break
i = k
continue
if rawdata.startswith("</", i):
k = self.parse_endtag(i)
if k < 0: break
i = k
self.literal = 0
continue
if self.literal:
if n > (i + 1):
self.handle_data("<")
i = i+1
else:
# incomplete
break
continue
if rawdata.startswith("<!--", i):
# Strictly speaking, a comment is --.*--
# within a declaration tag <!...>.
# This should be removed,
# and comments handled only in parse_declaration.
k = self.parse_comment(i)
if k < 0: break
i = k
continue
if rawdata.startswith("<?", i):
k = self.parse_pi(i)
if k < 0: break
i = i+k
continue
if rawdata.startswith("<!", i):
# This is some sort of declaration; in "HTML as
# deployed," this should only be the document type
# declaration ("<!DOCTYPE html...>").
k = self.parse_declaration(i)
if k < 0: break
i = k
continue
elif rawdata[i] == '&':
if self.literal:
self.handle_data(rawdata[i])
i = i+1
continue
match = charref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_charref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
match = entityref.match(rawdata, i)
if match:
name = match.group(1)
self.handle_entityref(name)
i = match.end(0)
if rawdata[i-1] != ';': i = i-1
continue
else:
self.error('neither < nor & ??')
# We get here only if incomplete matches but
# nothing else
match = incomplete.match(rawdata, i)
if not match:
self.handle_data(rawdata[i])
i = i+1
continue
j = match.end(0)
if j == n:
break # Really incomplete
self.handle_data(rawdata[i:j])
i = j
# end while
if end and i < n:
self.handle_data(rawdata[i:n])
i = n
self.rawdata = rawdata[i:]
# XXX if end: check for empty stack
# Extensions for the DOCTYPE scanner:
_decl_otherchars = '='
# Internal -- parse processing instr, return length or -1 if not terminated
def parse_pi(self, i):
rawdata = self.rawdata
if rawdata[i:i+2] != '<?':
self.error('unexpected call to parse_pi()')
match = piclose.search(rawdata, i+2)
if not match:
return -1
j = match.start(0)
self.handle_pi(rawdata[i+2: j])
j = match.end(0)
return j-i
def get_starttag_text(self):
return self.__starttag_text
# Internal -- handle starttag, return length or -1 if not terminated
def parse_starttag(self, i):
self.__starttag_text = None
start_pos = i
rawdata = self.rawdata
if shorttagopen.match(rawdata, i):
# SGML shorthand: <tag/data/ == <tag>data</tag>
# XXX Can data contain &... (entity or char refs)?
# XXX Can data contain < or > (tag characters)?
# XXX Can there be whitespace before the first /?
match = shorttag.match(rawdata, i)
if not match:
return -1
tag, data = match.group(1, 2)
self.__starttag_text = '<%s/' % tag
tag = tag.lower()
k = match.end(0)
self.finish_shorttag(tag, data)
self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
return k
# XXX The following should skip matching quotes (' or ")
# As a shortcut way to exit, this isn't so bad, but shouldn't
# be used to locate the actual end of the start tag since the
# < or > characters may be embedded in an attribute value.
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
# Now parse the data between i+1 and j into a tag and attrs
attrs = []
if rawdata[i:i+2] == '<>':
# SGML shorthand: <> == <last open tag seen>
k = j
tag = self.lasttag
else:
match = tagfind.match(rawdata, i+1)
if not match:
self.error('unexpected call to parse_starttag')
k = match.end(0)
tag = rawdata[i+1:k].lower()
self.lasttag = tag
while k < j:
match = attrfind.match(rawdata, k)
if not match: break
attrname, rest, attrvalue = match.group(1, 2, 3)
if not rest:
attrvalue = attrname
else:
if (attrvalue[:1] == "'" == attrvalue[-1:] or
attrvalue[:1] == '"' == attrvalue[-1:]):
# strip quotes
attrvalue = attrvalue[1:-1]
attrvalue = self.entity_or_charref.sub(
self._convert_ref, attrvalue)
attrs.append((attrname.lower(), attrvalue))
k = match.end(0)
if rawdata[j] == '>':
j = j+1
self.__starttag_text = rawdata[start_pos:j]
self.finish_starttag(tag, attrs)
return j
# Internal -- convert entity or character reference
def _convert_ref(self, match):
if match.group(2):
return self.convert_charref(match.group(2)) or \
'&#%s%s' % match.groups()[1:]
elif match.group(3):
return self.convert_entityref(match.group(1)) or \
'&%s;' % match.group(1)
else:
return '&%s' % match.group(1)
# Internal -- parse endtag
def parse_endtag(self, i):
rawdata = self.rawdata
match = endbracket.search(rawdata, i+1)
if not match:
return -1
j = match.start(0)
tag = rawdata[i+2:j].strip().lower()
if rawdata[j] == '>':
j = j+1
self.finish_endtag(tag)
return j
# Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
def finish_shorttag(self, tag, data):
self.finish_starttag(tag, [])
self.handle_data(data)
self.finish_endtag(tag)
# Internal -- finish processing of start tag
# Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
def finish_starttag(self, tag, attrs):
try:
method = getattr(self, 'start_' + tag)
except AttributeError:
try:
method = getattr(self, 'do_' + tag)
except AttributeError:
self.unknown_starttag(tag, attrs)
return -1
else:
self.handle_starttag(tag, method, attrs)
return 0
else:
self.stack.append(tag)
self.handle_starttag(tag, method, attrs)
return 1
# Internal -- finish processing of end tag
def finish_endtag(self, tag):
if not tag:
found = len(self.stack) - 1
if found < 0:
self.unknown_endtag(tag)
return
else:
if tag not in self.stack:
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
self.unknown_endtag(tag)
else:
self.report_unbalanced(tag)
return
found = len(self.stack)
for i in range(found):
if self.stack[i] == tag: found = i
while len(self.stack) > found:
tag = self.stack[-1]
try:
method = getattr(self, 'end_' + tag)
except AttributeError:
method = None
if method:
self.handle_endtag(tag, method)
else:
self.unknown_endtag(tag)
del self.stack[-1]
# Overridable -- handle start tag
def handle_starttag(self, tag, method, attrs):
method(attrs)
# Overridable -- handle end tag
def handle_endtag(self, tag, method):
method()
# Example -- report an unbalanced </...> tag.
def report_unbalanced(self, tag):
if self.verbose:
print('*** Unbalanced </' + tag + '>')
print('*** Stack:', self.stack)
def convert_charref(self, name):
"""Convert character reference, may be overridden."""
try:
n = int(name)
except ValueError:
return
if not 0 <= n <= 127:
return
return self.convert_codepoint(n)
def convert_codepoint(self, codepoint):
return chr(codepoint)
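# e.g. handle_charref('65') resolves &#65; to 'A': convert_charref checks
# that 65 is in the ASCII range and convert_codepoint returns chr(65).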
def handle_charref(self, name):
"""Handle character reference, no need to override."""
replacement = self.convert_charref(name)
if replacement is None:
self.unknown_charref(name)
else:
self.handle_data(replacement)
# Definition of entities -- derived classes may override
entitydefs = \
{'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
def convert_entityref(self, name):
"""Convert entity references.
As an alternative to overriding this method, one can tailor the
results by setting up the self.entitydefs mapping appropriately.
"""
table = self.entitydefs
if name in table:
return table[name]
else:
return
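# As a sketch (illustrative only), a subclass could extend the table
# instead of overriding convert_entityref:
#
#     class MyParser(SGMLParser):
#         entitydefs = dict(SGMLParser.entitydefs, nbsp='\xa0')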
def handle_entityref(self, name):
"""Handle entity references, no need to override."""
replacement = self.convert_entityref(name)
if replacement is None:
self.unknown_entityref(name)
else:
self.handle_data(replacement)
# Example -- handle data, should be overridden
def handle_data(self, data):
pass
# Example -- handle comment, could be overridden
def handle_comment(self, data):
pass
# Example -- handle declaration, could be overridden
def handle_decl(self, decl):
pass
# Example -- handle processing instruction, could be overridden
def handle_pi(self, data):
pass
# To be overridden -- handlers for unknown objects
def unknown_starttag(self, tag, attrs): pass
def unknown_endtag(self, tag): pass
def unknown_charref(self, ref): pass
def unknown_entityref(self, ref): pass
class TestSGMLParser(SGMLParser):
def __init__(self, verbose=0):
self.testdata = ""
SGMLParser.__init__(self, verbose)
def handle_data(self, data):
self.testdata = self.testdata + data
if len(repr(self.testdata)) >= 70:
self.flush()
def flush(self):
data = self.testdata
if data:
self.testdata = ""
print('data:', repr(data))
def handle_comment(self, data):
self.flush()
r = repr(data)
if len(r) > 68:
r = r[:32] + '...' + r[-32:]
print('comment:', r)
def unknown_starttag(self, tag, attrs):
self.flush()
if not attrs:
print('start tag: <' + tag + '>')
else:
print('start tag: <' + tag, end=' ')
for name, value in attrs:
print(name + '=' + '"' + value + '"', end=' ')
print('>')
def unknown_endtag(self, tag):
self.flush()
print('end tag: </' + tag + '>')
def unknown_entityref(self, ref):
self.flush()
print('*** unknown entity ref: &' + ref + ';')
def unknown_charref(self, ref):
self.flush()
print('*** unknown char ref: &#' + ref + ';')
def unknown_decl(self, data):
self.flush()
print('*** unknown decl: [' + data + ']')
def close(self):
SGMLParser.close(self)
self.flush()
def test(args = None):
import sys
if args is None:
args = sys.argv[1:]
if args and args[0] == '-s':
args = args[1:]
klass = SGMLParser
else:
klass = TestSGMLParser
if args:
file = args[0]
else:
file = 'test.html'
if file == '-':
f = sys.stdin
else:
try:
f = open(file, 'r')
except IOError as msg:
print(file, ":", msg)
sys.exit(1)
data = f.read()
if f is not sys.stdin:
f.close()
x = klass()
for c in data:
x.feed(c)
x.close()
if __name__ == '__main__':
test()

@ -0,0 +1 @@
error

Binary file not shown.

@ -0,0 +1 @@
error

Binary file not shown.

Binary file not shown.

@ -0,0 +1 @@
<feed xmlns="http://www.w3.org/2005/Atom"></feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="big5"?>
<!--
SkipUnless: __import__('codecs').lookup('big5')
Description: big5
Expect: not bozo and encoding == 'big5'
-->
<rss>
</rss>

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="bogus"?>
<!--
Description: bogus encoding
Expect: bozo
-->
<rss>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: utf-8 interpreted as iso-8859-1 and re-encoded as utf-8
Expect: bozo and ord(entries[0]['description']) == 8230
-->
<rss version="2.0">
<channel>
<item>
<description>&acirc;&#128;&brvbar;</description>
</item>
</channel>
</rss>

@ -0,0 +1,10 @@
<!--
SkipUnless: __import__('sys').version.split()[0] >= '2.2.0'
Description: crashes
Expect: 1
-->
<rss>
<item>
<description><![CDATA[<a href="http://www.example.com/">¤</a><a href="&#38;"></a>]]></description>
</item>
</rss>

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Note: text/xml defaults to us-ascii, in conflict with the XML declaration of utf-8
Header: Content-type: text/xml
Description: Content-type with no charset (text/xml defaults to us-ascii)
Expect: bozo and isinstance(bozo_exception, feedparser.CharacterEncodingOverride)
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>Iñtërnâtiônàlizætiøn</title>
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/plain
Description: text/plain + no encoding
Expect: bozo
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/plain; charset=utf-8
Description: text/plain + charset
Expect: bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,10 @@
<!--
Description: Ensure when there are invalid bytes in encoding specified by BOM, feedparser doesn't crash
Expect: bozo and not encoding
-->
<rss version="2.0">
<channel>
<title>Valid UTF8: ѨInvalid UTF8: España</title>
<description><pre class="screen"></pre></description>
</channel>
</rss

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Description: unguessable characters
Expect: bozo and entries[0].summary == u'\xe2\u20ac\u2122\xe2\u20ac\x9d\u0160'
-->
<rss version="2.0">
<channel>
<item>
<description><![CDATA[ ’<>â€<C3A2>© ]]></description>
</item>
</channel>
</rss>

Binary file not shown.

Binary file not shown.

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don\u2019t'
-->
<rss version="2.0">
<channel>
<item>
<description>don’t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don\u2019t'
-->
<rss version="2.0">
<channel>
<item>
<description>don&#146;t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: using win-1252 character points instead of unicode
Expect: not bozo and entries[0]['description'] == u'don&#x2019;t'
-->
<rss version="2.0">
<channel>
<item>
<description>don&amp;#146;t</description>
</item>
</channel>
</rss>

@ -0,0 +1,13 @@
<?xml version="1.0"?>
<!--
Description: utf-8 interpreted as iso-8859-1 and re-encoded as utf-8
Expect: not bozo and ord(entries[0]['description']) == 8230
-->
<rss version="2.0">
<channel>
<item>
<description>&#226;&#128;&#166;</description>
</item>
</channel>
</rss>

@ -0,0 +1,9 @@
<!--
Description: crashes
Expect: 1
-->
<rss>
<item>
<description><![CDATA[<img alt="&#169;" />]]></description>
</item>
</rss>

@ -0,0 +1,9 @@
<!--
Description: crashes
Expect: 1
-->
<rss>
<item>
<description>&lt;a href=&quot;http://example.com&quot;&gt;&lt;img src=&quot;http://example.com/logo.gif&quot; alt=&quot;The image &amp;acirc;&amp;#128;&amp;#156;http://example.com/logo.gif&amp;acirc;&amp;#128;&amp;#65533; cannot be displayed, because it contains errors.&quot;&gt;&lt;/a&gt;&lt;br&gt;</description>
</item>
</rss>

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr character in attribute of embedded HTML
Expect: not bozo and entries[0]['description'] == u'<img alt="\ub144" />'
-->
<rss version="2.0">
<channel>
<item>
<description>&lt;img alt="³â" /></description>
</item>
</channel>
</rss>

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr encoding in item description
Expect: not bozo and entries[0]['description'] == u'\ub144'
-->
<rss version="2.0">
<channel>
<item>
<description>³â</description>
</item>
</channel>
</rss>

@ -0,0 +1,12 @@
<?xml version="1.0" encoding="euc-kr"?>
<!--
SkipUnless: __import__('codecs').lookup('euc-kr')
Description: euc-kr encoding
Expect: not bozo and feed['title'] == u'\ub144'
-->
<rss version="2.0">
<channel>
<title>³â</title>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/atom+xml;charset='us-ascii'
Description: application/atom+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/atom+xml; charset='us-ascii'
Description: application/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/atom+xml
Description: application/atom+xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/atom+xml
Description: application/atom+xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml;charset='gb2312'
Description: application/atom+xml + explicit charset
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml; charset='gb2312'
Description: application/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="gb2312"?>
<!--
SkipUnless: __import__('codecs').lookup('gb2312')
Header: Content-type: application/atom+xml
Description: application/atom+xml + explicit encoding
Expect: not bozo and encoding == 'gb18030'
-->
<feed xmlns="http://www.w3.org/2005/Atom">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/rss+xml;charset= 'us-ascii'
Description: application/rss+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/rss+xml;charset= "us-ascii"
Description: application/rss+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/rss+xml
Description: application/rss+xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/rss+xml
Description: application/rss+xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml;charset= "us-ascii"
Description: application/xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml;charset = us-ascii
Description: application/xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml
Description: application/xml + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-dtd; charset="us-ascii"
Description: application/xml-dtd + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-dtd; charset="us-ascii"
Description: application/xml-dtd + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-dtd
Description: application/xml-dtd + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-dtd
Description: application/xml-dtd + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml
Description: application/xml + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-external-parsed-entity; charset="us-ascii"
Description: application/xml-external-parsed-entity + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-external-parsed-entity;charset=us-ascii
Description: application/xml-external-parsed-entity + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: application/xml-external-parsed-entity
Description: application/xml-external-parsed-entity + no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: application/xml-external-parsed-entity
Description: application/xml-parsed-entity + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,13 @@
<?xml version='1.0' encoding='UTF-8'?>
<!--
Header: Content-type: application/atom+xml
Description: crashes while resolving relative URIs when content contains attributes which contain (valid) non-ASCII characters
Expect: not bozo
-->
<feed xmlns='http://www.w3.org/2005/Atom'>
<entry>
<content type='xhtml'><div xmlns='http://www.w3.org/1999/xhtml'>
<img alt="Browser market shares at ongoing" />
</div></content>
</entry>
</feed>

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Header: Content-type: application/xml
Description: application/xml with no charset (control for tests/illformed/encoding/http_i18n.xml)
Expect: not bozo
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
<title>Iñtërnâtiônàlizætiøn</title>
<link rel='alternate' type='text/html' href='http://example.com/'/>
<modified>2004-06-02T19:07:55-04:00</modified>
<tagline>If your parser thinks this is well-formed, it's right.</tagline>
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/atom+xml;charset='us-ascii'
Description: text/atom+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/atom+xml; charset='us-ascii'
Description: text/atom+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/atom+xml
Description: text/atom+xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/atom+xml
Description: text/atom+xml + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/rss+xml;charset= 'us-ascii'
Description: text/rss+xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/rss+xml;charset= "us-ascii"
Description: text/rss+xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/rss+xml
Description: text/rss+xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/rss+xml
Description: text/rss+xml + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml;
Description: text/xml + bogus charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml; charset:iso-8859-1
Description: text/xml + bogus parameter
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml;charset= "us-ascii"
Description: text/xml + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,16 @@
<!--
SkipUnless: __import__('codecs').lookup('windows-1252')
Header: Content-type: text/xml; charset=windows-1252
Description: text/xml + explicit charset (this one is harder than the others)
Expect: not bozo and entries[0]['description'] == u'This is a \xa3\u201ctest.\u201d'
-->
<rss version="2.0">
<channel>
<item>
<title>Foo</title>
<link>http://purl.org/rss/2.0/?item</link>
<description>This is a £“test.”</description>
</item>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml;charset = us-ascii
Description: text/xml + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,17 @@
<?xml version='1.0' encoding='iso-8859-1'?>
<!--
SkipUnless: __import__('codecs').lookup('windows-1252')
Header: Content-type: text/xml; charset=windows-1252
Description: text/xml + charset overrides encoding (this one is harder than the others)
Expect: not bozo and entries[0]['description'] == u'This is a \xa3\u201ctest.\u201d'
-->
<rss version="2.0">
<channel>
<item>
<title>Foo</title>
<link>http://purl.org/rss/2.0/?item</link>
<description>This is a £“test.”</description>
</item>
</channel>
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml
Description: text/xml + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<rss version="2.0">
</rss>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml-external-parsed-entity; charset="us-ascii"
Description: text/xml-external-parsed-entity + explicit charset
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml-external-parsed-entity;charset=us-ascii
Description: text/xml-external-parsed-entity + charset overrides encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml-external-parsed-entity
Description: text/xml-external-parsed-entity + no encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Header: Content-type: text/xml-external-parsed-entity
Description: text/xml-external-parsed-entity + explicit encoding
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,8 @@
<?xml version="1.0"?>
<!--
Header: Content-type: text/xml; qs=0.9
Description: text/xml + qs value
Expect: not bozo and encoding == 'us-ascii'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

Binary file not shown.

Binary file not shown.

@ -0,0 +1,7 @@
<?xml version="1.0"?>
<!--
Description: no content-type and no encoding
Expect: not bozo and encoding == 'utf-8'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

@ -0,0 +1,7 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!--
Description: no content-type + explicit encoding
Expect: not bozo and encoding == 'iso-8859-1'
-->
<feed version="0.3" xmlns="http://purl.org/atom/ns#">
</feed>

Binary file not shown.

Binary file not shown.
