"""
class SoupTest(object):
    """Helpers for building Beautiful Soup trees and asserting on them."""

    @property
    def default_builder(self):
        """Return the module-level ``default_builder`` these helpers use."""
        return default_builder

    def soup(self, markup, **kwargs):
        """Build a Beautiful Soup object from markup.

        :param markup: The markup to parse.
        :param kwargs: Passed through to the ``BeautifulSoup`` constructor.
            A ``builder`` entry, if given, replaces ``self.default_builder``.
        :return: The ``BeautifulSoup`` object.
        """
        builder = kwargs.pop('builder', self.default_builder)
        return BeautifulSoup(markup, builder=builder, **kwargs)

    def document_for(self, markup, **kwargs):
        """Turn an HTML fragment into a document.

        The details depend on the builder: the work is delegated to the
        ``test_fragment_to_document`` method of a builder created from
        ``self.default_builder(**kwargs)``.
        """
        return self.default_builder(**kwargs).test_fragment_to_document(markup)

    def assert_soup(self, to_parse, compare_parsed_to=None):
        """Parse some markup using Beautiful Soup and verify that
        the output markup is as expected.

        :param to_parse: The markup to parse.
        :param compare_parsed_to: The fragment the output is compared
            against (after ``document_for``); defaults to ``to_parse``.
        """
        builder = self.default_builder
        obj = BeautifulSoup(to_parse, builder=builder)
        if compare_parsed_to is None:
            compare_parsed_to = to_parse

        # Verify that the documents come out the same.
        assert obj.decode() == self.document_for(compare_parsed_to)

        # Also run some checks on the BeautifulSoup object itself:

        # Verify that every tag that was opened was eventually closed.
        # There are no tags in the open tag counter.
        assert all(v == 0 for v in obj.open_tag_counter.values())

        # The only tag in the tag stack is the one for the root
        # document.
        assert [obj.ROOT_TAG_NAME] == [x.name for x in obj.tagStack]

    # Alias kept so callers using the camelCase name keep working.
    assertSoupEquals = assert_soup

    def assertConnectedness(self, element):
        """Ensure that next_element and previous_element are properly
        set for all descendants of the given element.
        """
        earlier = None
        for e in element.descendants:
            # Compare against None explicitly: a falsy node (for example
            # an empty string) must not cause the check to be skipped.
            if earlier is not None:
                assert e == earlier.next_element
                assert earlier == e.previous_element
            earlier = e

    def linkage_validator(self, el, _recursive_call=False):
        """Ensure proper linkage throughout the document.

        :param el: The element whose subtree is validated.
        :param _recursive_call: True when called from within this method
            on a child element.
        :return: ``None`` for a top-level call. For a recursive call, the
            last node visited inside ``el`` (or ``el`` itself when it has
            no contents), so the caller can link it to what follows.
        """
        descendant = None
        # Document element should have no previous element or previous sibling.
        # It also shouldn't have a next sibling.
        if el.parent is None:
            assert el.previous_element is None,\
                "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_element, None
                )
            assert el.previous_sibling is None,\
                "Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                    el, el.previous_sibling, None
                )
            assert el.next_sibling is None,\
                "Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                    el, el.next_sibling, None
                )

        child = None
        last_child = None
        last_idx = len(el.contents) - 1
        for idx, child in enumerate(el.contents):
            descendant = None

            # Parent should link next element to their first child
            # That child should have no previous sibling
            if idx == 0:
                if el.parent is not None:
                    assert el.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
                            el, el.next_element, child
                        )
                    assert child.previous_element is el,\
                        "Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
                            child, child.previous_element, el
                        )
                    assert child.previous_sibling is None,\
                        "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
                            child, child.previous_sibling, None
                        )

            # If not the first child, previous index should link as sibling to this index
            # Previous element should match the last index or the last bubbled up descendant
            else:
                assert child.previous_sibling is el.contents[idx - 1],\
                    "Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
                        child, child.previous_sibling, el.contents[idx - 1]
                    )
                assert el.contents[idx - 1].next_sibling is child,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
                    )

                if last_child is not None:
                    assert child.previous_element is last_child,\
                        "Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
                            child, child.previous_element, last_child, child.parent.contents
                        )
                    assert last_child.next_element is child,\
                        "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                            last_child, last_child.next_element, child
                        )

            if isinstance(child, Tag) and child.contents:
                descendant = self.linkage_validator(child, True)
                # A bubbled up descendant should have no next siblings
                assert descendant.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        descendant, descendant.next_sibling, None
                    )

            # Mark last child as either the bubbled up descendant or the current child
            if descendant is not None:
                last_child = descendant
            else:
                last_child = child

            # If last child, there are no next siblings
            if idx == last_idx:
                assert child.next_sibling is None,\
                    "Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_sibling, None
                    )

        # The last node visited is the bubbled-up descendant if there was
        # one, otherwise the last direct child, otherwise el itself.
        child = descendant if descendant is not None else child
        if child is None:
            child = el

        if _recursive_call:
            # Return the child to the recursive caller
            return child

        # Top-level call: the last node's next_element must be the next
        # sibling of the nearest ancestor that has one, or None when the
        # walk runs off the top of the tree.
        target = el
        while True:
            if target is None:
                assert child.next_element is None, \
                    "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_element, None
                    )
                break
            elif target.next_sibling is not None:
                assert child.next_element is target.next_sibling, \
                    "Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
                        child, child.next_element, target.next_sibling
                    )
                break
            target = target.parent

        # We are done, so nothing to return
        return None

    def assert_selects(self, tags, should_match):
        """Make sure that the given tags have the correct text.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        assert [tag.string for tag in tags] == should_match

    def assert_selects_ids(self, tags, should_match):
        """Make sure that the given tags have the correct IDs.

        This is used in tests that define a bunch of tags, each
        containing a single string, and then select certain strings by
        some mechanism.
        """
        assert [tag['id'] for tag in tags] == should_match
class TreeBuilderSmokeTest(object):
# Tests that are common to HTML and XML tree builders.
@pytest.mark.parametrize(
    "multi_valued_attributes",
    [None, dict(b=['class']), {'*': ['notclass']}]
)
def test_attribute_not_multi_valued(self, multi_valued_attributes):
    """With these settings the 'class' attribute of <a> stays one string."""
    # NOTE(review): the markup literal was empty in the reviewed source,
    # which cannot satisfy the assertion below. Restored to a document
    # with an <a class="a b c"> tag -- confirm against upstream.
    markup = '<html xmlns="http://www.w3.org/1999/xhtml"><a class="a b c"></html>'
    soup = self.soup(markup, multi_valued_attributes=multi_valued_attributes)
    assert soup.a['class'] == 'a b c'
@pytest.mark.parametrize(
    "multi_valued_attributes", [dict(a=['class']), {'*': ['class']}]
)
def test_attribute_multi_valued(self, multi_valued_attributes):
    """With these settings the 'class' attribute of <a> becomes a list."""
    # NOTE(review): the markup literal was empty in the reviewed source,
    # which cannot satisfy the assertion below. Restored to an
    # <a class="a b c"> tag -- confirm against upstream.
    markup = '<a class="a b c">'
    soup = self.soup(
        markup, multi_valued_attributes=multi_valued_attributes
    )
    assert soup.a['class'] == ['a', 'b', 'c']
def test_fuzzed_input(self):
# This test centralizes in one place the various fuzz tests
# for Beautiful Soup created by the oss-fuzz project.
# These strings superficially resemble markup, but they
# generally can't be parsed into anything. The best we can
# hope for is that parsing these strings won't crash the
# parser.
#
# n.b. This markup is commented out because these fuzz tests
# _do_ crash the parser. However the crashes are due to bugs
# in html.parser, not Beautiful Soup -- otherwise I'd fix the
# bugs!
bad_markup = [
# https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=28873
# https://github.com/guidovranken/python-library-fuzzers/blob/master/corp-html/519e5b4269a01185a0d5e76295251921da2f0700
# https://bugs.python.org/issue37747
#
#b'\nSome CSS"
)
assert isinstance(soup.style.string, Stylesheet)
assert isinstance(soup.script.string, Script)
soup = self.soup(
""
)
assert isinstance(soup.style.string, Stylesheet)
# The contents of the style tag resemble an HTML comment, but
# it's not treated as a comment.
assert soup.style.string == ""
assert isinstance(soup.style.string, Stylesheet)
def test_pickle_and_unpickle_identity(self):
    """A tree that is pickled and unpickled matches the original tree."""
    original = self.soup("foo")
    # Round-trip through pickle protocol 2.
    restored = pickle.loads(pickle.dumps(original, 2))
    assert restored.__class__ == BeautifulSoup
    assert restored.decode() == original.decode()
def assertDoctypeHandled(self, doctype_fragment):
    """Check that a document with the given doctype fragment parses well.

    :param doctype_fragment: The text that follows the doctype keyword.
    """
    expected_prefix, parsed = self._document_with_doctype(doctype_fragment)

    # The first node in the tree must be a Doctype equal to the fragment.
    first_node = parsed.contents[0]
    assert first_node.__class__ == Doctype
    assert first_node == doctype_fragment

    # The encoded document must begin with the doctype declaration.
    prefix_length = len(expected_prefix)
    assert parsed.encode("utf8")[:prefix_length] == expected_prefix

    # The doctype must not have kept the rest of the document from
    # being parsed and attached to the tree.
    assert parsed.p.contents[0] == 'foo'
def _document_with_doctype(self, doctype_fragment, doctype_string="DOCTYPE"):
"""Generate and parse a document with the given doctype."""
doctype = '' % (doctype_string, doctype_fragment)
markup = doctype + '\n
foo
'
soup = self.soup(markup)
return doctype.encode("utf8"), soup
def test_normal_doctypes(self):
"""Make sure normal, everyday HTML doctypes are handled correctly."""
self.assertDoctypeHandled("html")
self.assertDoctypeHandled(
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
def test_empty_doctype(self):
soup = self.soup("")
doctype = soup.contents[0]
assert "" == doctype.strip()
def test_mixed_case_doctype(self):
    """A lowercase or mixed-case doctype keyword still becomes a Doctype."""
    for doctype_fragment in ("doctype", "DocType"):
        doctype_str, soup = self._document_with_doctype(
            "html", doctype_fragment
        )

        # Make sure a Doctype object was created and that the DOCTYPE
        # is uppercase.
        doctype = soup.contents[0]
        assert doctype.__class__ == Doctype
        assert doctype == "html"
        # NOTE(review): the expected bytes were empty in the reviewed
        # source, contradicting the comment above. The keyword and its
        # uppercase form have the same length, so slicing by
        # len(doctype_str) selects the whole declaration.
        assert soup.encode("utf8")[:len(doctype_str)] == b"<!DOCTYPE html>"

        # Make sure that the doctype was correctly associated with the
        # parse tree and that the rest of the document parsed.
        assert soup.p.contents[0] == 'foo'
def test_public_doctype_with_url(self):
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
self.assertDoctypeHandled(doctype)
def test_system_doctype(self):
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
def test_namespaced_system_doctype(self):
# We can handle a namespaced doctype with a system ID.
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
def test_namespaced_public_doctype(self):
# Test a namespaced doctype with a public id.
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
def test_real_xhtml_document(self):
"""A real XHTML document should come out more or less the same as it went in."""
markup = b"""
Hello.
Goodbye.
"""
with warnings.catch_warnings(record=True) as w:
soup = self.soup(markup)
assert soup.encode("utf-8").replace(b"\n", b"") == markup.replace(b"\n", b"")
# No warning was issued about parsing an XML document as HTML,
# because XHTML is both.
assert w == []
def test_namespaced_html(self):
    """Namespaced XML parsed as HTML is treated as HTML with odd tag names."""
    # NOTE(review): the markup literal held only the text 'content' in the
    # reviewed source, which cannot contain two "ns1:foo" tags. Restored
    # to match the assertion below -- confirm against upstream.
    markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
    with warnings.catch_warnings(record=True) as w:
        soup = self.soup(markup)

    assert 2 == len(soup.find_all("ns1:foo"))

    # n.b. no "you're parsing XML as HTML" warning was given
    # because there was no XML declaration.
    assert [] == w
def test_detect_xml_parsed_as_html(self):
    """Parsing XML as HTML warns once, but basic access still works."""
    # NOTE(review): the markup literal held only the text 'string' in the
    # reviewed source. Restored with an XML declaration (what the note
    # below says triggers the warning) and a <tag> element (what the
    # first assertion reads) -- confirm against upstream.
    markup = b"""<?xml version="1.0" encoding="utf-8"?><tag>string</tag>"""
    with warnings.catch_warnings(record=True) as w:
        soup = self.soup(markup)
    assert soup.tag.string == 'string'

    # Exactly one warning must have been recorded.
    [warning] = w
    assert isinstance(warning.message, XMLParsedAsHTMLWarning)
    assert str(warning.message) == XMLParsedAsHTMLWarning.MESSAGE

    # NOTE: the warning is not issued if the document appears to
    # be XHTML (tested with test_real_xhtml_document in the
    # superclass) or if there is no XML declaration (tested with
    # test_namespaced_html in the superclass).
def test_processing_instruction(self):
    """A processing instruction survives parsing, as text and as bytes."""
    # We test both Unicode and bytestring to verify that
    # process_markup correctly sets processing_instruction_class
    # even when the markup is already Unicode and there is no
    # need to process anything.
    #
    # NOTE(review): both markup literals were empty in the reviewed
    # source, which made the test vacuous. Restored to a processing
    # instruction -- confirm the exact text against upstream.
    markup = """<?PITarget PIContent?>"""
    soup = self.soup(markup)
    assert markup == soup.decode()

    markup = b"""<?PITarget PIContent?>"""
    soup = self.soup(markup)
    assert markup == soup.encode("utf8")
def test_deepcopy(self):
"""Make sure you can copy the tree builder.
This is important because the builder is part of a
BeautifulSoup object, and we want to be able to copy that.
"""
copy.deepcopy(self.default_builder)
def test_p_tag_is_never_empty_element(self):
    """A <p> tag is never designated as an empty-element tag.

    Even if the markup shows it as an empty-element tag, it
    shouldn't be presented that way.
    """
    # NOTE(review): the markup and the expected output were broken or
    # empty string literals in the reviewed source. Restored from the
    # docstring and the `soup.p` accesses -- confirm against upstream.
    soup = self.soup("<p/>")
    assert not soup.p.is_empty_element
    assert str(soup.p) == "<p></p>"
def test_unclosed_tags_get_closed(self):
"""A tag that's not closed by the end of the document should be closed.
This applies to all tags except empty-element tags.
"""
self.assert_soup("
", "
")
self.assert_soup("", "")
self.assert_soup(" ", " ")
def test_br_is_always_empty_element_tag(self):
    """A <br> tag is designated as an empty-element tag.

    Some parsers treat <br></br> as one <br/> tag, some parsers as
    two tags, but it should always be an empty-element tag.
    """
    # NOTE(review): the tag names were stripped from the docstring and
    # from both string literals in the reviewed source. Restored from
    # the `soup.br` accesses -- confirm against upstream.
    soup = self.soup("<br></br>")
    assert soup.br.is_empty_element
    assert str(soup.br) == "<br/>"
def test_nested_formatting_elements(self):
self.assert_soup("")
def test_double_head(self):
    """A <script> placed between </head> and <body> keeps its attributes."""
    # NOTE(review): only the text content of this document survived in the
    # reviewed source, so `soup.find('script')` could not match. The
    # element structure here is reconstructed around that text and the
    # assertion below -- confirm against upstream.
    html = '''<!DOCTYPE html>
<html>
<head>
<title>Ordinary HEAD element test</title>
</head>
<script type="text/javascript">
alert("Help!");
</script>
<body>
Hello, world!
</body>
</html>
'''
    soup = self.soup(html)
    assert "text/javascript" == soup.find('script')['type']
def test_comment(self):
    """Comments become Comment objects linked into the tree."""
    # Comments are represented as Comment objects.
    #
    # NOTE(review): the markup literal was broken across lines with its
    # tags and comment stripped in the reviewed source. Restored from the
    # lookups below ('foo', comment 'foobar', 'baz', in that order) --
    # confirm against upstream.
    markup = "<p>foo<!--foobar-->baz</p>"
    self.assert_soup(markup)

    soup = self.soup(markup)
    comment = soup.find(string="foobar")
    assert comment.__class__ == Comment

    # The comment is properly integrated into the tree.
    foo = soup.find(string="foo")
    assert comment == foo.next_element
    baz = soup.find(string="baz")
    assert comment == baz.previous_element
def test_preserved_whitespace_in_pre_and_textarea(self):
"""Whitespace must be preserved in