"""Tests to ensure that the html5lib tree builder generates good trees."""
import warnings
# Probe for the html5lib-backed builder. HTML5LIB_PRESENT records whether the
# import succeeded so the test class below can be skipped when it did not.
try:
    from bs4.builder import HTML5TreeBuilder
except ImportError:
    # ``except ImportError, e`` was Python-2-only syntax and ``e`` was unused.
    HTML5LIB_PRESENT = False
else:
    HTML5LIB_PRESENT = True
from bs4.element import SoupStrainer
from bs4.testing import (
HTML5TreeBuilderSmokeTest,
SoupTest,
skipIf,
)
@skipIf(
    not HTML5LIB_PRESENT,
    "html5lib seems not to be present, not testing its tree builder.")
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
    """See ``HTML5TreeBuilderSmokeTest``."""

    # NOTE(review): the HTML fixtures in this class were restored after the
    # markup had been stripped out of the string literals (which left
    # unterminated strings and empty documents). Confirm each literal against
    # version control before merging.

    @property
    def default_builder(self):
        """Return a new ``HTML5TreeBuilder`` instance."""
        return HTML5TreeBuilder()

    def test_soupstrainer(self):
        """Passing ``parse_only`` yields the full document plus a warning."""
        # The html5lib tree builder does not support SoupStrainers.
        strainer = SoupStrainer("b")
        markup = "<p>A <b>bold</b> statement.</p>"
        with warnings.catch_warnings(record=True) as w:
            # Make sure the warning is recorded even if an identical one was
            # already issued earlier in this process.
            warnings.simplefilter("always")
            soup = self.soup(markup, parse_only=strainer)
        self.assertEqual(
            soup.decode(), self.document_for(markup))

        # Search every recorded warning instead of indexing w[0]: an empty
        # list then fails the assertion rather than raising IndexError, and
        # unrelated warnings recorded first do not hide the expected one.
        self.assertTrue(
            any("the html5lib tree builder doesn't support parse_only" in
                str(warning.message) for warning in w))

    def test_correctly_nested_tables(self):
        """html5lib inserts <tbody> tags where other parsers don't."""
        markup = ('<table id="1">'
                  '<tr>'
                  "<td>Here's another table:"
                  '<table id="2">'
                  '<tr><td>foo</td></tr>'
                  '</table></td>')

        self.assertSoupEquals(
            markup,
            '<table id="1"><tbody><tr><td>Here\'s another table:'
            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
            '</td></tr></tbody></table>')

        # A table that already has explicit sections is left unchanged.
        self.assertSoupEquals(
            "<table><thead><tr><td>Foo</td></tr></thead>"
            "<tbody><tr><td>Bar</td></tr></tbody>"
            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")

    def test_xml_declaration_followed_by_doctype(self):
        """An XML declaration before the doctype leaves the tree connected."""
        markup = '''<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html>
  <head>
  </head>
  <body>
   <p>foo</p>
  </body>
</html>'''
        soup = self.soup(markup)
        # Verify that we can reach the <p> tag; this means the tree is connected.
        self.assertEqual(b"<p>foo</p>", soup.p.encode())

    def test_reparented_markup(self):
        """A misnested <em> is reparented and both <p> tags stay findable."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
        soup = self.soup(markup)
        self.assertEqual(
            u"<body><p><em>foo</em></p><em>\n</em>"
            u"<p><em>bar<a></a></em></p></body>",
            soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_ends_with_whitespace(self):
        """Same as ``test_reparented_markup`` but with a trailing newline."""
        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
        soup = self.soup(markup)
        self.assertEqual(
            u"<body><p><em>foo</em></p><em>\n</em>"
            u"<p><em>bar<a></a></em></p>\n</body>",
            soup.body.decode())
        self.assertEqual(2, len(soup.find_all('p')))

    def test_reparented_markup_containing_identical_whitespace_nodes(self):
        """Verify that we keep the two whitespace nodes in this
        document distinct when reparenting the adjacent <tbody> tags.
        """
        markup = '<table> <tbody><tbody><ims></tbody> </table>'
        soup = self.soup(markup)
        space1, space2 = soup.find_all(string=' ')
        tbody1, tbody2 = soup.find_all('tbody')
        assert space1.next_element is tbody1
        assert tbody2.next_element is space2

    def test_processing_instruction(self):
        """Processing instructions become comments."""
        markup = b"""<?PITarget PIContent?>"""
        soup = self.soup(markup)
        assert str(soup).startswith("<!--?PITarget PIContent?-->")

    def test_cloned_multivalue_node(self):
        """The two <a> tags found in this markup are equal but distinct objects."""
        markup = b"""<a class="my_class"><p></a>"""
        soup = self.soup(markup)
        a1, a2 = soup.find_all('a')
        self.assertEqual(a1, a2)
        assert a1 is not a2