"""Tests to ensure that the html.parser tree builder generates good
trees."""

from pdb import set_trace
import pickle
import warnings
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
from bs4.builder import HTMLParserTreeBuilder
from bs4.builder._htmlparser import BeautifulSoupHTMLParser


# NOTE(review): the HTML markup and named-entity literals in this module
# were restored so that they agree with the assertions made about them;
# confirm the exact spellings against the upstream Beautiful Soup tests.
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
    """Run the shared HTML smoke tests with HTMLParserTreeBuilder.

    The inherited tests come from HTMLTreeBuilderSmokeTest; the methods
    defined here either override inherited tests that html.parser cannot
    pass or add checks specific to the html.parser-based builder.
    """

    default_builder = HTMLParserTreeBuilder

    def test_namespaced_system_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_namespaced_public_doctype(self):
        # html.parser can't handle namespaced doctypes, so skip this one.
        pass

    def test_builder_is_pickled(self):
        """Unlike most tree builders, HTMLParserTreeBuilder can be
        pickled, and will be restored after unpickling.
        """
        tree = self.soup("<a><b>foo</a>")
        # Protocol 2 is the protocol the original test pins.
        dumped = pickle.dumps(tree, 2)
        loaded = pickle.loads(dumped)
        self.assertIsInstance(loaded.builder, type(tree.builder))

    def test_redundant_empty_element_closing_tags(self):
        """Closing tags for void elements must not create extra tags."""
        self.assertSoupEquals(
            '<br></br><br></br><br></br>', "<br/><br/><br/>"
        )
        self.assertSoupEquals('</br></br></br>', "")

    def test_empty_element(self):
        # This verifies that any buffered data present when the parser
        # finishes working is handled.
        self.assertSoupEquals("foo &# bar", "foo &amp;# bar")

    def test_tracking_line_numbers(self):
        # The html.parser TreeBuilder keeps track of line number and
        # position of each element.
        markup = (
            "\n   <p>\n\n<sourceline>\n<b>text</b></sourceline>"
            "<sourcepos></p>"
        )
        soup = self.soup(markup)
        self.assertEqual(2, soup.p.sourceline)
        self.assertEqual(3, soup.p.sourcepos)
        # While line tracking is on, .sourceline is a number, so a child
        # tag that happens to be named <sourceline> needs find().
        self.assertEqual("sourceline", soup.p.find('sourceline').name)

        # You can deactivate this behavior. Attribute access then falls
        # through to the child tags of the same names.
        soup = self.soup(markup, store_line_numbers=False)
        self.assertEqual("sourceline", soup.p.sourceline.name)
        self.assertEqual("sourcepos", soup.p.sourcepos.name)

    def test_on_duplicate_attribute(self):
        # The html.parser tree builder has a variety of ways of
        # handling a tag that contains the same attribute multiple times.
        markup = (
            '<a class="cls" href="url1" href="url2" href="url3" id="id">'
        )

        # If you don't provide any particular value for
        # on_duplicate_attribute, later values replace earlier values.
        soup = self.soup(markup)
        self.assertEqual("url3", soup.a['href'])
        self.assertEqual(["cls"], soup.a['class'])
        self.assertEqual("id", soup.a['id'])

        def assert_attribute(on_duplicate_attribute, expected):
            """Parse `markup` with the given policy and check `href`."""
            soup = self.soup(
                markup, on_duplicate_attribute=on_duplicate_attribute
            )
            self.assertEqual(expected, soup.a['href'])

            # Verify that non-duplicate attributes are treated normally.
            self.assertEqual(["cls"], soup.a['class'])
            self.assertEqual("id", soup.a['id'])

        # You can also get the default behavior explicitly.
        assert_attribute(None, "url3")
        assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")

        # You can ignore subsequent values in favor of the first.
        assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")

        # And you can pass in a callable that does whatever you want.
        def accumulate(attrs, key, value):
            """Collect every value seen for a duplicated key in a list."""
            if not isinstance(attrs[key], list):
                attrs[key] = [attrs[key]]
            attrs[key].append(value)

        assert_attribute(accumulate, ["url1", "url2", "url3"])

    def test_html5_attributes(self):
        # The html.parser TreeBuilder can convert any entity named in
        # the HTML5 spec to a sequence of Unicode characters, and
        # convert those Unicode characters to a (potentially
        # different) named entity on the way out.
        #
        # Each row is (entity in the input markup, expected Unicode
        # text, expected bytes when encoded with formatter="html").
        for input_element, output_unicode, output_element in (
            ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
            ('&models;', '\u22a7', b'&models;'),
            ('&Nfr;', '\U0001d511', b'&Nfr;'),
            ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
            ('&not;', '\xac', b'&not;'),
            ('&Not;', '\u2aec', b'&Not;'),
            ('&quot;', '"', b'"'),
            ('&there4;', '\u2234', b'&there4;'),
            ('&Therefore;', '\u2234', b'&there4;'),
            ('&therefore;', '\u2234', b'&there4;'),
            ("&fjlig;", 'fj', b'fj'),
            ("&sqcup;", '\u2294', b'&sqcup;'),
            ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
            ("&apos;", "'", b"'"),
            ("&verbar;", "|", b"|"),
        ):
            markup = '<div>%s</div>' % input_element
            div = self.soup(markup).div

            # Default encoding: the entity comes out as UTF-8 text.
            without_element = div.encode()
            expect = b"<div>%s</div>" % output_unicode.encode("utf8")
            self.assertEqual(without_element, expect)

            # formatter="html": the text is turned back into an entity.
            with_element = div.encode(formatter="html")
            expect = b"<div>%s</div>" % output_element
            self.assertEqual(with_element, expect)


class TestHTMLParserSubclass(SoupTest):
    """Tests aimed directly at the BeautifulSoupHTMLParser subclass."""

    def test_error(self):
        """Verify that our HTMLParser subclass implements error() in a way
        that doesn't cause a crash.
        """
        parser = BeautifulSoupHTMLParser()
        with warnings.catch_warnings(record=True) as warns:
            # Make sure the warning is recorded even if an identical one
            # was already issued, or filtered, earlier in the test run.
            warnings.simplefilter("always")
            parser.error("don't crash")
        [warning] = warns
        self.assertEqual("don't crash", str(warning.message))