diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py
index aa818ae4..7a80452f 100644
--- a/lib/bs4/__init__.py
+++ b/lib/bs4/__init__.py
@@ -21,8 +21,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# found in the LICENSE file.
__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.5.1"
-__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
+__version__ = "4.6.0"
+__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__license__ = "MIT"
__all__ = ['BeautifulSoup']
@@ -82,7 +82,7 @@ class BeautifulSoup(Tag):
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
- NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
+ NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
@@ -215,8 +215,8 @@ class BeautifulSoup(Tag):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
- 'probably open this file and pass the filehandle into'
- 'Beautiful Soup.' % markup)
+ ' probably open this file and pass the filehandle into'
+ ' Beautiful Soup.' % markup)
self._check_markup_is_url(markup)
for (self.markup, self.original_encoding, self.declared_html_encoding,
diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py
index 601979bf..fdb3362f 100644
--- a/lib/bs4/builder/__init__.py
+++ b/lib/bs4/builder/__init__.py
@@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
"""
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
- empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
- 'spacer', 'link', 'frame', 'base'])
+ empty_element_tags = set([
+ # These are from HTML5.
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
+
+ # These are from HTML4, removed in HTML5.
+ 'spacer', 'frame'
+ ])
# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py
index c46f8823..5f548935 100644
--- a/lib/bs4/builder/_html5lib.py
+++ b/lib/bs4/builder/_html5lib.py
@@ -6,6 +6,7 @@ __all__ = [
]
import warnings
+import re
from bs4.builder import (
PERMISSIVE,
HTML,
@@ -17,7 +18,10 @@ from bs4.element import (
whitespace_re,
)
import html5lib
-from html5lib.constants import namespaces
+from html5lib.constants import (
+ namespaces,
+ prefixes,
+ )
from bs4.element import (
Comment,
Doctype,
@@ -83,7 +87,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
def create_treebuilder(self, namespaceHTMLElements):
self.underlying_builder = TreeBuilderForHtml5lib(
- self.soup, namespaceHTMLElements)
+ namespaceHTMLElements, self.soup)
return self.underlying_builder
def test_fragment_to_document(self, fragment):
@@ -93,8 +97,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
- def __init__(self, soup, namespaceHTMLElements):
- self.soup = soup
+ def __init__(self, namespaceHTMLElements, soup=None):
+ if soup:
+ self.soup = soup
+ else:
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
def documentClass(self):
@@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
return TextNode(Comment(data), self.soup)
def fragmentClass(self):
- self.soup = BeautifulSoup("")
+ from bs4 import BeautifulSoup
+ self.soup = BeautifulSoup("", "html.parser")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup, None)
@@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
def getFragment(self):
return treebuilder_base.TreeBuilder.getFragment(self).element
+ def testSerializer(self, element):
+ from bs4 import BeautifulSoup
+ rv = []
+ doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
+
+ def serializeElement(element, indent=0):
+ if isinstance(element, BeautifulSoup):
+ pass
+ if isinstance(element, Doctype):
+ m = doctype_re.match(element)
+ if m:
+ name = m.group(1)
+ if m.lastindex > 1:
+ publicId = m.group(2) or ""
+ systemId = m.group(3) or m.group(4) or ""
+ rv.append("""|%s""" %
+ (' ' * indent, name, publicId, systemId))
+ else:
+ rv.append("|%s" % (' ' * indent, name))
+ else:
+ rv.append("|%s" % (' ' * indent,))
+ elif isinstance(element, Comment):
+ rv.append("|%s" % (' ' * indent, element))
+ elif isinstance(element, NavigableString):
+ rv.append("|%s\"%s\"" % (' ' * indent, element))
+ else:
+ if element.namespace:
+ name = "%s %s" % (prefixes[element.namespace],
+ element.name)
+ else:
+ name = element.name
+ rv.append("|%s<%s>" % (' ' * indent, name))
+ if element.attrs:
+ attributes = []
+ for name, value in element.attrs.items():
+ if isinstance(name, NamespacedAttribute):
+ name = "%s %s" % (prefixes[name.namespace], name.name)
+ if isinstance(value, list):
+ value = " ".join(value)
+ attributes.append((name, value))
+
+ for name, value in sorted(attributes):
+ rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
+ indent += 2
+ for child in element.children:
+ serializeElement(child, indent)
+ serializeElement(element, 0)
+
+ return "\n".join(rv)
+
class AttrList(object):
def __init__(self, element):
self.element = element
@@ -182,8 +241,10 @@ class Element(treebuilder_base.Node):
child = node
elif node.element.__class__ == NavigableString:
string_child = child = node.element
+ node.parent = self
else:
child = node.element
+ node.parent = self
if not isinstance(child, basestring) and child.parent is not None:
node.element.extract()
@@ -221,6 +282,8 @@ class Element(treebuilder_base.Node):
most_recent_element=most_recent_element)
def getAttributes(self):
+ if isinstance(self.element, Comment):
+ return {}
return AttrList(self.element)
def setAttributes(self, attributes):
@@ -248,11 +311,11 @@ class Element(treebuilder_base.Node):
attributes = property(getAttributes, setAttributes)
def insertText(self, data, insertBefore=None):
+ text = TextNode(self.soup.new_string(data), self.soup)
if insertBefore:
- text = TextNode(self.soup.new_string(data), self.soup)
- self.insertBefore(data, insertBefore)
+ self.insertBefore(text, insertBefore)
else:
- self.appendChild(data)
+ self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.index(refNode.element)
@@ -274,6 +337,7 @@ class Element(treebuilder_base.Node):
# print "MOVE", self.element.contents
# print "FROM", self.element
# print "TO", new_parent.element
+
element = self.element
new_parent_element = new_parent.element
# Determine what this tag's next_element will be once all the children
@@ -292,7 +356,6 @@ class Element(treebuilder_base.Node):
new_parents_last_descendant_next_element = new_parent_element.next_element
to_append = element.contents
- append_after = new_parent_element.contents
if len(to_append) > 0:
# Set the first child's previous_element and previous_sibling
# to elements within the new parent
@@ -309,12 +372,19 @@ class Element(treebuilder_base.Node):
if new_parents_last_child:
new_parents_last_child.next_sibling = first_child
- # Fix the last child's next_element and next_sibling
- last_child = to_append[-1]
- last_child.next_element = new_parents_last_descendant_next_element
+ # Find the very last element being moved. It is now the
+ # parent's last descendant. It has no .next_sibling and
+ # its .next_element is whatever the previous last
+ # descendant had.
+ last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
+
+ last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
if new_parents_last_descendant_next_element:
- new_parents_last_descendant_next_element.previous_element = last_child
- last_child.next_sibling = None
+ # TODO: This code has no test coverage and I'm not sure
+ # how to get html5lib to go through this path, but it's
+ # just the other side of the previous line.
+ new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
+ last_childs_last_descendant.next_sibling = None
for child in to_append:
child.parent = new_parent_element
diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py
index 823ca15a..67890b3a 100644
--- a/lib/bs4/builder/_htmlparser.py
+++ b/lib/bs4/builder/_htmlparser.py
@@ -52,7 +52,31 @@ from bs4.builder import (
HTMLPARSER = 'html.parser'
class BeautifulSoupHTMLParser(HTMLParser):
- def handle_starttag(self, name, attrs):
+
+ def __init__(self, *args, **kwargs):
+ HTMLParser.__init__(self, *args, **kwargs)
+
+ # Keep a list of empty-element tags that were encountered
+ # without an explicit closing tag. If we encounter a closing tag
+ # of this type, we'll associate it with one of those entries.
+ #
+ # This isn't a stack because we don't care about the
+ # order. It's a list of closing tags we've already handled and
+ # will ignore, assuming they ever show up.
+ self.already_closed_empty_element = []
+
+ def handle_startendtag(self, name, attrs):
+ # This is only called when the markup looks like
+ #