From 54304193e227a40f7cca03b9ddccd876a625bfbb Mon Sep 17 00:00:00 2001 From: evilhero Date: Tue, 1 May 2018 10:42:00 -0400 Subject: [PATCH] FIX: Fix for problems when using wwt (returning no search results would throw error, beautifulsoup could not be used if html5lib was up-to-date on the host system --- lib/bs4/__init__.py | 10 ++-- lib/bs4/builder/__init__.py | 9 ++- lib/bs4/builder/_html5lib.py | 98 +++++++++++++++++++++++++++----- lib/bs4/builder/_htmlparser.py | 57 +++++++++++++++++-- lib/bs4/dammit.py | 4 +- lib/bs4/element.py | 79 ++++++++++++++++++++----- lib/bs4/testing.py | 53 +++++++++++++++++ lib/bs4/tests/test_html5lib.py | 21 +++++++ lib/bs4/tests/test_htmlparser.py | 4 +- lib/bs4/tests/test_tree.py | 6 ++ mylar/wwt.py | 2 + 11 files changed, 302 insertions(+), 41 deletions(-) diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py index aa818ae4..7a80452f 100644 --- a/lib/bs4/__init__.py +++ b/lib/bs4/__init__.py @@ -21,8 +21,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # found in the LICENSE file. __author__ = "Leonard Richardson (leonardr@segfault.org)" -__version__ = "4.5.1" -__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson" +__version__ = "4.6.0" +__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" __license__ = "MIT" __all__ = ['BeautifulSoup'] @@ -82,7 +82,7 @@ class BeautifulSoup(Tag): ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' - NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n" def __init__(self, markup="", features=None, builder=None, parse_only=None, from_encoding=None, exclude_encodings=None, @@ -215,8 +215,8 @@ class BeautifulSoup(Tag): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. You should' - 'probably open this file and pass the filehandle into' - 'Beautiful Soup.' % markup) + ' probably open this file and pass the filehandle into' + ' Beautiful Soup.' % markup) self._check_markup_is_url(markup) for (self.markup, self.original_encoding, self.declared_html_encoding, diff --git a/lib/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py index 601979bf..fdb3362f 100644 --- a/lib/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder): """ preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags - empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', - 'spacer', 'link', 'frame', 'base']) + empty_element_tags = set([ + # These are from HTML5. + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + + # These are from HTML4, removed in HTML5. + 'spacer', 'frame' + ]) # The HTML standard defines these attributes as containing a # space-separated list of values, not a single value. That is, diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py index c46f8823..5f548935 100644 --- a/lib/bs4/builder/_html5lib.py +++ b/lib/bs4/builder/_html5lib.py @@ -6,6 +6,7 @@ __all__ = [ ] import warnings +import re from bs4.builder import ( PERMISSIVE, HTML, @@ -17,7 +18,10 @@ from bs4.element import ( whitespace_re, ) import html5lib -from html5lib.constants import namespaces +from html5lib.constants import ( + namespaces, + prefixes, + ) from bs4.element import ( Comment, Doctype, @@ -83,7 +87,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): def create_treebuilder(self, namespaceHTMLElements): self.underlying_builder = TreeBuilderForHtml5lib( - self.soup, namespaceHTMLElements) + namespaceHTMLElements, self.soup) return self.underlying_builder def test_fragment_to_document(self, fragment): @@ -93,8 +97,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder): class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): - def __init__(self, soup, namespaceHTMLElements): - self.soup = soup + def __init__(self, namespaceHTMLElements, soup=None): + if soup: + self.soup = soup + else: + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) def documentClass(self): @@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): return TextNode(Comment(data), self.soup) def fragmentClass(self): - self.soup = BeautifulSoup("") + from bs4 import BeautifulSoup + self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) @@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def getFragment(self): return treebuilder_base.TreeBuilder.getFragment(self).element + def testSerializer(self, element): + from bs4 import BeautifulSoup + rv = [] + doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') + + def serializeElement(element, indent=0): + if isinstance(element, BeautifulSoup): + pass + if isinstance(element, Doctype): + m = doctype_re.match(element) + if m: + name = m.group(1) + if m.lastindex > 1: + publicId = m.group(2) or "" + systemId = m.group(3) or m.group(4) or "" + rv.append("""|%s""" % + (' ' * indent, name, publicId, systemId)) + else: + rv.append("|%s" % (' ' * indent, name)) + else: + rv.append("|%s" % (' ' * indent,)) + elif isinstance(element, Comment): + rv.append("|%s" % (' ' * indent, element)) + elif isinstance(element, NavigableString): + rv.append("|%s\"%s\"" % (' ' * indent, element)) + else: + if element.namespace: + name = "%s %s" % (prefixes[element.namespace], + element.name) + else: + name = element.name + rv.append("|%s<%s>" % (' ' * indent, name)) + if element.attrs: + attributes = [] + for name, value in element.attrs.items(): + if isinstance(name, NamespacedAttribute): + name = "%s %s" % (prefixes[name.namespace], name.name) + if isinstance(value, list): + value = " ".join(value) + attributes.append((name, value)) + + for name, value in sorted(attributes): + rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) + indent += 2 + for child in element.children: + serializeElement(child, indent) + serializeElement(element, 0) + + return "\n".join(rv) + class AttrList(object): def __init__(self, element): self.element = element @@ -182,8 +241,10 @@ class Element(treebuilder_base.Node): child = node elif node.element.__class__ == NavigableString: string_child = child = node.element + node.parent = self else: child = node.element + node.parent = self if not isinstance(child, basestring) and child.parent is not None: node.element.extract() @@ -221,6 +282,8 @@ class Element(treebuilder_base.Node): most_recent_element=most_recent_element) def getAttributes(self): + if isinstance(self.element, Comment): + return {} return AttrList(self.element) def setAttributes(self, attributes): @@ -248,11 +311,11 @@ class Element(treebuilder_base.Node): attributes = property(getAttributes, setAttributes) def insertText(self, data, insertBefore=None): + text = TextNode(self.soup.new_string(data), self.soup) if insertBefore: - text = TextNode(self.soup.new_string(data), self.soup) - self.insertBefore(data, insertBefore) + self.insertBefore(text, insertBefore) else: - self.appendChild(data) + self.appendChild(text) def insertBefore(self, node, refNode): index = self.element.index(refNode.element) @@ -274,6 +337,7 @@ class Element(treebuilder_base.Node): # print "MOVE", self.element.contents # print "FROM", self.element # print "TO", new_parent.element + element = self.element new_parent_element = new_parent.element # Determine what this tag's next_element will be once all the children @@ -292,7 +356,6 @@ class Element(treebuilder_base.Node): new_parents_last_descendant_next_element = new_parent_element.next_element to_append = element.contents - append_after = new_parent_element.contents if len(to_append) > 0: # Set the first child's previous_element and previous_sibling # to elements within the new parent @@ -309,12 +372,19 @@ class Element(treebuilder_base.Node): if new_parents_last_child: new_parents_last_child.next_sibling = first_child - # Fix the last child's next_element and next_sibling - last_child = to_append[-1] - last_child.next_element = new_parents_last_descendant_next_element + # Find the very last element being moved. It is now the + # parent's last descendant. It has no .next_sibling and + # its .next_element is whatever the previous last + # descendant had. + last_childs_last_descendant = to_append[-1]._last_descendant(False, True) + + last_childs_last_descendant.next_element = new_parents_last_descendant_next_element if new_parents_last_descendant_next_element: - new_parents_last_descendant_next_element.previous_element = last_child - last_child.next_sibling = None + # TODO: This code has no test coverage and I'm not sure + # how to get html5lib to go through this path, but it's + # just the other side of the previous line. + new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant + last_childs_last_descendant.next_sibling = None for child in to_append: child.parent = new_parent_element diff --git a/lib/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py index 823ca15a..67890b3a 100644 --- a/lib/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -52,7 +52,31 @@ from bs4.builder import ( HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): - def handle_starttag(self, name, attrs): + + def __init__(self, *args, **kwargs): + HTMLParser.__init__(self, *args, **kwargs) + + # Keep a list of empty-element tags that were encountered + # without an explicit closing tag. If we encounter a closing tag + # of this type, we'll associate it with one of those entries. + # + # This isn't a stack because we don't care about the + # order. It's a list of closing tags we've already handled and + # will ignore, assuming they ever show up. + self.already_closed_empty_element = [] + + def handle_startendtag(self, name, attrs): + # This is only called when the markup looks like + # . + + # is_startend() tells handle_starttag not to close the tag + # just because its name matches a known empty-element tag. We + # know that this is an empty-element tag and we want to call + # handle_endtag ourselves. + tag = self.handle_starttag(name, attrs, handle_empty_element=False) + self.handle_endtag(name) + + def handle_starttag(self, name, attrs, handle_empty_element=True): # XXX namespace attr_dict = {} for key, value in attrs: @@ -62,10 +86,34 @@ class BeautifulSoupHTMLParser(HTMLParser): value = '' attr_dict[key] = value attrvalue = '""' - self.soup.handle_starttag(name, None, None, attr_dict) + #print "START", name + tag = self.soup.handle_starttag(name, None, None, attr_dict) + if tag and tag.is_empty_element and handle_empty_element: + # Unlike other parsers, html.parser doesn't send separate end tag + # events for empty-element tags. (It's handled in + # handle_startendtag, but only if the original markup looked like + # .) + # + # So we need to call handle_endtag() ourselves. Since we + # know the start event is identical to the end event, we + # don't want handle_endtag() to cross off any previous end + # events for tags of this name. + self.handle_endtag(name, check_already_closed=False) - def handle_endtag(self, name): - self.soup.handle_endtag(name) + # But we might encounter an explicit closing tag for this tag + # later on. If so, we want to ignore it. + self.already_closed_empty_element.append(name) + + def handle_endtag(self, name, check_already_closed=True): + #print "END", name + if check_already_closed and name in self.already_closed_empty_element: + # This is a redundant end tag for an empty-element tag. + # We've already called handle_endtag() for it, so just + # check it off the list. + # print "ALREADY CLOSED", name + self.already_closed_empty_element.remove(name) + else: + self.soup.handle_endtag(name) def handle_data(self, data): self.soup.handle_data(data) @@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e + parser.already_closed_empty_element = [] # Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some # 3.2.3 code. This ensures they don't treat markup like

as a diff --git a/lib/bs4/dammit.py b/lib/bs4/dammit.py index 2bf67f7f..7965565f 100644 --- a/lib/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -310,7 +310,7 @@ class EncodingDetector: else: xml_endpos = 1024 html_endpos = max(2048, int(len(markup) * 0.05)) - + declared_encoding = None declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) if not declared_encoding_match and is_html: @@ -736,7 +736,7 @@ class UnicodeDammit: 0xde : b'\xc3\x9e', # Þ 0xdf : b'\xc3\x9f', # ß 0xe0 : b'\xc3\xa0', # à - 0xe1 : b'\xa1', # á + 0xe1 : b'\xa1', # á 0xe2 : b'\xc3\xa2', # â 0xe3 : b'\xc3\xa3', # ã 0xe4 : b'\xc3\xa4', # ä diff --git a/lib/bs4/element.py b/lib/bs4/element.py index b100d18b..9ef75f81 100644 --- a/lib/bs4/element.py +++ b/lib/bs4/element.py @@ -131,8 +131,8 @@ class PageElement(object): # to methods like encode() and prettify(): # # "html" - All Unicode characters with corresponding HTML entities - # are converted to those entities on output. - # "minimal" - Bare ampersands and angle brackets are converted to + # are converted to those entities on output. + # "minimal" - Bare ampersands and angle brackets are converted to # XML entities: & < > # None - The null formatter. Unicode characters are never # converted to entities. This is not recommended, but it's @@ -535,9 +535,16 @@ class PageElement(object): return ResultSet(strainer, result) elif isinstance(name, basestring): # Optimization to find all tags with a given name. + if name.count(':') == 1: + # This is a name with a prefix. + prefix, name = name.split(':', 1) + else: + prefix = None result = (element for element in generator if isinstance(element, Tag) - and element.name == name) + and element.name == name + and (prefix is None or element.prefix == prefix) + ) return ResultSet(strainer, result) results = ResultSet(strainer) while True: @@ -863,7 +870,7 @@ class Tag(PageElement): Its contents are a copy of the old Tag's contents. """ clone = type(self)(None, self.builder, self.name, self.namespace, - self.nsprefix, self.attrs, is_xml=self._is_xml) + self.prefix, self.attrs, is_xml=self._is_xml) for attr in ('can_be_empty_element', 'hidden'): setattr(clone, attr, getattr(self, attr)) for child in self.contents: @@ -985,6 +992,13 @@ class Tag(PageElement): attribute.""" return self.attrs.get(key, default) + def get_attribute_list(self, key, default=None): + """The same as get(), but always returns a list.""" + value = self.get(key, default) + if not isinstance(value, list): + value = [value] + return value + def has_attr(self, key): return key in self.attrs @@ -1698,7 +1712,7 @@ class SoupStrainer(object): "I don't know how to match against a %s" % markup.__class__) return found - def _matches(self, markup, match_against): + def _matches(self, markup, match_against, already_tried=None): # print u"Matching %s against %s" % (markup, match_against) result = False if isinstance(markup, list) or isinstance(markup, tuple): @@ -1713,7 +1727,7 @@ class SoupStrainer(object): if self._matches(' '.join(markup), match_against): return True return False - + if match_against is True: # True matches any non-None value. return markup is not None @@ -1723,6 +1737,7 @@ class SoupStrainer(object): # Custom callables take the tag as an argument, but all # other ways of matching match the tag name as a string. + original_markup = markup if isinstance(markup, Tag): markup = markup.name @@ -1733,18 +1748,51 @@ class SoupStrainer(object): # None matches None, False, an empty string, an empty list, and so on. return not match_against - if isinstance(match_against, unicode): + if (hasattr(match_against, '__iter__') + and not isinstance(match_against, basestring)): + # We're asked to match against an iterable of items. + # The markup must be match at least one item in the + # iterable. We'll try each one in turn. + # + # To avoid infinite recursion we need to keep track of + # items we've already seen. + if not already_tried: + already_tried = set() + for item in match_against: + if item.__hash__: + key = item + else: + key = id(item) + if key in already_tried: + continue + else: + already_tried.add(key) + if self._matches(original_markup, item, already_tried): + return True + else: + return False + + # Beyond this point we might need to run the test twice: once against + # the tag's name and once against its prefixed name. + match = False + + if not match and isinstance(match_against, unicode): # Exact string match - return markup == match_against + match = markup == match_against - if hasattr(match_against, 'match'): + if not match and hasattr(match_against, 'search'): # Regexp match return match_against.search(markup) - if hasattr(match_against, '__iter__'): - # The markup must be an exact match against something - # in the iterable. - return markup in match_against + if (not match + and isinstance(original_markup, Tag) + and original_markup.prefix): + # Try the whole thing again with the prefixed tag name. + return self._matches( + original_markup.prefix + ':' + original_markup.name, match_against + ) + + return match class ResultSet(list): @@ -1753,3 +1801,8 @@ class ResultSet(list): def __init__(self, source, result=()): super(ResultSet, self).__init__(result) self.source = source + + def __getattr__(self, key): + raise AttributeError( + "ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key + ) diff --git a/lib/bs4/testing.py b/lib/bs4/testing.py index 3a6ed425..6ba2506c 100644 --- a/lib/bs4/testing.py +++ b/lib/bs4/testing.py @@ -69,6 +69,18 @@ class HTMLTreeBuilderSmokeTest(object): markup in these tests, there's not much room for interpretation. """ + def test_empty_element_tags(self): + """Verify that all HTML4 and HTML5 empty element (aka void element) tags + are handled correctly. + """ + for name in [ + 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr', + 'spacer', 'frame' + ]: + soup = self.soup("") + new_tag = soup.new_tag(name) + self.assertEqual(True, new_tag.is_empty_element) + def test_pickle_and_unpickle_identity(self): # Pickling a tree, then unpickling it, yields a tree identical # to the original. @@ -330,6 +342,13 @@ Hello, world! self.assertEqual("p", soup.p.name) self.assertConnectedness(soup) + def test_empty_element_tags(self): + """Verify consistent handling of empty-element tags, + no matter how they come in through the markup. + """ + self.assertSoupEquals('


', "


") + self.assertSoupEquals('


', "


") + def test_head_tag_between_head_and_body(self): "Prevent recurrence of a bug in the html5lib treebuilder." content = """ @@ -669,6 +688,40 @@ class XMLTreeBuilderSmokeTest(object): soup = self.soup(markup) self.assertEqual(unicode(soup.foo), markup) + def test_find_by_prefixed_name(self): + doc = """ +foo + bar + baz + +""" + soup = self.soup(doc) + + # There are three tags. + self.assertEqual(3, len(soup.find_all('tag'))) + + # But two of them are ns1:tag and one of them is ns2:tag. + self.assertEqual(2, len(soup.find_all('ns1:tag'))) + self.assertEqual(1, len(soup.find_all('ns2:tag'))) + + self.assertEqual(1, len(soup.find_all('ns2:tag', key='value'))) + self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag']))) + + def test_copy_tag_preserves_namespace(self): + xml = """ +""" + + soup = self.soup(xml) + tag = soup.document + duplicate = copy.copy(tag) + + # The two tags have the same namespace prefix. + self.assertEqual(tag.prefix, duplicate.prefix) + + class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest): """Smoke test for a tree builder that supports HTML5.""" diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py index 8e3cba68..0f89d624 100644 --- a/lib/bs4/tests/test_html5lib.py +++ b/lib/bs4/tests/test_html5lib.py @@ -95,6 +95,22 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest): assert space1.next_element is tbody1 assert tbody2.next_element is space2 + def test_reparented_markup_containing_children(self): + markup = '' + soup = self.soup(markup) + noscript = soup.noscript + self.assertEqual("target", noscript.next_element) + target = soup.find(string='target') + + # The 'aftermath' string was duplicated; we want the second one. + final_aftermath = soup.find_all(string='aftermath')[-1] + + # The