mirror of https://github.com/evilhero/mylar
FIX: Fix for problems when using wwt (returning no search results would throw an error; beautifulsoup could not be used if html5lib was up-to-date on the host system)
This commit is contained in:
parent
d6cecceaba
commit
54304193e2
|
@ -21,8 +21,8 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||||
# found in the LICENSE file.
|
# found in the LICENSE file.
|
||||||
|
|
||||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||||
__version__ = "4.5.1"
|
__version__ = "4.6.0"
|
||||||
__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson"
|
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
|
||||||
__license__ = "MIT"
|
__license__ = "MIT"
|
||||||
|
|
||||||
__all__ = ['BeautifulSoup']
|
__all__ = ['BeautifulSoup']
|
||||||
|
@ -82,7 +82,7 @@ class BeautifulSoup(Tag):
|
||||||
|
|
||||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||||
|
|
||||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n"
|
# Message template for the "no parser specified" warning. It is filled in with %-style mapping keys: markup_type, parser, line_number and filename.
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP)\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
|
||||||
|
|
||||||
def __init__(self, markup="", features=None, builder=None,
|
def __init__(self, markup="", features=None, builder=None,
|
||||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||||
|
|
|
@ -232,8 +232,13 @@ class HTMLTreeBuilder(TreeBuilder):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||||
empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta',
|
empty_element_tags = set([
|
||||||
'spacer', 'link', 'frame', 'base'])
|
# These are from HTML5.
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
|
||||||
|
# These are from HTML4, removed in HTML5.
|
||||||
|
'spacer', 'frame'
|
||||||
|
])
|
||||||
|
|
||||||
# The HTML standard defines these attributes as containing a
|
# The HTML standard defines these attributes as containing a
|
||||||
# space-separated list of values, not a single value. That is,
|
# space-separated list of values, not a single value. That is,
|
||||||
|
|
|
@ -6,6 +6,7 @@ __all__ = [
|
||||||
]
|
]
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
|
import re
|
||||||
from bs4.builder import (
|
from bs4.builder import (
|
||||||
PERMISSIVE,
|
PERMISSIVE,
|
||||||
HTML,
|
HTML,
|
||||||
|
@ -17,7 +18,10 @@ from bs4.element import (
|
||||||
whitespace_re,
|
whitespace_re,
|
||||||
)
|
)
|
||||||
import html5lib
|
import html5lib
|
||||||
from html5lib.constants import namespaces
|
from html5lib.constants import (
|
||||||
|
namespaces,
|
||||||
|
prefixes,
|
||||||
|
)
|
||||||
from bs4.element import (
|
from bs4.element import (
|
||||||
Comment,
|
Comment,
|
||||||
Doctype,
|
Doctype,
|
||||||
|
@ -83,7 +87,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
def create_treebuilder(self, namespaceHTMLElements):
|
def create_treebuilder(self, namespaceHTMLElements):
|
||||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||||
self.soup, namespaceHTMLElements)
|
namespaceHTMLElements, self.soup)
|
||||||
return self.underlying_builder
|
return self.underlying_builder
|
||||||
|
|
||||||
def test_fragment_to_document(self, fragment):
|
def test_fragment_to_document(self, fragment):
|
||||||
|
@ -93,8 +97,12 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||||
|
|
||||||
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
|
|
||||||
def __init__(self, soup, namespaceHTMLElements):
|
def __init__(self, namespaceHTMLElements, soup=None):
|
||||||
|
if soup:
|
||||||
self.soup = soup
|
self.soup = soup
|
||||||
|
else:
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||||
|
|
||||||
def documentClass(self):
|
def documentClass(self):
|
||||||
|
@ -117,7 +125,8 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
return TextNode(Comment(data), self.soup)
|
return TextNode(Comment(data), self.soup)
|
||||||
|
|
||||||
def fragmentClass(self):
|
def fragmentClass(self):
|
||||||
self.soup = BeautifulSoup("")
|
from bs4 import BeautifulSoup
|
||||||
|
self.soup = BeautifulSoup("", "html.parser")
|
||||||
self.soup.name = "[document_fragment]"
|
self.soup.name = "[document_fragment]"
|
||||||
return Element(self.soup, self.soup, None)
|
return Element(self.soup, self.soup, None)
|
||||||
|
|
||||||
|
@ -131,6 +140,56 @@ class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||||
def getFragment(self):
|
def getFragment(self):
|
||||||
return treebuilder_base.TreeBuilder.getFragment(self).element
|
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
rv = []
|
||||||
|
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||||
|
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if isinstance(element, BeautifulSoup):
|
||||||
|
pass
|
||||||
|
if isinstance(element, Doctype):
|
||||||
|
m = doctype_re.match(element)
|
||||||
|
if m:
|
||||||
|
name = m.group(1)
|
||||||
|
if m.lastindex > 1:
|
||||||
|
publicId = m.group(2) or ""
|
||||||
|
systemId = m.group(3) or m.group(4) or ""
|
||||||
|
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||||
|
(' ' * indent, name, publicId, systemId))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||||
|
elif isinstance(element, Comment):
|
||||||
|
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||||
|
elif isinstance(element, NavigableString):
|
||||||
|
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||||
|
else:
|
||||||
|
if element.namespace:
|
||||||
|
name = "%s %s" % (prefixes[element.namespace],
|
||||||
|
element.name)
|
||||||
|
else:
|
||||||
|
name = element.name
|
||||||
|
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||||
|
if element.attrs:
|
||||||
|
attributes = []
|
||||||
|
for name, value in element.attrs.items():
|
||||||
|
if isinstance(name, NamespacedAttribute):
|
||||||
|
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||||
|
if isinstance(value, list):
|
||||||
|
value = " ".join(value)
|
||||||
|
attributes.append((name, value))
|
||||||
|
|
||||||
|
for name, value in sorted(attributes):
|
||||||
|
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.children:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
class AttrList(object):
|
class AttrList(object):
|
||||||
def __init__(self, element):
|
def __init__(self, element):
|
||||||
self.element = element
|
self.element = element
|
||||||
|
@ -182,8 +241,10 @@ class Element(treebuilder_base.Node):
|
||||||
child = node
|
child = node
|
||||||
elif node.element.__class__ == NavigableString:
|
elif node.element.__class__ == NavigableString:
|
||||||
string_child = child = node.element
|
string_child = child = node.element
|
||||||
|
node.parent = self
|
||||||
else:
|
else:
|
||||||
child = node.element
|
child = node.element
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
if not isinstance(child, basestring) and child.parent is not None:
|
if not isinstance(child, basestring) and child.parent is not None:
|
||||||
node.element.extract()
|
node.element.extract()
|
||||||
|
@ -221,6 +282,8 @@ class Element(treebuilder_base.Node):
|
||||||
most_recent_element=most_recent_element)
|
most_recent_element=most_recent_element)
|
||||||
|
|
||||||
def getAttributes(self):
|
def getAttributes(self):
|
||||||
|
if isinstance(self.element, Comment):
|
||||||
|
return {}
|
||||||
return AttrList(self.element)
|
return AttrList(self.element)
|
||||||
|
|
||||||
def setAttributes(self, attributes):
|
def setAttributes(self, attributes):
|
||||||
|
@ -248,11 +311,11 @@ class Element(treebuilder_base.Node):
|
||||||
attributes = property(getAttributes, setAttributes)
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
def insertText(self, data, insertBefore=None):
|
def insertText(self, data, insertBefore=None):
|
||||||
if insertBefore:
|
|
||||||
text = TextNode(self.soup.new_string(data), self.soup)
|
text = TextNode(self.soup.new_string(data), self.soup)
|
||||||
self.insertBefore(data, insertBefore)
|
if insertBefore:
|
||||||
|
self.insertBefore(text, insertBefore)
|
||||||
else:
|
else:
|
||||||
self.appendChild(data)
|
self.appendChild(text)
|
||||||
|
|
||||||
def insertBefore(self, node, refNode):
|
def insertBefore(self, node, refNode):
|
||||||
index = self.element.index(refNode.element)
|
index = self.element.index(refNode.element)
|
||||||
|
@ -274,6 +337,7 @@ class Element(treebuilder_base.Node):
|
||||||
# print "MOVE", self.element.contents
|
# print "MOVE", self.element.contents
|
||||||
# print "FROM", self.element
|
# print "FROM", self.element
|
||||||
# print "TO", new_parent.element
|
# print "TO", new_parent.element
|
||||||
|
|
||||||
element = self.element
|
element = self.element
|
||||||
new_parent_element = new_parent.element
|
new_parent_element = new_parent.element
|
||||||
# Determine what this tag's next_element will be once all the children
|
# Determine what this tag's next_element will be once all the children
|
||||||
|
@ -292,7 +356,6 @@ class Element(treebuilder_base.Node):
|
||||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||||
|
|
||||||
to_append = element.contents
|
to_append = element.contents
|
||||||
append_after = new_parent_element.contents
|
|
||||||
if len(to_append) > 0:
|
if len(to_append) > 0:
|
||||||
# Set the first child's previous_element and previous_sibling
|
# Set the first child's previous_element and previous_sibling
|
||||||
# to elements within the new parent
|
# to elements within the new parent
|
||||||
|
@ -309,12 +372,19 @@ class Element(treebuilder_base.Node):
|
||||||
if new_parents_last_child:
|
if new_parents_last_child:
|
||||||
new_parents_last_child.next_sibling = first_child
|
new_parents_last_child.next_sibling = first_child
|
||||||
|
|
||||||
# Fix the last child's next_element and next_sibling
|
# Find the very last element being moved. It is now the
|
||||||
last_child = to_append[-1]
|
# parent's last descendant. It has no .next_sibling and
|
||||||
last_child.next_element = new_parents_last_descendant_next_element
|
# its .next_element is whatever the previous last
|
||||||
|
# descendant had.
|
||||||
|
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||||
|
|
||||||
|
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||||
if new_parents_last_descendant_next_element:
|
if new_parents_last_descendant_next_element:
|
||||||
new_parents_last_descendant_next_element.previous_element = last_child
|
# TODO: This code has no test coverage and I'm not sure
|
||||||
last_child.next_sibling = None
|
# how to get html5lib to go through this path, but it's
|
||||||
|
# just the other side of the previous line.
|
||||||
|
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||||
|
last_childs_last_descendant.next_sibling = None
|
||||||
|
|
||||||
for child in to_append:
|
for child in to_append:
|
||||||
child.parent = new_parent_element
|
child.parent = new_parent_element
|
||||||
|
|
|
@ -52,7 +52,31 @@ from bs4.builder import (
|
||||||
HTMLPARSER = 'html.parser'
|
HTMLPARSER = 'html.parser'
|
||||||
|
|
||||||
class BeautifulSoupHTMLParser(HTMLParser):
|
class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
def handle_starttag(self, name, attrs):
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
|
# Keep a list of empty-element tags that were encountered
|
||||||
|
# without an explicit closing tag. If we encounter a closing tag
|
||||||
|
# of this type, we'll associate it with one of those entries.
|
||||||
|
#
|
||||||
|
# This isn't a stack because we don't care about the
|
||||||
|
# order. It's a list of closing tags we've already handled and
|
||||||
|
# will ignore, assuming they ever show up.
|
||||||
|
self.already_closed_empty_element = []
|
||||||
|
|
||||||
|
def handle_startendtag(self, name, attrs):
|
||||||
|
# This is only called when the markup looks like
|
||||||
|
# <tag/>.
|
||||||
|
|
||||||
|
# is_startend() tells handle_starttag not to close the tag
|
||||||
|
# just because its name matches a known empty-element tag. We
|
||||||
|
# know that this is an empty-element tag and we want to call
|
||||||
|
# handle_endtag ourselves.
|
||||||
|
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||||
|
self.handle_endtag(name)
|
||||||
|
|
||||||
|
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||||
# XXX namespace
|
# XXX namespace
|
||||||
attr_dict = {}
|
attr_dict = {}
|
||||||
for key, value in attrs:
|
for key, value in attrs:
|
||||||
|
@ -62,9 +86,33 @@ class BeautifulSoupHTMLParser(HTMLParser):
|
||||||
value = ''
|
value = ''
|
||||||
attr_dict[key] = value
|
attr_dict[key] = value
|
||||||
attrvalue = '""'
|
attrvalue = '""'
|
||||||
self.soup.handle_starttag(name, None, None, attr_dict)
|
#print "START", name
|
||||||
|
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||||
|
if tag and tag.is_empty_element and handle_empty_element:
|
||||||
|
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||||
|
# events for empty-element tags. (It's handled in
|
||||||
|
# handle_startendtag, but only if the original markup looked like
|
||||||
|
# <tag/>.)
|
||||||
|
#
|
||||||
|
# So we need to call handle_endtag() ourselves. Since we
|
||||||
|
# know the start event is identical to the end event, we
|
||||||
|
# don't want handle_endtag() to cross off any previous end
|
||||||
|
# events for tags of this name.
|
||||||
|
self.handle_endtag(name, check_already_closed=False)
|
||||||
|
|
||||||
def handle_endtag(self, name):
|
# But we might encounter an explicit closing tag for this tag
|
||||||
|
# later on. If so, we want to ignore it.
|
||||||
|
self.already_closed_empty_element.append(name)
|
||||||
|
|
||||||
|
def handle_endtag(self, name, check_already_closed=True):
|
||||||
|
#print "END", name
|
||||||
|
if check_already_closed and name in self.already_closed_empty_element:
|
||||||
|
# This is a redundant end tag for an empty-element tag.
|
||||||
|
# We've already called handle_endtag() for it, so just
|
||||||
|
# check it off the list.
|
||||||
|
# print "ALREADY CLOSED", name
|
||||||
|
self.already_closed_empty_element.remove(name)
|
||||||
|
else:
|
||||||
self.soup.handle_endtag(name)
|
self.soup.handle_endtag(name)
|
||||||
|
|
||||||
def handle_data(self, data):
|
def handle_data(self, data):
|
||||||
|
@ -169,6 +217,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||||
warnings.warn(RuntimeWarning(
|
warnings.warn(RuntimeWarning(
|
||||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||||
raise e
|
raise e
|
||||||
|
parser.already_closed_empty_element = []
|
||||||
|
|
||||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||||
|
|
|
@ -535,9 +535,16 @@ class PageElement(object):
|
||||||
return ResultSet(strainer, result)
|
return ResultSet(strainer, result)
|
||||||
elif isinstance(name, basestring):
|
elif isinstance(name, basestring):
|
||||||
# Optimization to find all tags with a given name.
|
# Optimization to find all tags with a given name.
|
||||||
|
if name.count(':') == 1:
|
||||||
|
# This is a name with a prefix.
|
||||||
|
prefix, name = name.split(':', 1)
|
||||||
|
else:
|
||||||
|
prefix = None
|
||||||
result = (element for element in generator
|
result = (element for element in generator
|
||||||
if isinstance(element, Tag)
|
if isinstance(element, Tag)
|
||||||
and element.name == name)
|
and element.name == name
|
||||||
|
and (prefix is None or element.prefix == prefix)
|
||||||
|
)
|
||||||
return ResultSet(strainer, result)
|
return ResultSet(strainer, result)
|
||||||
results = ResultSet(strainer)
|
results = ResultSet(strainer)
|
||||||
while True:
|
while True:
|
||||||
|
@ -863,7 +870,7 @@ class Tag(PageElement):
|
||||||
Its contents are a copy of the old Tag's contents.
|
Its contents are a copy of the old Tag's contents.
|
||||||
"""
|
"""
|
||||||
clone = type(self)(None, self.builder, self.name, self.namespace,
|
clone = type(self)(None, self.builder, self.name, self.namespace,
|
||||||
self.nsprefix, self.attrs, is_xml=self._is_xml)
|
self.prefix, self.attrs, is_xml=self._is_xml)
|
||||||
for attr in ('can_be_empty_element', 'hidden'):
|
for attr in ('can_be_empty_element', 'hidden'):
|
||||||
setattr(clone, attr, getattr(self, attr))
|
setattr(clone, attr, getattr(self, attr))
|
||||||
for child in self.contents:
|
for child in self.contents:
|
||||||
|
@ -985,6 +992,13 @@ class Tag(PageElement):
|
||||||
attribute."""
|
attribute."""
|
||||||
return self.attrs.get(key, default)
|
return self.attrs.get(key, default)
|
||||||
|
|
||||||
|
def get_attribute_list(self, key, default=None):
|
||||||
|
"""The same as get(), but always returns a list."""
|
||||||
|
value = self.get(key, default)
|
||||||
|
if not isinstance(value, list):
|
||||||
|
value = [value]
|
||||||
|
return value
|
||||||
|
|
||||||
def has_attr(self, key):
|
def has_attr(self, key):
|
||||||
return key in self.attrs
|
return key in self.attrs
|
||||||
|
|
||||||
|
@ -1698,7 +1712,7 @@ class SoupStrainer(object):
|
||||||
"I don't know how to match against a %s" % markup.__class__)
|
"I don't know how to match against a %s" % markup.__class__)
|
||||||
return found
|
return found
|
||||||
|
|
||||||
def _matches(self, markup, match_against):
|
def _matches(self, markup, match_against, already_tried=None):
|
||||||
# print u"Matching %s against %s" % (markup, match_against)
|
# print u"Matching %s against %s" % (markup, match_against)
|
||||||
result = False
|
result = False
|
||||||
if isinstance(markup, list) or isinstance(markup, tuple):
|
if isinstance(markup, list) or isinstance(markup, tuple):
|
||||||
|
@ -1723,6 +1737,7 @@ class SoupStrainer(object):
|
||||||
|
|
||||||
# Custom callables take the tag as an argument, but all
|
# Custom callables take the tag as an argument, but all
|
||||||
# other ways of matching match the tag name as a string.
|
# other ways of matching match the tag name as a string.
|
||||||
|
original_markup = markup
|
||||||
if isinstance(markup, Tag):
|
if isinstance(markup, Tag):
|
||||||
markup = markup.name
|
markup = markup.name
|
||||||
|
|
||||||
|
@ -1733,18 +1748,51 @@ class SoupStrainer(object):
|
||||||
# None matches None, False, an empty string, an empty list, and so on.
|
# None matches None, False, an empty string, an empty list, and so on.
|
||||||
return not match_against
|
return not match_against
|
||||||
|
|
||||||
if isinstance(match_against, unicode):
|
if (hasattr(match_against, '__iter__')
|
||||||
# Exact string match
|
and not isinstance(match_against, basestring)):
|
||||||
return markup == match_against
|
# We're asked to match against an iterable of items.
|
||||||
|
# The markup must match at least one item in the
|
||||||
|
# iterable. We'll try each one in turn.
|
||||||
|
#
|
||||||
|
# To avoid infinite recursion we need to keep track of
|
||||||
|
# items we've already seen.
|
||||||
|
if not already_tried:
|
||||||
|
already_tried = set()
|
||||||
|
for item in match_against:
|
||||||
|
if item.__hash__:
|
||||||
|
key = item
|
||||||
|
else:
|
||||||
|
key = id(item)
|
||||||
|
if key in already_tried:
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
already_tried.add(key)
|
||||||
|
if self._matches(original_markup, item, already_tried):
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
return False
|
||||||
|
|
||||||
if hasattr(match_against, 'match'):
|
# Beyond this point we might need to run the test twice: once against
|
||||||
|
# the tag's name and once against its prefixed name.
|
||||||
|
match = False
|
||||||
|
|
||||||
|
if not match and isinstance(match_against, unicode):
|
||||||
|
# Exact string match
|
||||||
|
match = markup == match_against
|
||||||
|
|
||||||
|
if not match and hasattr(match_against, 'search'):
|
||||||
# Regexp match
|
# Regexp match
|
||||||
return match_against.search(markup)
|
return match_against.search(markup)
|
||||||
|
|
||||||
if hasattr(match_against, '__iter__'):
|
if (not match
|
||||||
# The markup must be an exact match against something
|
and isinstance(original_markup, Tag)
|
||||||
# in the iterable.
|
and original_markup.prefix):
|
||||||
return markup in match_against
|
# Try the whole thing again with the prefixed tag name.
|
||||||
|
return self._matches(
|
||||||
|
original_markup.prefix + ':' + original_markup.name, match_against
|
||||||
|
)
|
||||||
|
|
||||||
|
return match
|
||||||
|
|
||||||
|
|
||||||
class ResultSet(list):
|
class ResultSet(list):
|
||||||
|
@ -1753,3 +1801,8 @@ class ResultSet(list):
|
||||||
def __init__(self, source, result=()):
|
def __init__(self, source, result=()):
|
||||||
super(ResultSet, self).__init__(result)
|
super(ResultSet, self).__init__(result)
|
||||||
self.source = source
|
self.source = source
|
||||||
|
|
||||||
|
def __getattr__(self, key):
|
||||||
|
raise AttributeError(
|
||||||
|
"ResultSet object has no attribute '%s'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?" % key
|
||||||
|
)
|
||||||
|
|
|
@ -69,6 +69,18 @@ class HTMLTreeBuilderSmokeTest(object):
|
||||||
markup in these tests, there's not much room for interpretation.
|
markup in these tests, there's not much room for interpretation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
|
||||||
|
are handled correctly.
|
||||||
|
"""
|
||||||
|
for name in [
|
||||||
|
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||||
|
'spacer', 'frame'
|
||||||
|
]:
|
||||||
|
soup = self.soup("")
|
||||||
|
new_tag = soup.new_tag(name)
|
||||||
|
self.assertEqual(True, new_tag.is_empty_element)
|
||||||
|
|
||||||
def test_pickle_and_unpickle_identity(self):
|
def test_pickle_and_unpickle_identity(self):
|
||||||
# Pickling a tree, then unpickling it, yields a tree identical
|
# Pickling a tree, then unpickling it, yields a tree identical
|
||||||
# to the original.
|
# to the original.
|
||||||
|
@ -330,6 +342,13 @@ Hello, world!
|
||||||
self.assertEqual("p", soup.p.name)
|
self.assertEqual("p", soup.p.name)
|
||||||
self.assertConnectedness(soup)
|
self.assertConnectedness(soup)
|
||||||
|
|
||||||
|
def test_empty_element_tags(self):
|
||||||
|
"""Verify consistent handling of empty-element tags,
|
||||||
|
no matter how they come in through the markup.
|
||||||
|
"""
|
||||||
|
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
|
||||||
|
|
||||||
def test_head_tag_between_head_and_body(self):
|
def test_head_tag_between_head_and_body(self):
|
||||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||||
content = """<html><head></head>
|
content = """<html><head></head>
|
||||||
|
@ -669,6 +688,40 @@ class XMLTreeBuilderSmokeTest(object):
|
||||||
soup = self.soup(markup)
|
soup = self.soup(markup)
|
||||||
self.assertEqual(unicode(soup.foo), markup)
|
self.assertEqual(unicode(soup.foo), markup)
|
||||||
|
|
||||||
|
def test_find_by_prefixed_name(self):
|
||||||
|
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||||
|
<Document xmlns="http://example.com/ns0"
|
||||||
|
xmlns:ns1="http://example.com/ns1"
|
||||||
|
xmlns:ns2="http://example.com/ns2"
|
||||||
|
<ns1:tag>foo</ns1:tag>
|
||||||
|
<ns1:tag>bar</ns1:tag>
|
||||||
|
<ns2:tag key="value">baz</ns2:tag>
|
||||||
|
</Document>
|
||||||
|
"""
|
||||||
|
soup = self.soup(doc)
|
||||||
|
|
||||||
|
# There are three <tag> tags.
|
||||||
|
self.assertEqual(3, len(soup.find_all('tag')))
|
||||||
|
|
||||||
|
# But two of them are ns1:tag and one of them is ns2:tag.
|
||||||
|
self.assertEqual(2, len(soup.find_all('ns1:tag')))
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag')))
|
||||||
|
|
||||||
|
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
|
||||||
|
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
|
||||||
|
|
||||||
|
def test_copy_tag_preserves_namespace(self):
|
||||||
|
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||||
|
<w:document xmlns:w="http://example.com/ns0"/>"""
|
||||||
|
|
||||||
|
soup = self.soup(xml)
|
||||||
|
tag = soup.document
|
||||||
|
duplicate = copy.copy(tag)
|
||||||
|
|
||||||
|
# The two tags have the same namespace prefix.
|
||||||
|
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||||
|
|
||||||
|
|
||||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||||
"""Smoke test for a tree builder that supports HTML5."""
|
"""Smoke test for a tree builder that supports HTML5."""
|
||||||
|
|
||||||
|
|
|
@ -95,6 +95,22 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
assert space1.next_element is tbody1
|
assert space1.next_element is tbody1
|
||||||
assert tbody2.next_element is space2
|
assert tbody2.next_element is space2
|
||||||
|
|
||||||
|
def test_reparented_markup_containing_children(self):
|
||||||
|
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||||
|
soup = self.soup(markup)
|
||||||
|
noscript = soup.noscript
|
||||||
|
self.assertEqual("target", noscript.next_element)
|
||||||
|
target = soup.find(string='target')
|
||||||
|
|
||||||
|
# The 'aftermath' string was duplicated; we want the second one.
|
||||||
|
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||||
|
|
||||||
|
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||||
|
# but the 'target' string within is still connected to the
|
||||||
|
# (second) 'aftermath' string.
|
||||||
|
self.assertEqual(final_aftermath, target.next_element)
|
||||||
|
self.assertEqual(target, final_aftermath.previous_element)
|
||||||
|
|
||||||
def test_processing_instruction(self):
|
def test_processing_instruction(self):
|
||||||
"""Processing instructions become comments."""
|
"""Processing instructions become comments."""
|
||||||
markup = b"""<?PITarget PIContent?>"""
|
markup = b"""<?PITarget PIContent?>"""
|
||||||
|
@ -107,3 +123,8 @@ class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||||
a1, a2 = soup.find_all('a')
|
a1, a2 = soup.find_all('a')
|
||||||
self.assertEqual(a1, a2)
|
self.assertEqual(a1, a2)
|
||||||
assert a1 is not a2
|
assert a1 is not a2
|
||||||
|
|
||||||
|
def test_foster_parenting(self):
|
||||||
|
markup = b"""<table><td></tbody>A"""
|
||||||
|
soup = self.soup(markup)
|
||||||
|
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||||
|
|
|
@ -29,4 +29,6 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||||
loaded = pickle.loads(dumped)
|
loaded = pickle.loads(dumped)
|
||||||
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||||
|
|
||||||
|
def test_redundant_empty_element_closing_tags(self):
|
||||||
|
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||||
|
self.assertSoupEquals('</br></br></br>', "")
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
# -*- coding: utf-8 -*-
|
||||||
"""Tests for Beautiful Soup's tree traversal methods.
|
"""Tests for Beautiful Soup's tree traversal methods.
|
||||||
|
|
||||||
|
@ -234,6 +235,7 @@ class TestFindAllByName(TreeTest):
|
||||||
self.assertEqual('1', r3.string)
|
self.assertEqual('1', r3.string)
|
||||||
self.assertEqual('3', r4.string)
|
self.assertEqual('3', r4.string)
|
||||||
|
|
||||||
|
|
||||||
class TestFindAllByAttribute(TreeTest):
|
class TestFindAllByAttribute(TreeTest):
|
||||||
|
|
||||||
def test_find_all_by_attribute_name(self):
|
def test_find_all_by_attribute_name(self):
|
||||||
|
@ -1284,6 +1286,10 @@ class TestCDAtaListAttributes(SoupTest):
|
||||||
soup = self.soup("<a class='foo\tbar'>")
|
soup = self.soup("<a class='foo\tbar'>")
|
||||||
self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
|
self.assertEqual(b'<a class="foo bar"></a>', soup.a.encode())
|
||||||
|
|
||||||
|
def test_get_attribute_list(self):
|
||||||
|
soup = self.soup("<a id='abc def'>")
|
||||||
|
self.assertEqual(['abc def'], soup.a.get_attribute_list('id'))
|
||||||
|
|
||||||
def test_accept_charset(self):
|
def test_accept_charset(self):
|
||||||
soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
|
soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
|
||||||
self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
|
self.assertEqual(['ISO-8859-1', 'UTF-8'], soup.form['accept-charset'])
|
||||||
|
|
|
@ -57,6 +57,8 @@ class wwt(object):
|
||||||
pagelist = resultpages.findAll("a")
|
pagelist = resultpages.findAll("a")
|
||||||
except:
|
except:
|
||||||
logger.info('No results found for %s' % self.query)
|
logger.info('No results found for %s' % self.query)
|
||||||
|
return
|
||||||
|
|
||||||
pages = []
|
pages = []
|
||||||
for p in pagelist:
|
for p in pagelist:
|
||||||
if p['href'] not in pages:
|
if p['href'] not in pages:
|
||||||
|
|
Loading…
Reference in New Issue