mirror of https://github.com/morpheus65535/bazarr

commit 8227df459a (parent 2b2fd4e8d7)
WIP
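The hunks below port the vendored Beautiful Soup 4.6.0 sources toward Python 2/3 compatibility by routing text-type checks, renamed imports, and syntax through the `six` layer. As a minimal sketch of the recurring idioms (illustration only; the function and variable names here are invented and do not appear in the commit):

# Illustration of the compatibility idioms this commit applies (not part of the diff).
from __future__ import absolute_import, print_function

import six
from six.moves.html_parser import HTMLParser  # HTMLParser on Py2, html.parser on Py3

def classify(value):
    # six.text_type stands in for Python 2 `unicode`; six.string_types for `basestring`.
    if isinstance(value, six.text_type):
        return "unicode text"
    if isinstance(value, six.string_types):
        return "native string"
    return "not a string"

# print() as a function works on both interpreters once print_function is imported.
print(classify(u"caf\xe9"), classify(b"raw bytes"))

try:
    from lxml import etree
except ImportError as e:   # Python 3 spelling; `except ImportError, e:` is Py2-only
    etree = None

parser = HTMLParser()      # same name on either interpreter thanks to six.moves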
@@ -20,6 +20,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+from __future__ import absolute_import
+from __future__ import print_function
+import six
+from six.moves import range
 __author__ = "Leonard Richardson (leonardr@segfault.org)"
 __version__ = "4.6.0"
 __copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
@@ -50,7 +54,7 @@ from .element import (
 
 # The very first thing we do is give a useful error if someone is
 # running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
 
 class BeautifulSoup(Tag):
     """
@@ -142,18 +146,18 @@ class BeautifulSoup(Tag):
         from_encoding = from_encoding or deprecated_argument(
             "fromEncoding", "from_encoding")
 
-        if from_encoding and isinstance(markup, unicode):
+        if from_encoding and isinstance(markup, six.text_type):
             warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
             from_encoding = None
 
         if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
+            arg = list(kwargs.keys()).pop()
            raise TypeError(
                "__init__() got an unexpected keyword argument '%s'" % arg)
 
        if builder is None:
            original_features = features
-            if isinstance(features, basestring):
+            if isinstance(features, six.string_types):
                features = [features]
            if features is None or len(features) == 0:
                features = self.DEFAULT_BUILDER_FEATURES
@@ -191,13 +195,13 @@ class BeautifulSoup(Tag):
             markup = markup.read()
         elif len(markup) <= 256 and (
                 (isinstance(markup, bytes) and not b'<' in markup)
-                or (isinstance(markup, unicode) and not u'<' in markup)
+                or (isinstance(markup, six.text_type) and not u'<' in markup)
         ):
             # Print out warnings for a couple beginner problems
             # involving passing non-markup to Beautiful Soup.
             # Beautiful Soup will still parse the input as markup,
             # just in case that's what the user really wants.
-            if (isinstance(markup, unicode)
+            if (isinstance(markup, six.text_type)
                 and not os.path.supports_unicode_filenames):
                 possible_filename = markup.encode("utf8")
             else:
@@ -205,13 +209,13 @@ class BeautifulSoup(Tag):
             is_file = False
             try:
                 is_file = os.path.exists(possible_filename)
-            except Exception, e:
+            except Exception as e:
                 # This is almost certainly a problem involving
                 # characters not valid in filenames on this
                 # system. Just let it go.
                 pass
             if is_file:
-                if isinstance(markup, unicode):
+                if isinstance(markup, six.text_type):
                     markup = markup.encode("utf8")
                 warnings.warn(
                     '"%s" looks like a filename, not markup. You should'
@@ -263,7 +267,7 @@ class BeautifulSoup(Tag):
         if isinstance(markup, bytes):
             space = b' '
             cant_start_with = (b"http:", b"https:")
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, six.text_type):
             space = u' '
             cant_start_with = (u"http:", u"https:")
         else:
@@ -526,4 +530,4 @@ class FeatureNotFound(ValueError):
 if __name__ == '__main__':
     import sys
     soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
+    print(soup.prettify())
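One change above replaces `arg = kwargs.keys().pop()` with `arg = list(kwargs.keys()).pop()`. The reason is that Python 3 `dict.keys()` returns a view object that has no `pop()` method; wrapping it in `list()` works on both interpreters. A tiny illustration (the dictionary contents are invented for the example):

# Why the diff wraps dict.keys() in list(): Python 3 keys() returns a view,
# and a dict_keys view has no pop() method.
kwargs = {"bogus_argument": 1}          # invented contents, for illustration only

# arg = kwargs.keys().pop()             # works on Python 2, AttributeError on Python 3
arg = list(kwargs.keys()).pop()         # works on both
print(arg)                              # -> bogus_argument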
@@ -1,6 +1,7 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+from __future__ import absolute_import
 from collections import defaultdict
 import itertools
 import sys
@@ -10,6 +11,7 @@ from bs4.element import (
     HTMLAwareEntitySubstitution,
     whitespace_re
     )
+import six
 
 __all__ = [
     'HTMLTreeBuilder',
@@ -166,7 +168,7 @@ class TreeBuilder(object):
                     # value is a whitespace-separated list of
                     # values. Split it into a list.
                     value = attrs[attr]
-                    if isinstance(value, basestring):
+                    if isinstance(value, six.string_types):
                         values = whitespace_re.split(value)
                     else:
                         # html5lib sometimes calls setAttributes twice
@@ -1,6 +1,8 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+from __future__ import absolute_import
+import six
 __all__ = [
     'HTML5TreeBuilder',
     ]
@@ -33,7 +35,7 @@ try:
     # Pre-0.99999999
     from html5lib.treebuilders import _base as treebuilder_base
     new_html5lib = False
-except ImportError, e:
+except ImportError as e:
     # 0.99999999 and up
     from html5lib.treebuilders import base as treebuilder_base
     new_html5lib = True
@@ -64,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         parser = html5lib.HTMLParser(tree=self.create_treebuilder)
 
         extra_kwargs = dict()
-        if not isinstance(markup, unicode):
+        if not isinstance(markup, six.text_type):
             if new_html5lib:
                 extra_kwargs['override_encoding'] = self.user_specified_encoding
             else:
@@ -72,13 +74,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
         doc = parser.parse(markup, **extra_kwargs)
 
         # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
+        if isinstance(markup, six.text_type):
             # We need to special-case this because html5lib sets
             # charEncoding to UTF-8 if it gets Unicode input.
             doc.original_encoding = None
         else:
             original_encoding = parser.tokenizer.stream.charEncoding[0]
-            if not isinstance(original_encoding, basestring):
+            if not isinstance(original_encoding, six.string_types):
                 # In 0.99999999 and up, the encoding is an html5lib
                 # Encoding object. We want to use a string for compatibility
                 # with other tree builders.
@@ -229,7 +231,7 @@ class Element(treebuilder_base.Node):
 
     def appendChild(self, node):
         string_child = child = None
-        if isinstance(node, basestring):
+        if isinstance(node, six.string_types):
            # Some other piece of code decided to pass in a string
            # instead of creating a TextElement object to contain the
            # string.
@@ -246,7 +248,7 @@ class Element(treebuilder_base.Node):
             child = node.element
             node.parent = self
 
-        if not isinstance(child, basestring) and child.parent is not None:
+        if not isinstance(child, six.string_types) and child.parent is not None:
             node.element.extract()
 
         if (string_child and self.element.contents
@@ -259,7 +261,7 @@ class Element(treebuilder_base.Node):
             old_element.replace_with(new_element)
             self.soup._most_recent_element = new_element
         else:
-            if isinstance(node, basestring):
+            if isinstance(node, six.string_types):
                 # Create a brand new NavigableString from this string.
                 child = self.soup.new_string(node)
 
@@ -3,15 +3,18 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
+from __future__ import absolute_import
+from six import unichr
+import six
 __all__ = [
     'HTMLParserTreeBuilder',
     ]
 
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
 
 try:
-    from HTMLParser import HTMLParseError
+    from six.moves.html_parser import HTMLParseError
-except ImportError, e:
+except ImportError as e:
     # HTMLParseError is removed in Python 3.5. Since it can never be
     # thrown in 3.5, we can just define our own class as a placeholder.
     class HTMLParseError(Exception):
@@ -131,7 +134,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
 
         try:
             data = unichr(real_name)
-        except (ValueError, OverflowError), e:
+        except (ValueError, OverflowError) as e:
             data = u"\N{REPLACEMENT CHARACTER}"
 
         self.handle_data(data)
@@ -196,7 +199,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         declared within markup, whether any characters had to be
         replaced with REPLACEMENT CHARACTER).
         """
-        if isinstance(markup, unicode):
+        if isinstance(markup, six.text_type):
             yield (markup, None, None, False)
             return
 
@@ -213,7 +216,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
         parser.soup = self.soup
         try:
             parser.feed(markup)
-        except HTMLParseError, e:
+        except HTMLParseError as e:
             warnings.warn(RuntimeWarning(
                 "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
             raise e
@@ -1,5 +1,7 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
+from __future__ import absolute_import
+import six
 __all__ = [
     'LXMLTreeBuilderForXML',
     'LXMLTreeBuilder',
@@ -101,12 +103,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
         else:
             self.processing_instruction_class = XMLProcessingInstruction
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, six.text_type):
             # We were given Unicode. Maybe lxml can parse Unicode on
             # this system?
             yield markup, None, document_declared_encoding, False
 
-        if isinstance(markup, unicode):
+        if isinstance(markup, six.text_type):
             # No, apparently not. Convert the Unicode to UTF-8 and
             # tell lxml to parse it as UTF-8.
             yield (markup.encode("utf8"), "utf8",
@@ -121,7 +123,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
     def feed(self, markup):
         if isinstance(markup, bytes):
             markup = BytesIO(markup)
-        elif isinstance(markup, unicode):
+        elif isinstance(markup, six.text_type):
             markup = StringIO(markup)
 
         # Call feed() at least once, even if the markup is empty,
@@ -136,7 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
                 if len(data) != 0:
                     self.parser.feed(data)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))
 
     def close(self):
@@ -249,7 +251,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
             self.parser = self.parser_for(encoding)
             self.parser.feed(markup)
             self.parser.close()
-        except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
             raise ParserRejectedMarkup(str(e))
 
 
@@ -8,10 +8,13 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
 """
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
+from __future__ import absolute_import
+from six import unichr
+import six
 __license__ = "MIT"
 
 import codecs
-from htmlentitydefs import codepoint2name
+from six.moves.html_entities import codepoint2name
 import re
 import logging
 import string
@@ -274,7 +277,7 @@ class EncodingDetector:
     def strip_byte_order_mark(cls, data):
         """If a byte-order mark is present, strip it and return the encoding it implies."""
         encoding = None
-        if isinstance(data, unicode):
+        if isinstance(data, six.text_type):
             # Unicode data cannot have a byte-order mark.
             return data, encoding
         if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -352,9 +355,9 @@ class UnicodeDammit:
             markup, override_encodings, is_html, exclude_encodings)
 
         # Short-circuit if the data is in Unicode to begin with.
-        if isinstance(markup, unicode) or markup == '':
+        if isinstance(markup, six.text_type) or markup == '':
             self.markup = markup
-            self.unicode_markup = unicode(markup)
+            self.unicode_markup = six.text_type(markup)
             self.original_encoding = None
             return
 
@@ -438,7 +441,7 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
-        return unicode(data, encoding, errors)
+        return six.text_type(data, encoding, errors)
 
     @property
     def declared_html_encoding(self):
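The import rewrites above and below lean on `six.moves`, which aliases the standard-library modules that were renamed in Python 3 (`htmlentitydefs` became `html.entities`, `HTMLParser` became `html.parser`) and re-exports `unichr`. A short sketch, assuming only that `six` is installed:

# six.moves aliases the stdlib modules that were renamed in Python 3,
# so a single import line serves both interpreters.
from __future__ import print_function

from six import unichr                               # unichr() on Py2, chr() on Py3
from six.moves.html_entities import codepoint2name   # htmlentitydefs / html.entities
from six.moves.html_parser import HTMLParser         # HTMLParser / html.parser

print(codepoint2name[0xA0])     # -> nbsp
print(repr(unichr(0xE9)))       # e-acute, as a one-character text string

parser = HTMLParser()           # same class under either interpreter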
@@ -2,11 +2,15 @@
 
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
+from __future__ import absolute_import
+from __future__ import print_function
+from six.moves import map
+from six.moves import range
 __license__ = "MIT"
 
 import cProfile
 from StringIO import StringIO
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
 import bs4
 from bs4 import BeautifulSoup, __version__
 from bs4.builder import builder_registry
@@ -22,8 +26,8 @@ import cProfile
 
 def diagnose(data):
     """Diagnostic suite for isolating common problems."""
-    print "Diagnostic running on Beautiful Soup %s" % __version__
-    print "Python version %s" % sys.version
+    print("Diagnostic running on Beautiful Soup %s" % __version__)
+    print("Python version %s" % sys.version)
 
     basic_parsers = ["html.parser", "html5lib", "lxml"]
     for name in basic_parsers:
@@ -32,16 +36,16 @@ def diagnose(data):
                 break
         else:
             basic_parsers.remove(name)
-            print (
+            print((
                 "I noticed that %s is not installed. Installing it may help." %
-                name)
+                name))
 
     if 'lxml' in basic_parsers:
         basic_parsers.append(["lxml", "xml"])
         try:
             from lxml import etree
-            print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
-        except ImportError, e:
+            print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
+        except ImportError as e:
             print (
                 "lxml is not installed or couldn't be imported.")
 
@@ -49,37 +53,37 @@ def diagnose(data):
     if 'html5lib' in basic_parsers:
         try:
             import html5lib
-            print "Found html5lib version %s" % html5lib.__version__
-        except ImportError, e:
+            print("Found html5lib version %s" % html5lib.__version__)
+        except ImportError as e:
             print (
                 "html5lib is not installed or couldn't be imported.")
 
     if hasattr(data, 'read'):
         data = data.read()
     elif os.path.exists(data):
-        print '"%s" looks like a filename. Reading data from the file.' % data
+        print('"%s" looks like a filename. Reading data from the file.' % data)
         with open(data) as fp:
             data = fp.read()
     elif data.startswith("http:") or data.startswith("https:"):
-        print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
-        print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+        print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
+        print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
         return
-    print
+    print()
 
     for parser in basic_parsers:
-        print "Trying to parse your markup with %s" % parser
+        print("Trying to parse your markup with %s" % parser)
         success = False
         try:
             soup = BeautifulSoup(data, parser)
             success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
             traceback.print_exc()
         if success:
-            print "Here's what %s did with the markup:" % parser
-            print soup.prettify()
+            print("Here's what %s did with the markup:" % parser)
+            print(soup.prettify())
 
-        print "-" * 80
+        print("-" * 80)
 
 def lxml_trace(data, html=True, **kwargs):
     """Print out the lxml events that occur during parsing.
@@ -89,7 +93,7 @@ def lxml_trace(data, html=True, **kwargs):
     """
     from lxml import etree
     for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-        print("%s, %4s, %s" % (event, element.tag, element.text))
+        print(("%s, %4s, %s" % (event, element.tag, element.text)))
 
 class AnnouncingParser(HTMLParser):
     """Announces HTMLParser parse events, without doing anything else."""
@@ -171,9 +175,9 @@ def rdoc(num_elements=1000):
 
 def benchmark_parsers(num_elements=100000):
     """Very basic head-to-head performance benchmark."""
-    print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+    print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
     data = rdoc(num_elements)
-    print "Generated a large invalid HTML document (%d bytes)." % len(data)
+    print("Generated a large invalid HTML document (%d bytes)." % len(data))
 
     for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
         success = False
@@ -182,24 +186,24 @@ def benchmark_parsers(num_elements=100000):
             soup = BeautifulSoup(data, parser)
             b = time.time()
             success = True
-        except Exception, e:
-            print "%s could not parse the markup." % parser
+        except Exception as e:
+            print("%s could not parse the markup." % parser)
             traceback.print_exc()
         if success:
-            print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+            print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
 
     from lxml import etree
     a = time.time()
     etree.HTML(data)
     b = time.time()
-    print "Raw lxml parsed the markup in %.2fs." % (b-a)
+    print("Raw lxml parsed the markup in %.2fs." % (b-a))
 
     import html5lib
     parser = html5lib.HTMLParser()
     a = time.time()
     parser.parse(data)
     b = time.time()
-    print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+    print("Raw html5lib parsed the markup in %.2fs." % (b-a))
 
 def profile(num_elements=100000, parser="lxml"):
 
@@ -1,5 +1,8 @@
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
+from __future__ import absolute_import
+from __future__ import print_function
+import six
 __license__ = "MIT"
 
 import collections
@@ -26,22 +29,22 @@ def _alias(attr):
     return alias
 
 
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(six.text_type):
 
     def __new__(cls, prefix, name, namespace=None):
         if name is None:
-            obj = unicode.__new__(cls, prefix)
+            obj = six.text_type.__new__(cls, prefix)
         elif prefix is None:
             # Not really namespaced.
-            obj = unicode.__new__(cls, name)
+            obj = six.text_type.__new__(cls, name)
         else:
-            obj = unicode.__new__(cls, prefix + ":" + name)
+            obj = six.text_type.__new__(cls, prefix + ":" + name)
         obj.prefix = prefix
         obj.name = name
         obj.namespace = namespace
         return obj
 
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(six.text_type):
     """A stand-in object for a character encoding specified in HTML."""
 
 class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -52,7 +55,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
     """
 
     def __new__(cls, original_value):
-        obj = unicode.__new__(cls, original_value)
+        obj = six.text_type.__new__(cls, original_value)
         obj.original_value = original_value
         return obj
 
@@ -75,9 +78,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
         match = cls.CHARSET_RE.search(original_value)
         if match is None:
             # No substitution necessary.
-            return unicode.__new__(unicode, original_value)
+            return six.text_type.__new__(six.text_type, original_value)
 
-        obj = unicode.__new__(cls, original_value)
+        obj = six.text_type.__new__(cls, original_value)
         obj.original_value = original_value
         return obj
 
@@ -312,7 +315,7 @@ class PageElement(object):
             raise ValueError("Cannot insert None into a tag.")
         if new_child is self:
             raise ValueError("Cannot insert a tag into itself.")
-        if (isinstance(new_child, basestring)
+        if (isinstance(new_child, six.string_types)
             and not isinstance(new_child, NavigableString)):
             new_child = NavigableString(new_child)
 
@@ -533,7 +536,7 @@ class PageElement(object):
             result = (element for element in generator
                       if isinstance(element, Tag))
             return ResultSet(strainer, result)
-        elif isinstance(name, basestring):
+        elif isinstance(name, six.string_types):
             # Optimization to find all tags with a given name.
             if name.count(':') == 1:
                 # This is a name with a prefix.
@@ -691,7 +694,7 @@ class PageElement(object):
         return self.parents
 
 
-class NavigableString(unicode, PageElement):
+class NavigableString(six.text_type, PageElement):
 
     PREFIX = ''
     SUFFIX = ''
@@ -709,10 +712,10 @@ class NavigableString(unicode, PageElement):
         passed in to the superclass's __new__ or the superclass won't know
         how to handle non-ASCII characters.
         """
-        if isinstance(value, unicode):
-            u = unicode.__new__(cls, value)
+        if isinstance(value, six.text_type):
+            u = six.text_type.__new__(cls, value)
         else:
-            u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+            u = six.text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
         u.setup()
         return u
 
@@ -723,7 +726,7 @@ class NavigableString(unicode, PageElement):
         return type(self)(self)
 
     def __getnewargs__(self):
-        return (unicode(self),)
+        return (six.text_type(self),)
 
     def __getattr__(self, attr):
         """text.string gives you text. This is for backwards
@@ -1142,8 +1145,8 @@ class Tag(PageElement):
             else:
                 if isinstance(val, list) or isinstance(val, tuple):
                     val = ' '.join(val)
-                elif not isinstance(val, basestring):
-                    val = unicode(val)
+                elif not isinstance(val, six.string_types):
+                    val = six.text_type(val)
                 elif (
                     isinstance(val, AttributeValueWithCharsetSubstitution)
                     and eventual_encoding is not None):
@@ -1151,7 +1154,7 @@ class Tag(PageElement):
 
                     text = self.format_string(val, formatter)
                     decoded = (
-                        unicode(key) + '='
+                        six.text_type(key) + '='
                         + EntitySubstitution.quoted_attribute_value(text))
                 attrs.append(decoded)
         close = ''
@@ -1368,7 +1371,7 @@ class Tag(PageElement):
                 'Final combinator "%s" is missing an argument.' % tokens[-1])
 
         if self._select_debug:
-            print 'Running CSS selector "%s"' % selector
+            print('Running CSS selector "%s"' % selector)
 
         for index, token in enumerate(tokens):
             new_context = []
@@ -1377,11 +1380,11 @@ class Tag(PageElement):
             if tokens[index-1] in self._selector_combinators:
                 # This token was consumed by the previous combinator. Skip it.
                 if self._select_debug:
-                    print ' Token was consumed by the previous combinator.'
+                    print(' Token was consumed by the previous combinator.')
                 continue
 
             if self._select_debug:
-                print ' Considering token "%s"' % token
+                print(' Considering token "%s"' % token)
             recursive_candidate_generator = None
             tag_name = None
 
@@ -1488,14 +1491,14 @@ class Tag(PageElement):
                 next_token = tokens[index+1]
                 def recursive_select(tag):
                     if self._select_debug:
-                        print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
-                        print '-' * 40
+                        print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
+                        print('-' * 40)
                     for i in tag.select(next_token, recursive_candidate_generator):
                         if self._select_debug:
-                            print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+                            print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
                         yield i
                     if self._select_debug:
-                        print '-' * 40
+                        print('-' * 40)
                 _use_candidate_generator = recursive_select
             elif _candidate_generator is None:
                 # By default, a tag's candidates are all of its
@@ -1506,7 +1509,7 @@ class Tag(PageElement):
                         check = "[any]"
                     else:
                         check = tag_name
-                    print ' Default candidate generator, tag name="%s"' % check
+                    print(' Default candidate generator, tag name="%s"' % check)
                 if self._select_debug:
                     # This is redundant with later code, but it stops
                     # a bunch of bogus tags from cluttering up the
@@ -1527,8 +1530,8 @@ class Tag(PageElement):
             count = 0
             for tag in current_context:
                 if self._select_debug:
-                    print " Running candidate generator on %s %s" % (
-                        tag.name, repr(tag.attrs))
+                    print(" Running candidate generator on %s %s" % (
+                        tag.name, repr(tag.attrs)))
                 for candidate in _use_candidate_generator(tag):
                     if not isinstance(candidate, Tag):
                         continue
@@ -1543,23 +1546,23 @@ class Tag(PageElement):
                             break
                     if checker is None or result:
                         if self._select_debug:
-                            print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+                            print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
                         if id(candidate) not in new_context_ids:
                             # If a tag matches a selector more than once,
                            # don't include it in the context more than once.
                            new_context.append(candidate)
                            new_context_ids.add(id(candidate))
                    elif self._select_debug:
-                        print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+                        print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
 
             current_context = new_context
             if limit and len(current_context) >= limit:
                 current_context = current_context[:limit]
 
         if self._select_debug:
-            print "Final verdict:"
+            print("Final verdict:")
             for i in current_context:
-                print " %s %s" % (i.name, i.attrs)
+                print(" %s %s" % (i.name, i.attrs))
         return current_context
 
 # Old names for backwards compatibility
@@ -1612,7 +1615,7 @@ class SoupStrainer(object):
     def _normalize_search_value(self, value):
         # Leave it alone if it's a Unicode string, a callable, a
         # regular expression, a boolean, or None.
-        if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+        if (isinstance(value, six.text_type) or callable(value) or hasattr(value, 'match')
             or isinstance(value, bool) or value is None):
             return value
 
@@ -1625,7 +1628,7 @@ class SoupStrainer(object):
             new_value = []
             for v in value:
                 if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-                    and not isinstance(v, unicode)):
+                    and not isinstance(v, six.text_type)):
                     # This is almost certainly the user's mistake. In the
                     # interests of avoiding infinite loops, we'll let
                     # it through as-is rather than doing a recursive call.
@@ -1637,7 +1640,7 @@ class SoupStrainer(object):
         # Otherwise, convert it into a Unicode string.
         # The unicode(str()) thing is so this will do the same thing on Python 2
        # and Python 3.
-        return unicode(str(value))
+        return six.text_type(str(value))
 
     def __str__(self):
         if self.text:
@@ -1691,7 +1694,7 @@ class SoupStrainer(object):
         found = None
         # If given a list of items, scan it for a text element that
         # matches.
-        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+        if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, six.string_types)):
             for element in markup:
                 if isinstance(element, NavigableString) \
                        and self.search(element):
@@ -1704,7 +1707,7 @@ class SoupStrainer(object):
             found = self.search_tag(markup)
         # If it's text, make sure the text matches.
         elif isinstance(markup, NavigableString) or \
-                 isinstance(markup, basestring):
+                 isinstance(markup, six.string_types):
             if not self.name and not self.attrs and self._matches(markup, self.text):
                 found = markup
         else:
@@ -1749,7 +1752,7 @@ class SoupStrainer(object):
             return not match_against
 
         if (hasattr(match_against, '__iter__')
-            and not isinstance(match_against, basestring)):
+            and not isinstance(match_against, six.string_types)):
            # We're asked to match against an iterable of items.
            # The markup must be match at least one item in the
            # iterable. We'll try each one in turn.
@@ -1776,7 +1779,7 @@ class SoupStrainer(object):
             # the tag's name and once against its prefixed name.
             match = False
 
-            if not match and isinstance(match_against, unicode):
+            if not match and isinstance(match_against, six.text_type):
                 # Exact string match
                 match = markup == match_against
 
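Several hunks above change classes such as NamespacedAttribute and NavigableString to inherit from `six.text_type` instead of `unicode`, so the same subclass derives from `unicode` on Python 2 and `str` on Python 3. A hedged sketch of the pattern (the `PrefixedName` class is invented for illustration and is not part of bs4):

# Subclassing six.text_type: `unicode` on Python 2, `str` on Python 3.
from __future__ import print_function

import six

class PrefixedName(six.text_type):
    """Illustrative stand-in for the NamespacedAttribute pattern."""

    def __new__(cls, prefix, name):
        # The string value is fixed in __new__ because text types are immutable.
        obj = six.text_type.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        return obj

tag_name = PrefixedName("xsi", "schemaLocation")
print(tag_name, tag_name.prefix)   # -> xsi:schemaLocation xsi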
@@ -2,6 +2,8 @@
 
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
+from __future__ import absolute_import
+import six
 __license__ = "MIT"
 
 import pickle
@@ -645,7 +647,7 @@ class XMLTreeBuilderSmokeTest(object):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
         self.assertEqual(
-            unicode(soup.rss), markup)
+            six.text_type(soup.rss), markup)
 
     def test_docstring_includes_correct_encoding(self):
         soup = self.soup("<root/>")
@@ -676,17 +678,17 @@ class XMLTreeBuilderSmokeTest(object):
     def test_closing_namespaced_tag(self):
         markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.p), markup)
+        self.assertEqual(six.text_type(soup.p), markup)
 
     def test_namespaced_attributes(self):
         markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(six.text_type(soup.foo), markup)
 
     def test_namespaced_attributes_xml_namespace(self):
         markup = '<foo xml:lang="fr">bar</foo>'
         soup = self.soup(markup)
-        self.assertEqual(unicode(soup.foo), markup)
+        self.assertEqual(six.text_type(soup.foo), markup)
 
     def test_find_by_prefixed_name(self):
         doc = """<?xml version="1.0" encoding="utf-8"?>
@@ -1,5 +1,6 @@
 """Tests of the builder registry."""
 
+from __future__ import absolute_import
 import unittest
 import warnings
 
@@ -2,6 +2,7 @@
 
 # pylint: disable-msg=E0611,W0142
 
+from __future__ import absolute_import
 __metaclass__ = type
 __all__ = [
     'additional_tests',
@@ -1,11 +1,12 @@
 """Tests to ensure that the html5lib tree builder generates good trees."""
 
+from __future__ import absolute_import
 import warnings
 
 try:
     from bs4.builder import HTML5TreeBuilder
     HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     HTML5LIB_PRESENT = False
 from bs4.element import SoupStrainer
 from bs4.testing import (
@@ -1,6 +1,7 @@
 """Tests to ensure that the html.parser tree builder generates good
 trees."""
 
+from __future__ import absolute_import
 from pdb import set_trace
 import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
@@ -1,13 +1,15 @@
 """Tests to ensure that the lxml tree builder generates good trees."""
 
+from __future__ import absolute_import
 import re
 import warnings
+import six
 
 try:
     import lxml.etree
     LXML_PRESENT = True
     LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
     LXML_VERSION = (0,)
 
@@ -62,7 +64,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         # if one is installed.
         with warnings.catch_warnings(record=True) as w:
             soup = BeautifulStoneSoup("<b />")
-        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertEqual(u"<b/>", six.text_type(soup.b))
         self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
     @skipIf(
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """Tests of Beautiful Soup as a whole."""
 
+from __future__ import absolute_import
 from pdb import set_trace
 import logging
 import unittest
@@ -28,11 +29,12 @@ from bs4.testing import (
     skipIf,
 )
 import warnings
+import six
 
 try:
     from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
     LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
     LXML_PRESENT = False
 
 PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -250,7 +252,7 @@ class TestEncodingConversion(SoupTest):
             ascii = b"<foo>a</foo>"
             soup_from_ascii = self.soup(ascii)
             unicode_output = soup_from_ascii.decode()
-            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertTrue(isinstance(unicode_output, six.text_type))
             self.assertEqual(unicode_output, self.document_for(ascii.decode()))
             self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
         finally:
@@ -10,6 +10,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
 methods tested here.
 """
 
+from __future__ import absolute_import
 from pdb import set_trace
 import copy
 import pickle
@@ -34,6 +35,7 @@ from bs4.testing import (
     SoupTest,
     skipIf,
 )
+import six
 
 XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
 LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1111,7 +1113,7 @@ class TestTreeModification(SoupTest):
 <script>baz</script>
 </html>""")
         [soup.script.extract() for i in soup.find_all("script")]
-        self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+        self.assertEqual("<body>\n\n<a></a>\n</body>", six.text_type(soup.body))
 
 
     def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@@ -1349,7 +1351,7 @@ class TestPersistence(SoupTest):
         soup = BeautifulSoup(b'<p> </p>', 'html.parser')
         encoding = soup.original_encoding
         copy = soup.__copy__()
-        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual(u"<p> </p>", six.text_type(copy))
         self.assertEqual(encoding, copy.original_encoding)
 
     def test_unicode_pickle(self):
@@ -1393,7 +1395,7 @@ class TestPersistence(SoupTest):
         div_copy = copy.copy(div)
 
         # The two tags look the same, and evaluate to equal.
-        self.assertEqual(unicode(div), unicode(div_copy))
+        self.assertEqual(six.text_type(div), six.text_type(div_copy))
         self.assertEqual(div, div_copy)
 
         # But they're not the same object.
@@ -1505,7 +1507,7 @@ class TestSubstitutions(SoupTest):
 
     def test_prettify_outputs_unicode_by_default(self):
         soup = self.soup("<a></a>")
-        self.assertEqual(unicode, type(soup.prettify()))
+        self.assertEqual(six.text_type, type(soup.prettify()))
 
     def test_prettify_can_encode_data(self):
         soup = self.soup("<a></a>")