Louis Vézina 2019-09-24 06:23:11 -04:00
parent 2b2fd4e8d7
commit 8227df459a
16 changed files with 148 additions and 113 deletions

bs4/__init__.py

@@ -20,6 +4,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+from __future__ import print_function
+import six
+from six.moves import range
__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
@@ -50,7 +54,7 @@ from .element import (
# The very first thing we do is give a useful error if someone is
# running this code under Python 3 without converting it.
-'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
+'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
class BeautifulSoup(Tag):
"""
@@ -142,18 +146,18 @@ class BeautifulSoup(Tag):
from_encoding = from_encoding or deprecated_argument(
"fromEncoding", "from_encoding")
-if from_encoding and isinstance(markup, unicode):
+if from_encoding and isinstance(markup, six.text_type):
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
from_encoding = None
if len(kwargs) > 0:
-arg = kwargs.keys().pop()
+arg = list(kwargs.keys()).pop()
raise TypeError(
"__init__() got an unexpected keyword argument '%s'" % arg)
if builder is None:
original_features = features
-if isinstance(features, basestring):
+if isinstance(features, six.string_types):
features = [features]
if features is None or len(features) == 0:
features = self.DEFAULT_BUILDER_FEATURES
@@ -191,13 +195,13 @@ class BeautifulSoup(Tag):
markup = markup.read()
elif len(markup) <= 256 and (
(isinstance(markup, bytes) and not b'<' in markup)
-or (isinstance(markup, unicode) and not u'<' in markup)
+or (isinstance(markup, six.text_type) and not u'<' in markup)
):
# Print out warnings for a couple beginner problems
# involving passing non-markup to Beautiful Soup.
# Beautiful Soup will still parse the input as markup,
# just in case that's what the user really wants.
-if (isinstance(markup, unicode)
+if (isinstance(markup, six.text_type)
and not os.path.supports_unicode_filenames):
possible_filename = markup.encode("utf8")
else:
@@ -205,13 +209,13 @@ class BeautifulSoup(Tag):
is_file = False
try:
is_file = os.path.exists(possible_filename)
-except Exception, e:
+except Exception as e:
# This is almost certainly a problem involving
# characters not valid in filenames on this
# system. Just let it go.
pass
if is_file:
-if isinstance(markup, unicode):
+if isinstance(markup, six.text_type):
markup = markup.encode("utf8")
warnings.warn(
'"%s" looks like a filename, not markup. You should'
@@ -263,7 +267,7 @@ class BeautifulSoup(Tag):
if isinstance(markup, bytes):
space = b' '
cant_start_with = (b"http:", b"https:")
-elif isinstance(markup, unicode):
+elif isinstance(markup, six.text_type):
space = u' '
cant_start_with = (u"http:", u"https:")
else:
@@ -526,4 +530,4 @@ class FeatureNotFound(ValueError):
if __name__ == '__main__':
import sys
soup = BeautifulSoup(sys.stdin)
-print soup.prettify()
+print(soup.prettify())
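Note (not part of the diff): a minimal sketch of how the six aliases used throughout this file behave on either interpreter, and why kwargs.keys() now needs a list() wrapper; the values are illustrative only.

import six

# six.text_type is unicode on Python 2 and str on Python 3;
# six.string_types is (basestring,) on Python 2 and (str,) on Python 3.
print(isinstance(u"markup", six.text_type))      # True on both interpreters
print(isinstance("features", six.string_types))  # True on both interpreters

# On Python 3, dict.keys() returns a view with no pop(); wrapping it in
# list() keeps "grab an arbitrary key" working on both interpreters.
kwargs = {"unexpected": 1}
arg = list(kwargs.keys()).pop()
print(arg)  # unexpected

# "except Exception, e" is Python 2-only syntax; "as e" parses everywhere.
try:
    raise ValueError("boom")
except ValueError as e:
    print(e)  # boom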

bs4/builder/__init__.py

@@ -1,6 +1,7 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
from collections import defaultdict
import itertools
import sys
@@ -10,6 +11,7 @@ from bs4.element import (
HTMLAwareEntitySubstitution,
whitespace_re
)
+import six
__all__ = [
'HTMLTreeBuilder',
@@ -166,7 +168,7 @@ class TreeBuilder(object):
# value is a whitespace-separated list of
# values. Split it into a list.
value = attrs[attr]
-if isinstance(value, basestring):
+if isinstance(value, six.string_types):
values = whitespace_re.split(value)
else:
# html5lib sometimes calls setAttributes twice

bs4/builder/_html5lib.py

@@ -1,6 +1,8 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+import six
__all__ = [
'HTML5TreeBuilder',
]
@@ -33,7 +35,7 @@ try:
# Pre-0.99999999
from html5lib.treebuilders import _base as treebuilder_base
new_html5lib = False
-except ImportError, e:
+except ImportError as e:
# 0.99999999 and up
from html5lib.treebuilders import base as treebuilder_base
new_html5lib = True
@@ -64,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
extra_kwargs = dict()
-if not isinstance(markup, unicode):
+if not isinstance(markup, six.text_type):
if new_html5lib:
extra_kwargs['override_encoding'] = self.user_specified_encoding
else:
@@ -72,13 +74,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder):
doc = parser.parse(markup, **extra_kwargs)
# Set the character encoding detected by the tokenizer.
-if isinstance(markup, unicode):
+if isinstance(markup, six.text_type):
# We need to special-case this because html5lib sets
# charEncoding to UTF-8 if it gets Unicode input.
doc.original_encoding = None
else:
original_encoding = parser.tokenizer.stream.charEncoding[0]
-if not isinstance(original_encoding, basestring):
+if not isinstance(original_encoding, six.string_types):
# In 0.99999999 and up, the encoding is an html5lib
# Encoding object. We want to use a string for compatibility
# with other tree builders.
@@ -229,7 +231,7 @@ class Element(treebuilder_base.Node):
def appendChild(self, node):
string_child = child = None
-if isinstance(node, basestring):
+if isinstance(node, six.string_types):
# Some other piece of code decided to pass in a string
# instead of creating a TextElement object to contain the
# string.
@@ -246,7 +248,7 @@ class Element(treebuilder_base.Node):
child = node.element
node.parent = self
-if not isinstance(child, basestring) and child.parent is not None:
+if not isinstance(child, six.string_types) and child.parent is not None:
node.element.extract()
if (string_child and self.element.contents
@@ -259,7 +261,7 @@ class Element(treebuilder_base.Node):
old_element.replace_with(new_element)
self.soup._most_recent_element = new_element
else:
-if isinstance(node, basestring):
+if isinstance(node, six.string_types):
# Create a brand new NavigableString from this string.
child = self.soup.new_string(node)

bs4/builder/_htmlparser.py

@@ -3,15 +3,18 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+from six import unichr
+import six
__all__ = [
'HTMLParserTreeBuilder',
]
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
try:
-from HTMLParser import HTMLParseError
+from six.moves.html_parser import HTMLParseError
-except ImportError, e:
+except ImportError as e:
# HTMLParseError is removed in Python 3.5. Since it can never be
# thrown in 3.5, we can just define our own class as a placeholder.
class HTMLParseError(Exception):
@@ -131,7 +134,7 @@ class BeautifulSoupHTMLParser(HTMLParser):
try:
data = unichr(real_name)
-except (ValueError, OverflowError), e:
+except (ValueError, OverflowError) as e:
data = u"\N{REPLACEMENT CHARACTER}"
self.handle_data(data)
@@ -196,7 +199,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
declared within markup, whether any characters had to be
replaced with REPLACEMENT CHARACTER).
"""
-if isinstance(markup, unicode):
+if isinstance(markup, six.text_type):
yield (markup, None, None, False)
return
@@ -213,7 +216,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder):
parser.soup = self.soup
try:
parser.feed(markup)
-except HTMLParseError, e:
+except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
raise e
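Note (not part of the diff): a small sketch of what the six.moves.html_parser import resolves to; TitleGrabber is a made-up example class, not Beautiful Soup code.

# six.moves.html_parser points at Python 2's "HTMLParser" module or
# Python 3's "html.parser" module, so one import works under both.
from six.moves.html_parser import HTMLParser

class TitleGrabber(HTMLParser):
    """Collect the text of the first <title> element."""
    def __init__(self):
        HTMLParser.__init__(self)  # direct call keeps Python 2's old-style class happy
        self.in_title = False
        self.title = ""

    def handle_starttag(self, tag, attrs):
        self.in_title = (tag == "title")

    def handle_endtag(self, tag):
        if tag == "title":
            self.in_title = False

    def handle_data(self, data):
        if self.in_title:
            self.title += data

parser = TitleGrabber()
parser.feed("<html><head><title>hello</title></head></html>")
print(parser.title)  # hello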

bs4/builder/_lxml.py

@@ -1,5 +1,7 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+import six
__all__ = [
'LXMLTreeBuilderForXML',
'LXMLTreeBuilder',
@@ -101,12 +103,12 @@ class LXMLTreeBuilderForXML(TreeBuilder):
else:
self.processing_instruction_class = XMLProcessingInstruction
-if isinstance(markup, unicode):
+if isinstance(markup, six.text_type):
# We were given Unicode. Maybe lxml can parse Unicode on
# this system?
yield markup, None, document_declared_encoding, False
-if isinstance(markup, unicode):
+if isinstance(markup, six.text_type):
# No, apparently not. Convert the Unicode to UTF-8 and
# tell lxml to parse it as UTF-8.
yield (markup.encode("utf8"), "utf8",
@@ -121,7 +123,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
def feed(self, markup):
if isinstance(markup, bytes):
markup = BytesIO(markup)
-elif isinstance(markup, unicode):
+elif isinstance(markup, six.text_type):
markup = StringIO(markup)
# Call feed() at least once, even if the markup is empty,
@@ -136,7 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder):
if len(data) != 0:
self.parser.feed(data)
self.parser.close()
-except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
def close(self):
@@ -249,7 +251,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
self.parser = self.parser_for(encoding)
self.parser.feed(markup)
self.parser.close()
-except (UnicodeDecodeError, LookupError, etree.ParserError), e:
+except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
raise ParserRejectedMarkup(str(e))
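Note (not part of the diff): a sketch of the bytes/text dispatch that feed() performs above, with six.text_type covering unicode on Python 2 and str on Python 3; wrap_markup is a hypothetical helper, not part of bs4.

import six
from io import BytesIO, StringIO

def wrap_markup(markup):
    """Return a file-like object appropriate to the markup's type."""
    if isinstance(markup, bytes):
        return BytesIO(markup)
    elif isinstance(markup, six.text_type):
        return StringIO(markup)
    raise TypeError("expected bytes or text, got %r" % type(markup))

print(wrap_markup(b"<a></a>").read())  # b'<a></a>'
print(wrap_markup(u"<a></a>").read())  # <a></a>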

bs4/dammit.py

@@ -8,10 +8,13 @@ XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+from six import unichr
+import six
__license__ = "MIT"
import codecs
-from htmlentitydefs import codepoint2name
+from six.moves.html_entities import codepoint2name
import re
import logging
import string
@@ -274,7 +277,7 @@ class EncodingDetector:
def strip_byte_order_mark(cls, data):
"""If a byte-order mark is present, strip it and return the encoding it implies."""
encoding = None
-if isinstance(data, unicode):
+if isinstance(data, six.text_type):
# Unicode data cannot have a byte-order mark.
return data, encoding
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
@@ -352,9 +355,9 @@ class UnicodeDammit:
markup, override_encodings, is_html, exclude_encodings)
# Short-circuit if the data is in Unicode to begin with.
-if isinstance(markup, unicode) or markup == '':
+if isinstance(markup, six.text_type) or markup == '':
self.markup = markup
-self.unicode_markup = unicode(markup)
+self.unicode_markup = six.text_type(markup)
self.original_encoding = None
return
@@ -438,7 +441,7 @@ class UnicodeDammit:
def _to_unicode(self, data, encoding, errors="strict"):
'''Given a string and its encoding, decodes the string into Unicode.
%encoding is a string recognized by encodings.aliases'''
-return unicode(data, encoding, errors)
+return six.text_type(data, encoding, errors)
@property
def declared_html_encoding(self):
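Note (not part of the diff): a short sketch of the renamed entity helpers, assuming only that six is installed. six.moves.html_entities maps to Python 2's htmlentitydefs and Python 3's html.entities, and six.unichr stands in for the unichr() builtin that Python 3 dropped.

from six import unichr
from six.moves.html_entities import codepoint2name

# Look up the named entity for a code point, the same table dammit.py uses
# to build its substitution maps.
print(codepoint2name[0xe9])  # eacute
print(unichr(0xe9))          # é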

bs4/diagnose.py

@@ -2,11 +2,15 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+from __future__ import print_function
+from six.moves import map
+from six.moves import range
__license__ = "MIT"
import cProfile
from StringIO import StringIO
-from HTMLParser import HTMLParser
+from six.moves.html_parser import HTMLParser
import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
@@ -22,8 +26,8 @@ import cProfile
def diagnose(data):
"""Diagnostic suite for isolating common problems."""
-print "Diagnostic running on Beautiful Soup %s" % __version__
+print("Diagnostic running on Beautiful Soup %s" % __version__)
-print "Python version %s" % sys.version
+print("Python version %s" % sys.version)
basic_parsers = ["html.parser", "html5lib", "lxml"]
for name in basic_parsers:
@@ -32,16 +36,16 @@ def diagnose(data):
break
else:
basic_parsers.remove(name)
-print (
+print((
"I noticed that %s is not installed. Installing it may help." %
-name)
+name))
if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
try:
from lxml import etree
-print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
+print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
-except ImportError, e:
+except ImportError as e:
print (
"lxml is not installed or couldn't be imported.")
@@ -49,37 +53,37 @@ def diagnose(data):
if 'html5lib' in basic_parsers:
try:
import html5lib
-print "Found html5lib version %s" % html5lib.__version__
+print("Found html5lib version %s" % html5lib.__version__)
-except ImportError, e:
+except ImportError as e:
print (
"html5lib is not installed or couldn't be imported.")
if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
-print '"%s" looks like a filename. Reading data from the file.' % data
+print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
-print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
+print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
-print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
+print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
-print
+print()
for parser in basic_parsers:
-print "Trying to parse your markup with %s" % parser
+print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
success = True
-except Exception, e:
+except Exception as e:
-print "%s could not parse the markup." % parser
+print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
-print "Here's what %s did with the markup:" % parser
+print("Here's what %s did with the markup:" % parser)
-print soup.prettify()
+print(soup.prettify())
-print "-" * 80
+print("-" * 80)
def lxml_trace(data, html=True, **kwargs):
"""Print out the lxml events that occur during parsing.
@@ -89,7 +93,7 @@ def lxml_trace(data, html=True, **kwargs):
"""
from lxml import etree
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
-print("%s, %4s, %s" % (event, element.tag, element.text))
+print(("%s, %4s, %s" % (event, element.tag, element.text)))
class AnnouncingParser(HTMLParser):
"""Announces HTMLParser parse events, without doing anything else."""
@@ -171,9 +175,9 @@ def rdoc(num_elements=1000):
def benchmark_parsers(num_elements=100000):
"""Very basic head-to-head performance benchmark."""
-print "Comparative parser benchmark on Beautiful Soup %s" % __version__
+print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
data = rdoc(num_elements)
-print "Generated a large invalid HTML document (%d bytes)." % len(data)
+print("Generated a large invalid HTML document (%d bytes)." % len(data))
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
success = False
@@ -182,24 +186,24 @@ def benchmark_parsers(num_elements=100000):
soup = BeautifulSoup(data, parser)
b = time.time()
success = True
-except Exception, e:
+except Exception as e:
-print "%s could not parse the markup." % parser
+print("%s could not parse the markup." % parser)
traceback.print_exc()
if success:
-print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
+print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
from lxml import etree
a = time.time()
etree.HTML(data)
b = time.time()
-print "Raw lxml parsed the markup in %.2fs." % (b-a)
+print("Raw lxml parsed the markup in %.2fs." % (b-a))
import html5lib
parser = html5lib.HTMLParser()
a = time.time()
parser.parse(data)
b = time.time()
-print "Raw html5lib parsed the markup in %.2fs." % (b-a)
+print("Raw html5lib parsed the markup in %.2fs." % (b-a))
def profile(num_elements=100000, parser="lxml"):
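Note (not part of the diff): a minimal sketch of what the print_function future import buys; the version string below is illustrative only. With it, print is an ordinary function on Python 2 as well, so the rewritten calls in diagnose() behave identically on both interpreters.

from __future__ import print_function
import sys

version = "4.6.0"
print("Diagnostic running on Beautiful Soup %s" % version)
print("Python version %s" % sys.version)
print()  # a bare print() emits an empty line, replacing the old bare "print" statement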

bs4/element.py

@@ -1,5 +1,8 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+from __future__ import print_function
+import six
__license__ = "MIT"
import collections
@@ -26,22 +29,22 @@ def _alias(attr):
return alias
-class NamespacedAttribute(unicode):
+class NamespacedAttribute(six.text_type):
def __new__(cls, prefix, name, namespace=None):
if name is None:
-obj = unicode.__new__(cls, prefix)
+obj = six.text_type.__new__(cls, prefix)
elif prefix is None:
# Not really namespaced.
-obj = unicode.__new__(cls, name)
+obj = six.text_type.__new__(cls, name)
else:
-obj = unicode.__new__(cls, prefix + ":" + name)
+obj = six.text_type.__new__(cls, prefix + ":" + name)
obj.prefix = prefix
obj.name = name
obj.namespace = namespace
return obj
-class AttributeValueWithCharsetSubstitution(unicode):
+class AttributeValueWithCharsetSubstitution(six.text_type):
"""A stand-in object for a character encoding specified in HTML."""
class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
@@ -52,7 +55,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
"""
def __new__(cls, original_value):
-obj = unicode.__new__(cls, original_value)
+obj = six.text_type.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -75,9 +78,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
match = cls.CHARSET_RE.search(original_value)
if match is None:
# No substitution necessary.
-return unicode.__new__(unicode, original_value)
+return six.text_type.__new__(six.text_type, original_value)
-obj = unicode.__new__(cls, original_value)
+obj = six.text_type.__new__(cls, original_value)
obj.original_value = original_value
return obj
@@ -312,7 +315,7 @@ class PageElement(object):
raise ValueError("Cannot insert None into a tag.")
if new_child is self:
raise ValueError("Cannot insert a tag into itself.")
-if (isinstance(new_child, basestring)
+if (isinstance(new_child, six.string_types)
and not isinstance(new_child, NavigableString)):
new_child = NavigableString(new_child)
@@ -533,7 +536,7 @@ class PageElement(object):
result = (element for element in generator
if isinstance(element, Tag))
return ResultSet(strainer, result)
-elif isinstance(name, basestring):
+elif isinstance(name, six.string_types):
# Optimization to find all tags with a given name.
if name.count(':') == 1:
# This is a name with a prefix.
@@ -691,7 +694,7 @@ class PageElement(object):
return self.parents
-class NavigableString(unicode, PageElement):
+class NavigableString(six.text_type, PageElement):
PREFIX = ''
SUFFIX = ''
@@ -709,10 +712,10 @@ class NavigableString(unicode, PageElement):
passed in to the superclass's __new__ or the superclass won't know
how to handle non-ASCII characters.
"""
-if isinstance(value, unicode):
+if isinstance(value, six.text_type):
-u = unicode.__new__(cls, value)
+u = six.text_type.__new__(cls, value)
else:
-u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
+u = six.text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING)
u.setup()
return u
@@ -723,7 +726,7 @@ class NavigableString(unicode, PageElement):
return type(self)(self)
def __getnewargs__(self):
-return (unicode(self),)
+return (six.text_type(self),)
def __getattr__(self, attr):
"""text.string gives you text. This is for backwards
@@ -1142,8 +1145,8 @@ class Tag(PageElement):
else:
if isinstance(val, list) or isinstance(val, tuple):
val = ' '.join(val)
-elif not isinstance(val, basestring):
+elif not isinstance(val, six.string_types):
-val = unicode(val)
+val = six.text_type(val)
elif (
isinstance(val, AttributeValueWithCharsetSubstitution)
and eventual_encoding is not None):
@@ -1151,7 +1154,7 @@ class Tag(PageElement):
text = self.format_string(val, formatter)
decoded = (
-unicode(key) + '='
+six.text_type(key) + '='
+ EntitySubstitution.quoted_attribute_value(text))
attrs.append(decoded)
close = ''
@@ -1368,7 +1371,7 @@ class Tag(PageElement):
'Final combinator "%s" is missing an argument.' % tokens[-1])
if self._select_debug:
-print 'Running CSS selector "%s"' % selector
+print('Running CSS selector "%s"' % selector)
for index, token in enumerate(tokens):
new_context = []
@@ -1377,11 +1380,11 @@ class Tag(PageElement):
if tokens[index-1] in self._selector_combinators:
# This token was consumed by the previous combinator. Skip it.
if self._select_debug:
-print ' Token was consumed by the previous combinator.'
+print(' Token was consumed by the previous combinator.')
continue
if self._select_debug:
-print ' Considering token "%s"' % token
+print(' Considering token "%s"' % token)
recursive_candidate_generator = None
tag_name = None
@@ -1488,14 +1491,14 @@ class Tag(PageElement):
next_token = tokens[index+1]
def recursive_select(tag):
if self._select_debug:
-print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)
+print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs))
-print '-' * 40
+print('-' * 40)
for i in tag.select(next_token, recursive_candidate_generator):
if self._select_debug:
-print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)
+print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs))
yield i
if self._select_debug:
-print '-' * 40
+print('-' * 40)
_use_candidate_generator = recursive_select
elif _candidate_generator is None:
# By default, a tag's candidates are all of its
@@ -1506,7 +1509,7 @@ class Tag(PageElement):
check = "[any]"
else:
check = tag_name
-print ' Default candidate generator, tag name="%s"' % check
+print(' Default candidate generator, tag name="%s"' % check)
if self._select_debug:
# This is redundant with later code, but it stops
# a bunch of bogus tags from cluttering up the
@@ -1527,8 +1530,8 @@ class Tag(PageElement):
count = 0
for tag in current_context:
if self._select_debug:
-print " Running candidate generator on %s %s" % (
+print(" Running candidate generator on %s %s" % (
-tag.name, repr(tag.attrs))
+tag.name, repr(tag.attrs)))
for candidate in _use_candidate_generator(tag):
if not isinstance(candidate, Tag):
continue
@@ -1543,23 +1546,23 @@ class Tag(PageElement):
break
if checker is None or result:
if self._select_debug:
-print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs))
+print(" SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)))
if id(candidate) not in new_context_ids:
# If a tag matches a selector more than once,
# don't include it in the context more than once.
new_context.append(candidate)
new_context_ids.add(id(candidate))
elif self._select_debug:
-print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs))
+print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs)))
current_context = new_context
if limit and len(current_context) >= limit:
current_context = current_context[:limit]
if self._select_debug:
-print "Final verdict:"
+print("Final verdict:")
for i in current_context:
-print " %s %s" % (i.name, i.attrs)
+print(" %s %s" % (i.name, i.attrs))
return current_context
# Old names for backwards compatibility
@@ -1612,7 +1615,7 @@ class SoupStrainer(object):
def _normalize_search_value(self, value):
# Leave it alone if it's a Unicode string, a callable, a
# regular expression, a boolean, or None.
-if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match')
+if (isinstance(value, six.text_type) or callable(value) or hasattr(value, 'match')
or isinstance(value, bool) or value is None):
return value
@@ -1625,7 +1628,7 @@ class SoupStrainer(object):
new_value = []
for v in value:
if (hasattr(v, '__iter__') and not isinstance(v, bytes)
-and not isinstance(v, unicode)):
+and not isinstance(v, six.text_type)):
# This is almost certainly the user's mistake. In the
# interests of avoiding infinite loops, we'll let
# it through as-is rather than doing a recursive call.
@@ -1637,7 +1640,7 @@ class SoupStrainer(object):
# Otherwise, convert it into a Unicode string.
# The unicode(str()) thing is so this will do the same thing on Python 2
# and Python 3.
-return unicode(str(value))
+return six.text_type(str(value))
def __str__(self):
if self.text:
@@ -1691,7 +1694,7 @@ class SoupStrainer(object):
found = None
# If given a list of items, scan it for a text element that
# matches.
-if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)):
+if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, six.string_types)):
for element in markup:
if isinstance(element, NavigableString) \
and self.search(element):
@@ -1704,7 +1707,7 @@ class SoupStrainer(object):
found = self.search_tag(markup)
# If it's text, make sure the text matches.
elif isinstance(markup, NavigableString) or \
-isinstance(markup, basestring):
+isinstance(markup, six.string_types):
if not self.name and not self.attrs and self._matches(markup, self.text):
found = markup
else:
@@ -1749,7 +1752,7 @@ class SoupStrainer(object):
return not match_against
if (hasattr(match_against, '__iter__')
-and not isinstance(match_against, basestring)):
+and not isinstance(match_against, six.string_types)):
# We're asked to match against an iterable of items.
# The markup must be match at least one item in the
# iterable. We'll try each one in turn.
@@ -1776,7 +1779,7 @@ class SoupStrainer(object):
# the tag's name and once against its prefixed name.
match = False
-if not match and isinstance(match_against, unicode):
+if not match and isinstance(match_against, six.text_type):
# Exact string match
match = markup == match_against
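Note (not part of the diff): a sketch of the text-subclass pattern that NamespacedAttribute and NavigableString rely on above; PrefixedName is a hypothetical stand-in, not a bs4 class. six.text_type is unicode on Python 2 and str on Python 3, so one class definition serves both.

import six

class PrefixedName(six.text_type):
    """A hypothetical stand-in for NamespacedAttribute's pattern."""
    def __new__(cls, prefix, name):
        if prefix is None:
            obj = six.text_type.__new__(cls, name)
        else:
            obj = six.text_type.__new__(cls, prefix + ":" + name)
        obj.prefix = prefix
        obj.name = name
        return obj

attr = PrefixedName("xsi", "schemaLocation")
print(attr)                             # xsi:schemaLocation
print(isinstance(attr, six.text_type))  # True, it is still an ordinary text string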

bs4/testing.py

@@ -2,6 +2,8 @@
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
+from __future__ import absolute_import
+import six
__license__ = "MIT"
import pickle
@@ -645,7 +647,7 @@ class XMLTreeBuilderSmokeTest(object):
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
soup = self.soup(markup)
self.assertEqual(
-unicode(soup.rss), markup)
+six.text_type(soup.rss), markup)
def test_docstring_includes_correct_encoding(self):
soup = self.soup("<root/>")
@@ -676,17 +678,17 @@ class XMLTreeBuilderSmokeTest(object):
def test_closing_namespaced_tag(self):
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.p), markup)
+self.assertEqual(six.text_type(soup.p), markup)
def test_namespaced_attributes(self):
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.foo), markup)
+self.assertEqual(six.text_type(soup.foo), markup)
def test_namespaced_attributes_xml_namespace(self):
markup = '<foo xml:lang="fr">bar</foo>'
soup = self.soup(markup)
-self.assertEqual(unicode(soup.foo), markup)
+self.assertEqual(six.text_type(soup.foo), markup)
def test_find_by_prefixed_name(self):
doc = """<?xml version="1.0" encoding="utf-8"?>

bs4/tests/test_builder_registry.py

@@ -1,5 +1,6 @@
"""Tests of the builder registry."""
+from __future__ import absolute_import
import unittest
import warnings

bs4/tests/__init__.py

@@ -2,6 +2,7 @@
# pylint: disable-msg=E0611,W0142
+from __future__ import absolute_import
__metaclass__ = type
__all__ = [
'additional_tests',

bs4/tests/test_html5lib.py

@@ -1,11 +1,12 @@
"""Tests to ensure that the html5lib tree builder generates good trees."""
+from __future__ import absolute_import
import warnings
try:
from bs4.builder import HTML5TreeBuilder
HTML5LIB_PRESENT = True
-except ImportError, e:
+except ImportError as e:
HTML5LIB_PRESENT = False
from bs4.element import SoupStrainer
from bs4.testing import (

bs4/tests/test_htmlparser.py

@@ -1,6 +1,7 @@
"""Tests to ensure that the html.parser tree builder generates good
trees."""
+from __future__ import absolute_import
from pdb import set_trace
import pickle
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest

bs4/tests/test_lxml.py

@@ -1,13 +1,15 @@
"""Tests to ensure that the lxml tree builder generates good trees."""
+from __future__ import absolute_import
import re
import warnings
+import six
try:
import lxml.etree
LXML_PRESENT = True
LXML_VERSION = lxml.etree.LXML_VERSION
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
LXML_VERSION = (0,)
@@ -62,7 +64,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
# if one is installed.
with warnings.catch_warnings(record=True) as w:
soup = BeautifulStoneSoup("<b />")
-self.assertEqual(u"<b/>", unicode(soup.b))
+self.assertEqual(u"<b/>", six.text_type(soup.b))
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
@skipIf(

bs4/tests/test_soup.py

@@ -1,6 +1,7 @@
# -*- coding: utf-8 -*-
"""Tests of Beautiful Soup as a whole."""
+from __future__ import absolute_import
from pdb import set_trace
import logging
import unittest
@@ -28,11 +29,12 @@ from bs4.testing import (
skipIf,
)
import warnings
+import six
try:
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
LXML_PRESENT = True
-except ImportError, e:
+except ImportError as e:
LXML_PRESENT = False
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
@@ -250,7 +252,7 @@ class TestEncodingConversion(SoupTest):
ascii = b"<foo>a</foo>"
soup_from_ascii = self.soup(ascii)
unicode_output = soup_from_ascii.decode()
-self.assertTrue(isinstance(unicode_output, unicode))
+self.assertTrue(isinstance(unicode_output, six.text_type))
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
finally:

bs4/tests/test_tree.py

@@ -10,6 +10,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
methods tested here.
"""
+from __future__ import absolute_import
from pdb import set_trace
import copy
import pickle
@@ -34,6 +35,7 @@ from bs4.testing import (
SoupTest,
skipIf,
)
+import six
XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None)
LXML_PRESENT = (builder_registry.lookup("lxml") is not None)
@@ -1111,7 +1113,7 @@ class TestTreeModification(SoupTest):
<script>baz</script>
</html>""")
[soup.script.extract() for i in soup.find_all("script")]
-self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+self.assertEqual("<body>\n\n<a></a>\n</body>", six.text_type(soup.body))
def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
@@ -1349,7 +1351,7 @@ class TestPersistence(SoupTest):
soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
encoding = soup.original_encoding
copy = soup.__copy__()
-self.assertEqual(u"<p> </p>", unicode(copy))
+self.assertEqual(u"<p> </p>", six.text_type(copy))
self.assertEqual(encoding, copy.original_encoding)
def test_unicode_pickle(self):
@@ -1393,7 +1395,7 @@ class TestPersistence(SoupTest):
div_copy = copy.copy(div)
# The two tags look the same, and evaluate to equal.
-self.assertEqual(unicode(div), unicode(div_copy))
+self.assertEqual(six.text_type(div), six.text_type(div_copy))
self.assertEqual(div, div_copy)
# But they're not the same object.
@@ -1505,7 +1507,7 @@ class TestSubstitutions(SoupTest):
def test_prettify_outputs_unicode_by_default(self):
soup = self.soup("<a></a>")
-self.assertEqual(unicode, type(soup.prettify()))
+self.assertEqual(six.text_type, type(soup.prettify()))
def test_prettify_can_encode_data(self):
soup = self.soup("<a></a>")