diff --git a/libs/bs4/__init__.py b/libs/bs4/__init__.py index 7a80452f7..b7ea25e2e 100644 --- a/libs/bs4/__init__.py +++ b/libs/bs4/__init__.py @@ -20,6 +20,10 @@ http://www.crummy.com/software/BeautifulSoup/bs4/doc/ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +from __future__ import print_function +import six +from six.moves import range __author__ = "Leonard Richardson (leonardr@segfault.org)" __version__ = "4.6.0" __copyright__ = "Copyright (c) 2004-2017 Leonard Richardson" @@ -50,7 +54,7 @@ from .element import ( # The very first thing we do is give a useful error if someone is # running this code under Python 3 without converting it. -'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' class BeautifulSoup(Tag): """ @@ -142,18 +146,18 @@ class BeautifulSoup(Tag): from_encoding = from_encoding or deprecated_argument( "fromEncoding", "from_encoding") - if from_encoding and isinstance(markup, unicode): + if from_encoding and isinstance(markup, six.text_type): warnings.warn("You provided Unicode markup but also provided a value for from_encoding. 
Your from_encoding will be ignored.") from_encoding = None if len(kwargs) > 0: - arg = kwargs.keys().pop() + arg = list(kwargs.keys()).pop() raise TypeError( "__init__() got an unexpected keyword argument '%s'" % arg) if builder is None: original_features = features - if isinstance(features, basestring): + if isinstance(features, six.string_types): features = [features] if features is None or len(features) == 0: features = self.DEFAULT_BUILDER_FEATURES @@ -191,13 +195,13 @@ class BeautifulSoup(Tag): markup = markup.read() elif len(markup) <= 256 and ( (isinstance(markup, bytes) and not b'<' in markup) - or (isinstance(markup, unicode) and not u'<' in markup) + or (isinstance(markup, six.text_type) and not u'<' in markup) ): # Print out warnings for a couple beginner problems # involving passing non-markup to Beautiful Soup. # Beautiful Soup will still parse the input as markup, # just in case that's what the user really wants. - if (isinstance(markup, unicode) + if (isinstance(markup, six.text_type) and not os.path.supports_unicode_filenames): possible_filename = markup.encode("utf8") else: @@ -205,13 +209,13 @@ class BeautifulSoup(Tag): is_file = False try: is_file = os.path.exists(possible_filename) - except Exception, e: + except Exception as e: # This is almost certainly a problem involving # characters not valid in filenames on this # system. Just let it go. pass if is_file: - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): markup = markup.encode("utf8") warnings.warn( '"%s" looks like a filename, not markup. 
You should' @@ -263,7 +267,7 @@ class BeautifulSoup(Tag): if isinstance(markup, bytes): space = b' ' cant_start_with = (b"http:", b"https:") - elif isinstance(markup, unicode): + elif isinstance(markup, six.text_type): space = u' ' cant_start_with = (u"http:", u"https:") else: @@ -526,4 +530,4 @@ class FeatureNotFound(ValueError): if __name__ == '__main__': import sys soup = BeautifulSoup(sys.stdin) - print soup.prettify() + print(soup.prettify()) diff --git a/libs/bs4/builder/__init__.py b/libs/bs4/builder/__init__.py index fdb3362fc..4b44ef05f 100644 --- a/libs/bs4/builder/__init__.py +++ b/libs/bs4/builder/__init__.py @@ -1,6 +1,7 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import from collections import defaultdict import itertools import sys @@ -10,6 +11,7 @@ from bs4.element import ( HTMLAwareEntitySubstitution, whitespace_re ) +import six __all__ = [ 'HTMLTreeBuilder', @@ -166,7 +168,7 @@ class TreeBuilder(object): # value is a whitespace-separated list of # values. Split it into a list. value = attrs[attr] - if isinstance(value, basestring): + if isinstance(value, six.string_types): values = whitespace_re.split(value) else: # html5lib sometimes calls setAttributes twice diff --git a/libs/bs4/builder/_html5lib.py b/libs/bs4/builder/_html5lib.py index 5f5489358..cf6063b83 100644 --- a/libs/bs4/builder/_html5lib.py +++ b/libs/bs4/builder/_html5lib.py @@ -1,6 +1,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
+from __future__ import absolute_import +import six __all__ = [ 'HTML5TreeBuilder', ] @@ -33,7 +35,7 @@ try: # Pre-0.99999999 from html5lib.treebuilders import _base as treebuilder_base new_html5lib = False -except ImportError, e: +except ImportError as e: # 0.99999999 and up from html5lib.treebuilders import base as treebuilder_base new_html5lib = True @@ -64,7 +66,7 @@ class HTML5TreeBuilder(HTMLTreeBuilder): parser = html5lib.HTMLParser(tree=self.create_treebuilder) extra_kwargs = dict() - if not isinstance(markup, unicode): + if not isinstance(markup, six.text_type): if new_html5lib: extra_kwargs['override_encoding'] = self.user_specified_encoding else: @@ -72,13 +74,13 @@ class HTML5TreeBuilder(HTMLTreeBuilder): doc = parser.parse(markup, **extra_kwargs) # Set the character encoding detected by the tokenizer. - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # We need to special-case this because html5lib sets # charEncoding to UTF-8 if it gets Unicode input. doc.original_encoding = None else: original_encoding = parser.tokenizer.stream.charEncoding[0] - if not isinstance(original_encoding, basestring): + if not isinstance(original_encoding, six.string_types): # In 0.99999999 and up, the encoding is an html5lib # Encoding object. We want to use a string for compatibility # with other tree builders. @@ -229,7 +231,7 @@ class Element(treebuilder_base.Node): def appendChild(self, node): string_child = child = None - if isinstance(node, basestring): + if isinstance(node, six.string_types): # Some other piece of code decided to pass in a string # instead of creating a TextElement object to contain the # string. 
@@ -246,7 +248,7 @@ class Element(treebuilder_base.Node): child = node.element node.parent = self - if not isinstance(child, basestring) and child.parent is not None: + if not isinstance(child, six.string_types) and child.parent is not None: node.element.extract() if (string_child and self.element.contents @@ -259,7 +261,7 @@ class Element(treebuilder_base.Node): old_element.replace_with(new_element) self.soup._most_recent_element = new_element else: - if isinstance(node, basestring): + if isinstance(node, six.string_types): # Create a brand new NavigableString from this string. child = self.soup.new_string(node) diff --git a/libs/bs4/builder/_htmlparser.py b/libs/bs4/builder/_htmlparser.py index 67890b3a3..624028e3c 100644 --- a/libs/bs4/builder/_htmlparser.py +++ b/libs/bs4/builder/_htmlparser.py @@ -3,15 +3,18 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +from six import unichr +import six __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import HTMLParser +from six.moves.html_parser import HTMLParser try: - from HTMLParser import HTMLParseError -except ImportError, e: + from six.moves.html_parser import HTMLParseError +except ImportError as e: # HTMLParseError is removed in Python 3.5. Since it can never be # thrown in 3.5, we can just define our own class as a placeholder. class HTMLParseError(Exception): @@ -131,7 +134,7 @@ class BeautifulSoupHTMLParser(HTMLParser): try: data = unichr(real_name) - except (ValueError, OverflowError), e: + except (ValueError, OverflowError) as e: data = u"\N{REPLACEMENT CHARACTER}" self.handle_data(data) @@ -196,7 +199,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER). 
""" - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): yield (markup, None, None, False) return @@ -213,7 +216,7 @@ class HTMLParserTreeBuilder(HTMLTreeBuilder): parser.soup = self.soup try: parser.feed(markup) - except HTMLParseError, e: + except HTMLParseError as e: warnings.warn(RuntimeWarning( "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help.")) raise e diff --git a/libs/bs4/builder/_lxml.py b/libs/bs4/builder/_lxml.py index d2ca2872d..73f6e2b34 100644 --- a/libs/bs4/builder/_lxml.py +++ b/libs/bs4/builder/_lxml.py @@ -1,5 +1,7 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +import six __all__ = [ 'LXMLTreeBuilderForXML', 'LXMLTreeBuilder', @@ -101,12 +103,12 @@ class LXMLTreeBuilderForXML(TreeBuilder): else: self.processing_instruction_class = XMLProcessingInstruction - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # We were given Unicode. Maybe lxml can parse Unicode on # this system? yield markup, None, document_declared_encoding, False - if isinstance(markup, unicode): + if isinstance(markup, six.text_type): # No, apparently not. Convert the Unicode to UTF-8 and # tell lxml to parse it as UTF-8. 
yield (markup.encode("utf8"), "utf8", @@ -121,7 +123,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): def feed(self, markup): if isinstance(markup, bytes): markup = BytesIO(markup) - elif isinstance(markup, unicode): + elif isinstance(markup, six.text_type): markup = StringIO(markup) # Call feed() at least once, even if the markup is empty, @@ -136,7 +138,7 @@ class LXMLTreeBuilderForXML(TreeBuilder): if len(data) != 0: self.parser.feed(data) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) def close(self): @@ -249,7 +251,7 @@ class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): self.parser = self.parser_for(encoding) self.parser.feed(markup) self.parser.close() - except (UnicodeDecodeError, LookupError, etree.ParserError), e: + except (UnicodeDecodeError, LookupError, etree.ParserError) as e: raise ParserRejectedMarkup(str(e)) diff --git a/libs/bs4/dammit.py b/libs/bs4/dammit.py index 7965565f5..fe10691d0 100644 --- a/libs/bs4/dammit.py +++ b/libs/bs4/dammit.py @@ -8,10 +8,13 @@ XML or HTML to reflect a new encoding; that's the tree builder's job. """ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +from six import unichr +import six __license__ = "MIT" import codecs -from htmlentitydefs import codepoint2name +from six.moves.html_entities import codepoint2name import re import logging import string @@ -274,7 +277,7 @@ class EncodingDetector: def strip_byte_order_mark(cls, data): """If a byte-order mark is present, strip it and return the encoding it implies.""" encoding = None - if isinstance(data, unicode): + if isinstance(data, six.text_type): # Unicode data cannot have a byte-order mark. 
return data, encoding if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ @@ -352,9 +355,9 @@ class UnicodeDammit: markup, override_encodings, is_html, exclude_encodings) # Short-circuit if the data is in Unicode to begin with. - if isinstance(markup, unicode) or markup == '': + if isinstance(markup, six.text_type) or markup == '': self.markup = markup - self.unicode_markup = unicode(markup) + self.unicode_markup = six.text_type(markup) self.original_encoding = None return @@ -438,7 +441,7 @@ class UnicodeDammit: def _to_unicode(self, data, encoding, errors="strict"): '''Given a string and its encoding, decodes the string into Unicode. %encoding is a string recognized by encodings.aliases''' - return unicode(data, encoding, errors) + return six.text_type(data, encoding, errors) @property def declared_html_encoding(self): diff --git a/libs/bs4/diagnose.py b/libs/bs4/diagnose.py index 8768332f5..8cc44833b 100644 --- a/libs/bs4/diagnose.py +++ b/libs/bs4/diagnose.py @@ -2,11 +2,15 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
+from __future__ import absolute_import +from __future__ import print_function +from six.moves import map +from six.moves import range __license__ = "MIT" import cProfile -from StringIO import StringIO +from six import StringIO -from HTMLParser import HTMLParser +from six.moves.html_parser import HTMLParser import bs4 from bs4 import BeautifulSoup, __version__ from bs4.builder import builder_registry @@ -22,8 +26,8 @@ import cProfile def diagnose(data): """Diagnostic suite for isolating common problems.""" - print "Diagnostic running on Beautiful Soup %s" % __version__ - print "Python version %s" % sys.version + print("Diagnostic running on Beautiful Soup %s" % __version__) + print("Python version %s" % sys.version) basic_parsers = ["html.parser", "html5lib", "lxml"] for name in basic_parsers: @@ -32,16 +36,16 @@ def diagnose(data): break else: basic_parsers.remove(name) - print ( + print(( "I noticed that %s is not installed. Installing it may help." % - name) + name)) if 'lxml' in basic_parsers: basic_parsers.append(["lxml", "xml"]) try: from lxml import etree - print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) - except ImportError, e: + print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))) + except ImportError as e: print ( "lxml is not installed or couldn't be imported.") @@ -49,37 +53,37 @@ if 'html5lib' in basic_parsers: try: import html5lib - print "Found html5lib version %s" % html5lib.__version__ - except ImportError, e: + print("Found html5lib version %s" % html5lib.__version__) + except ImportError as e: print ( "html5lib is not installed or couldn't be imported.") if hasattr(data, 'read'): data = data.read() elif os.path.exists(data): - print '"%s" looks like a filename. Reading data from the file.' % data + print('"%s" looks like a filename. Reading data from the file.' % data) with open(data) as fp: data = fp.read() elif data.startswith("http:") or data.startswith("https:"): - print '"%s" looks like a URL. 
Beautiful Soup is not an HTTP client.' % data - print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data) + print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.") return - print + print() for parser in basic_parsers: - print "Trying to parse your markup with %s" % parser + print("Trying to parse your markup with %s" % parser) success = False try: soup = BeautifulSoup(data, parser) success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "Here's what %s did with the markup:" % parser - print soup.prettify() + print("Here's what %s did with the markup:" % parser) + print(soup.prettify()) - print "-" * 80 + print("-" * 80) def lxml_trace(data, html=True, **kwargs): """Print out the lxml events that occur during parsing. @@ -89,7 +93,7 @@ def lxml_trace(data, html=True, **kwargs): """ from lxml import etree for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): - print("%s, %4s, %s" % (event, element.tag, element.text)) + print(("%s, %4s, %s" % (event, element.tag, element.text))) class AnnouncingParser(HTMLParser): """Announces HTMLParser parse events, without doing anything else.""" @@ -171,9 +175,9 @@ def rdoc(num_elements=1000): def benchmark_parsers(num_elements=100000): """Very basic head-to-head performance benchmark.""" - print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + print("Comparative parser benchmark on Beautiful Soup %s" % __version__) data = rdoc(num_elements) - print "Generated a large invalid HTML document (%d bytes)." % len(data) + print("Generated a large invalid HTML document (%d bytes)." 
% len(data)) for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: success = False @@ -182,24 +186,24 @@ def benchmark_parsers(num_elements=100000): soup = BeautifulSoup(data, parser) b = time.time() success = True - except Exception, e: - print "%s could not parse the markup." % parser + except Exception as e: + print("%s could not parse the markup." % parser) traceback.print_exc() if success: - print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + print("BS4+%s parsed the markup in %.2fs." % (parser, b-a)) from lxml import etree a = time.time() etree.HTML(data) b = time.time() - print "Raw lxml parsed the markup in %.2fs." % (b-a) + print("Raw lxml parsed the markup in %.2fs." % (b-a)) import html5lib parser = html5lib.HTMLParser() a = time.time() parser.parse(data) b = time.time() - print "Raw html5lib parsed the markup in %.2fs." % (b-a) + print("Raw html5lib parsed the markup in %.2fs." % (b-a)) def profile(num_elements=100000, parser="lxml"): diff --git a/libs/bs4/element.py b/libs/bs4/element.py index 9ef75f814..39f480371 100644 --- a/libs/bs4/element.py +++ b/libs/bs4/element.py @@ -1,5 +1,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +from __future__ import print_function +import six __license__ = "MIT" import collections @@ -26,22 +29,22 @@ def _alias(attr): return alias -class NamespacedAttribute(unicode): +class NamespacedAttribute(six.text_type): def __new__(cls, prefix, name, namespace=None): if name is None: - obj = unicode.__new__(cls, prefix) + obj = six.text_type.__new__(cls, prefix) elif prefix is None: # Not really namespaced. 
- obj = unicode.__new__(cls, name) + obj = six.text_type.__new__(cls, name) else: - obj = unicode.__new__(cls, prefix + ":" + name) + obj = six.text_type.__new__(cls, prefix + ":" + name) obj.prefix = prefix obj.name = name obj.namespace = namespace return obj -class AttributeValueWithCharsetSubstitution(unicode): +class AttributeValueWithCharsetSubstitution(six.text_type): """A stand-in object for a character encoding specified in HTML.""" class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): @@ -52,7 +55,7 @@ class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution): """ def __new__(cls, original_value): - obj = unicode.__new__(cls, original_value) + obj = six.text_type.__new__(cls, original_value) obj.original_value = original_value return obj @@ -75,9 +78,9 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): match = cls.CHARSET_RE.search(original_value) if match is None: # No substitution necessary. - return unicode.__new__(unicode, original_value) + return six.text_type.__new__(six.text_type, original_value) - obj = unicode.__new__(cls, original_value) + obj = six.text_type.__new__(cls, original_value) obj.original_value = original_value return obj @@ -312,7 +315,7 @@ class PageElement(object): raise ValueError("Cannot insert None into a tag.") if new_child is self: raise ValueError("Cannot insert a tag into itself.") - if (isinstance(new_child, basestring) + if (isinstance(new_child, six.string_types) and not isinstance(new_child, NavigableString)): new_child = NavigableString(new_child) @@ -533,7 +536,7 @@ class PageElement(object): result = (element for element in generator if isinstance(element, Tag)) return ResultSet(strainer, result) - elif isinstance(name, basestring): + elif isinstance(name, six.string_types): # Optimization to find all tags with a given name. if name.count(':') == 1: # This is a name with a prefix. 
@@ -691,7 +694,7 @@ class PageElement(object): return self.parents -class NavigableString(unicode, PageElement): +class NavigableString(six.text_type, PageElement): PREFIX = '' SUFFIX = '' @@ -709,10 +712,10 @@ class NavigableString(unicode, PageElement): passed in to the superclass's __new__ or the superclass won't know how to handle non-ASCII characters. """ - if isinstance(value, unicode): - u = unicode.__new__(cls, value) + if isinstance(value, six.text_type): + u = six.text_type.__new__(cls, value) else: - u = unicode.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) + u = six.text_type.__new__(cls, value, DEFAULT_OUTPUT_ENCODING) u.setup() return u @@ -723,7 +726,7 @@ class NavigableString(unicode, PageElement): return type(self)(self) def __getnewargs__(self): - return (unicode(self),) + return (six.text_type(self),) def __getattr__(self, attr): """text.string gives you text. This is for backwards @@ -1142,8 +1145,8 @@ class Tag(PageElement): else: if isinstance(val, list) or isinstance(val, tuple): val = ' '.join(val) - elif not isinstance(val, basestring): - val = unicode(val) + elif not isinstance(val, six.string_types): + val = six.text_type(val) elif ( isinstance(val, AttributeValueWithCharsetSubstitution) and eventual_encoding is not None): @@ -1151,7 +1154,7 @@ class Tag(PageElement): text = self.format_string(val, formatter) decoded = ( - unicode(key) + '=' + six.text_type(key) + '=' + EntitySubstitution.quoted_attribute_value(text)) attrs.append(decoded) close = '' @@ -1368,7 +1371,7 @@ class Tag(PageElement): 'Final combinator "%s" is missing an argument.' % tokens[-1]) if self._select_debug: - print 'Running CSS selector "%s"' % selector + print('Running CSS selector "%s"' % selector) for index, token in enumerate(tokens): new_context = [] @@ -1377,11 +1380,11 @@ class Tag(PageElement): if tokens[index-1] in self._selector_combinators: # This token was consumed by the previous combinator. Skip it. 
if self._select_debug: - print ' Token was consumed by the previous combinator.' + print(' Token was consumed by the previous combinator.') continue if self._select_debug: - print ' Considering token "%s"' % token + print(' Considering token "%s"' % token) recursive_candidate_generator = None tag_name = None @@ -1488,14 +1491,14 @@ class Tag(PageElement): next_token = tokens[index+1] def recursive_select(tag): if self._select_debug: - print ' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs) - print '-' * 40 + print(' Calling select("%s") recursively on %s %s' % (next_token, tag.name, tag.attrs)) + print('-' * 40) for i in tag.select(next_token, recursive_candidate_generator): if self._select_debug: - print '(Recursive select picked up candidate %s %s)' % (i.name, i.attrs) + print('(Recursive select picked up candidate %s %s)' % (i.name, i.attrs)) yield i if self._select_debug: - print '-' * 40 + print('-' * 40) _use_candidate_generator = recursive_select elif _candidate_generator is None: # By default, a tag's candidates are all of its @@ -1506,7 +1509,7 @@ class Tag(PageElement): check = "[any]" else: check = tag_name - print ' Default candidate generator, tag name="%s"' % check + print(' Default candidate generator, tag name="%s"' % check) if self._select_debug: # This is redundant with later code, but it stops # a bunch of bogus tags from cluttering up the @@ -1527,8 +1530,8 @@ class Tag(PageElement): count = 0 for tag in current_context: if self._select_debug: - print " Running candidate generator on %s %s" % ( - tag.name, repr(tag.attrs)) + print(" Running candidate generator on %s %s" % ( + tag.name, repr(tag.attrs))) for candidate in _use_candidate_generator(tag): if not isinstance(candidate, Tag): continue @@ -1543,23 +1546,23 @@ class Tag(PageElement): break if checker is None or result: if self._select_debug: - print " SUCCESS %s %s" % (candidate.name, repr(candidate.attrs)) + print(" SUCCESS %s %s" % (candidate.name, 
repr(candidate.attrs))) if id(candidate) not in new_context_ids: # If a tag matches a selector more than once, # don't include it in the context more than once. new_context.append(candidate) new_context_ids.add(id(candidate)) elif self._select_debug: - print " FAILURE %s %s" % (candidate.name, repr(candidate.attrs)) + print(" FAILURE %s %s" % (candidate.name, repr(candidate.attrs))) current_context = new_context if limit and len(current_context) >= limit: current_context = current_context[:limit] if self._select_debug: - print "Final verdict:" + print("Final verdict:") for i in current_context: - print " %s %s" % (i.name, i.attrs) + print(" %s %s" % (i.name, i.attrs)) return current_context # Old names for backwards compatibility @@ -1612,7 +1615,7 @@ class SoupStrainer(object): def _normalize_search_value(self, value): # Leave it alone if it's a Unicode string, a callable, a # regular expression, a boolean, or None. - if (isinstance(value, unicode) or callable(value) or hasattr(value, 'match') + if (isinstance(value, six.text_type) or callable(value) or hasattr(value, 'match') or isinstance(value, bool) or value is None): return value @@ -1625,7 +1628,7 @@ class SoupStrainer(object): new_value = [] for v in value: if (hasattr(v, '__iter__') and not isinstance(v, bytes) - and not isinstance(v, unicode)): + and not isinstance(v, six.text_type)): # This is almost certainly the user's mistake. In the # interests of avoiding infinite loops, we'll let # it through as-is rather than doing a recursive call. @@ -1637,7 +1640,7 @@ class SoupStrainer(object): # Otherwise, convert it into a Unicode string. # The unicode(str()) thing is so this will do the same thing on Python 2 # and Python 3. - return unicode(str(value)) + return six.text_type(str(value)) def __str__(self): if self.text: @@ -1691,7 +1694,7 @@ class SoupStrainer(object): found = None # If given a list of items, scan it for a text element that # matches. 
- if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, basestring)): + if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, six.string_types)): for element in markup: if isinstance(element, NavigableString) \ and self.search(element): @@ -1704,7 +1707,7 @@ class SoupStrainer(object): found = self.search_tag(markup) # If it's text, make sure the text matches. elif isinstance(markup, NavigableString) or \ - isinstance(markup, basestring): + isinstance(markup, six.string_types): if not self.name and not self.attrs and self._matches(markup, self.text): found = markup else: @@ -1749,7 +1752,7 @@ class SoupStrainer(object): return not match_against if (hasattr(match_against, '__iter__') - and not isinstance(match_against, basestring)): + and not isinstance(match_against, six.string_types)): # We're asked to match against an iterable of items. # The markup must be match at least one item in the # iterable. We'll try each one in turn. @@ -1776,7 +1779,7 @@ class SoupStrainer(object): # the tag's name and once against its prefixed name. match = False - if not match and isinstance(match_against, unicode): + if not match and isinstance(match_against, six.text_type): # Exact string match match = markup == match_against diff --git a/libs/bs4/testing.py b/libs/bs4/testing.py index 6ba2506c4..489b1b25a 100644 --- a/libs/bs4/testing.py +++ b/libs/bs4/testing.py @@ -2,6 +2,8 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +from __future__ import absolute_import +import six __license__ = "MIT" import pickle @@ -645,7 +647,7 @@ class XMLTreeBuilderSmokeTest(object): markup = 'b2012-07-02T20:33:42Zcd' soup = self.soup(markup) self.assertEqual( - unicode(soup.rss), markup) + six.text_type(soup.rss), markup) def test_docstring_includes_correct_encoding(self): soup = self.soup("") @@ -676,17 +678,17 @@ class XMLTreeBuilderSmokeTest(object): def test_closing_namespaced_tag(self): markup = '

20010504

' soup = self.soup(markup) - self.assertEqual(unicode(soup.p), markup) + self.assertEqual(six.text_type(soup.p), markup) def test_namespaced_attributes(self): markup = '' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(six.text_type(soup.foo), markup) def test_namespaced_attributes_xml_namespace(self): markup = 'bar' soup = self.soup(markup) - self.assertEqual(unicode(soup.foo), markup) + self.assertEqual(six.text_type(soup.foo), markup) def test_find_by_prefixed_name(self): doc = """ diff --git a/libs/bs4/tests/test_builder_registry.py b/libs/bs4/tests/test_builder_registry.py index 90cad8293..d20678285 100644 --- a/libs/bs4/tests/test_builder_registry.py +++ b/libs/bs4/tests/test_builder_registry.py @@ -1,5 +1,6 @@ """Tests of the builder registry.""" +from __future__ import absolute_import import unittest import warnings diff --git a/libs/bs4/tests/test_docs.py b/libs/bs4/tests/test_docs.py index 5b9f67709..01eb94ef4 100644 --- a/libs/bs4/tests/test_docs.py +++ b/libs/bs4/tests/test_docs.py @@ -2,6 +2,7 @@ # pylint: disable-msg=E0611,W0142 +from __future__ import absolute_import __metaclass__ = type __all__ = [ 'additional_tests', diff --git a/libs/bs4/tests/test_html5lib.py b/libs/bs4/tests/test_html5lib.py index 0f89d6244..b9e54c9a0 100644 --- a/libs/bs4/tests/test_html5lib.py +++ b/libs/bs4/tests/test_html5lib.py @@ -1,11 +1,12 @@ """Tests to ensure that the html5lib tree builder generates good trees.""" +from __future__ import absolute_import import warnings try: from bs4.builder import HTML5TreeBuilder HTML5LIB_PRESENT = True -except ImportError, e: +except ImportError as e: HTML5LIB_PRESENT = False from bs4.element import SoupStrainer from bs4.testing import ( diff --git a/libs/bs4/tests/test_htmlparser.py b/libs/bs4/tests/test_htmlparser.py index d5cf0253f..94d9ca456 100644 --- a/libs/bs4/tests/test_htmlparser.py +++ b/libs/bs4/tests/test_htmlparser.py @@ -1,6 +1,7 @@ """Tests to ensure that the html.parser tree 
builder generates good trees.""" +from __future__ import absolute_import from pdb import set_trace import pickle from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest diff --git a/libs/bs4/tests/test_lxml.py b/libs/bs4/tests/test_lxml.py index a05870b91..b579b1938 100644 --- a/libs/bs4/tests/test_lxml.py +++ b/libs/bs4/tests/test_lxml.py @@ -1,13 +1,15 @@ """Tests to ensure that the lxml tree builder generates good trees.""" +from __future__ import absolute_import import re import warnings +import six try: import lxml.etree LXML_PRESENT = True LXML_VERSION = lxml.etree.LXML_VERSION -except ImportError, e: +except ImportError as e: LXML_PRESENT = False LXML_VERSION = (0,) @@ -62,7 +64,7 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest): # if one is installed. with warnings.catch_warnings(record=True) as w: soup = BeautifulStoneSoup("") - self.assertEqual(u"", unicode(soup.b)) + self.assertEqual(u"", six.text_type(soup.b)) self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message)) @skipIf( diff --git a/libs/bs4/tests/test_soup.py b/libs/bs4/tests/test_soup.py index f3e69edf3..047d9c42e 100644 --- a/libs/bs4/tests/test_soup.py +++ b/libs/bs4/tests/test_soup.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """Tests of Beautiful Soup as a whole.""" +from __future__ import absolute_import from pdb import set_trace import logging import unittest @@ -28,11 +29,12 @@ from bs4.testing import ( skipIf, ) import warnings +import six try: from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML LXML_PRESENT = True -except ImportError, e: +except ImportError as e: LXML_PRESENT = False PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2)) @@ -250,7 +252,7 @@ class TestEncodingConversion(SoupTest): ascii = b"a" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() - self.assertTrue(isinstance(unicode_output, unicode)) + self.assertTrue(isinstance(unicode_output, six.text_type)) 
self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: diff --git a/libs/bs4/tests/test_tree.py b/libs/bs4/tests/test_tree.py index c0e7c4080..f4bce3e14 100644 --- a/libs/bs4/tests/test_tree.py +++ b/libs/bs4/tests/test_tree.py @@ -10,6 +10,7 @@ same markup, but all Beautiful Soup trees can be traversed with the methods tested here. """ +from __future__ import absolute_import from pdb import set_trace import copy import pickle @@ -34,6 +35,7 @@ from bs4.testing import ( SoupTest, skipIf, ) +import six XML_BUILDER_PRESENT = (builder_registry.lookup("xml") is not None) LXML_PRESENT = (builder_registry.lookup("lxml") is not None) @@ -1111,7 +1113,7 @@ class TestTreeModification(SoupTest): """) [soup.script.extract() for i in soup.find_all("script")] - self.assertEqual("\n\n\n", unicode(soup.body)) + self.assertEqual("\n\n\n", six.text_type(soup.body)) def test_extract_works_when_element_is_surrounded_by_identical_strings(self): @@ -1349,7 +1351,7 @@ class TestPersistence(SoupTest): soup = BeautifulSoup(b'

 

', 'html.parser') encoding = soup.original_encoding copy = soup.__copy__() - self.assertEqual(u"

 

", unicode(copy)) + self.assertEqual(u"

 

", six.text_type(copy)) self.assertEqual(encoding, copy.original_encoding) def test_unicode_pickle(self): @@ -1393,7 +1395,7 @@ class TestPersistence(SoupTest): div_copy = copy.copy(div) # The two tags look the same, and evaluate to equal. - self.assertEqual(unicode(div), unicode(div_copy)) + self.assertEqual(six.text_type(div), six.text_type(div_copy)) self.assertEqual(div, div_copy) # But they're not the same object. @@ -1505,7 +1507,7 @@ class TestSubstitutions(SoupTest): def test_prettify_outputs_unicode_by_default(self): soup = self.soup("") - self.assertEqual(unicode, type(soup.prettify())) + self.assertEqual(six.text_type, type(soup.prettify())) def test_prettify_can_encode_data(self): soup = self.soup("")