2022-01-24 04:07:52 +00:00
""" Beautiful Soup Elixir and Tonic - " The Screen-Scraper ' s Friend " .
2020-01-30 01:07:26 +00:00
http : / / www . crummy . com / software / BeautifulSoup /
Beautiful Soup uses a pluggable XML or HTML parser to parse a
( possibly invalid ) document into a tree representation . Beautiful Soup
provides methods and Pythonic idioms that make it easy to navigate ,
search , and modify the parse tree .
2022-01-24 04:07:52 +00:00
Beautiful Soup works with Python 3.5 and up . It works better if lxml
2020-01-30 01:07:26 +00:00
and / or html5lib is installed .
For more than you ever wanted to know about Beautiful Soup , see the
2022-01-24 04:07:52 +00:00
documentation : http : / / www . crummy . com / software / BeautifulSoup / bs4 / doc /
2020-01-30 01:07:26 +00:00
"""
__author__ = " Leonard Richardson (leonardr@segfault.org) "
2022-11-07 18:06:49 +00:00
__version__ = " 4.11.1 "
__copyright__ = " Copyright (c) 2004-2022 Leonard Richardson "
2020-01-30 01:07:26 +00:00
# Use of this source code is governed by the MIT license.
__license__ = " MIT "
__all__ = [ ' BeautifulSoup ' ]
2022-01-24 04:07:52 +00:00
from collections import Counter
2020-01-30 01:07:26 +00:00
import os
import re
import sys
import traceback
import warnings
2022-01-24 04:07:52 +00:00
# Refuse to load under Python 2. This check runs before the package-relative
# imports below, so the user sees this explicit message first. The message
# itself names the last release that supported Python 2.
if sys.version_info.major < 3:
    raise ImportError(' You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3. ')
2022-11-07 18:06:49 +00:00
from . builder import (
builder_registry ,
ParserRejectedMarkup ,
XMLParsedAsHTMLWarning ,
)
2020-01-30 01:07:26 +00:00
from . dammit import UnicodeDammit
from . element import (
CData ,
Comment ,
DEFAULT_OUTPUT_ENCODING ,
Declaration ,
Doctype ,
NavigableString ,
PageElement ,
ProcessingInstruction ,
2022-01-24 04:07:52 +00:00
PYTHON_SPECIFIC_ENCODINGS ,
2020-01-30 01:07:26 +00:00
ResultSet ,
2022-01-24 04:07:52 +00:00
Script ,
Stylesheet ,
2020-01-30 01:07:26 +00:00
SoupStrainer ,
Tag ,
2022-01-24 04:07:52 +00:00
TemplateString ,
2020-01-30 01:07:26 +00:00
)
2022-01-24 04:07:52 +00:00
# Define some custom warnings.
class GuessedAtParserWarning(UserWarning):
    """Warning category for the case where BeautifulSoup chose a parser itself.

    This is issued when the constructor had to guess which parser to use,
    most likely because the caller did not name one.
    """
2020-01-30 01:07:26 +00:00
2022-01-24 04:07:52 +00:00
class MarkupResemblesLocatorWarning(UserWarning):
    """Warning category for 'markup' that looks like a resource locator.

    This is issued when the text handed to BeautifulSoup appears to be a
    URL or a path to a file on disk rather than an actual document.
    """
2022-11-07 18:06:49 +00:00
2022-01-24 04:07:52 +00:00
class BeautifulSoup(Tag):
    """A data structure representing a parsed HTML or XML document.

    Most of the methods you'll call on a BeautifulSoup object are inherited
    from PageElement or Tag.

    Internally, this class defines the basic interface called by the
    tree builders when converting an HTML/XML document into a data
    structure. The interface abstracts away the differences between
    parsers. To write a new tree builder, you'll need to understand
    these methods as a whole.

    These methods will be called by the BeautifulSoup constructor:
      * reset()
      * feed(markup)

    The tree builder may call these methods from its feed() implementation:
      * handle_starttag(name, attrs)  # See note about return value
      * handle_endtag(name)
      * handle_data(data)  # Appends to the current data node
      * endData(containerClass)  # Ends the current data node

    No matter how complicated the underlying parser is, you should be
    able to build a tree using 'start tag' events, 'end tag' events,
    'data' events, and "done with data" events.

    If you encounter an empty-element tag (aka a self-closing tag,
    like HTML's <br> tag), call handle_starttag and then
    handle_endtag.
    """

    # Since BeautifulSoup subclasses Tag, it's possible to treat it as
    # a Tag with a .name. This name makes it clear the BeautifulSoup
    # object isn't a real markup tag.
    ROOT_TAG_NAME = ' [document] '

    # If the end-user gives no indication which tree builder they
    # want, look for one with these features.
    DEFAULT_BUILDER_FEATURES = [' html ', ' fast ']

    # A string containing all ASCII whitespace characters, used in
    # endData() to detect data chunks that seem 'empty'.
    ASCII_SPACES = ' \x20 \x0a \x09 \x0c \x0d '

    # %-style template for the GuessedAtParserWarning message. The
    # constructor fills in markup_type, parser, line_number and filename
    # from a dict before issuing the warning.
    NO_PARSER_SPECIFIED_WARNING = " No parser was explicitly specified, so I ' m using the best available %(markup_type)s parser for this system ( \" %(parser)s \" ). This usually isn ' t a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently. \n \n The code that caused this warning is on line %(line_number)s of the file %(filename)s . To get rid of this warning, pass the additional argument ' features= \" %(parser)s \" ' to the BeautifulSoup constructor. \n "
2022-01-24 04:07:52 +00:00
2020-01-30 01:07:26 +00:00
def __init__(self, markup=" ", features=None, builder=None,
             parse_only=None, from_encoding=None, exclude_encodings=None,
             element_classes=None, **kwargs):
    """Constructor.

    :param markup: A string or a file-like object representing
     markup to be parsed.

    :param features: Desirable features of the parser to be
     used. This may be the name of a specific parser ("lxml",
     "lxml-xml", "html.parser", or "html5lib") or it may be the
     type of markup to be used ("html", "html5", "xml"). It's
     recommended that you name a specific parser, so that
     Beautiful Soup gives you the same results across platforms
     and virtual environments.

    :param builder: A TreeBuilder subclass to instantiate (or
     instance to use) instead of looking one up based on
     `features`. You only need to use this if you've implemented a
     custom TreeBuilder.

    :param parse_only: A SoupStrainer. Only parts of the document
     matching the SoupStrainer will be considered. This is useful
     when parsing part of a document that would otherwise be too
     large to fit into memory.

    :param from_encoding: A string indicating the encoding of the
     document to be parsed. Pass this in if Beautiful Soup is
     guessing wrongly about the document's encoding.

    :param exclude_encodings: A list of strings indicating
     encodings known to be wrong. Pass this in if you don't know
     the document's encoding but you know Beautiful Soup's guess is
     wrong.

    :param element_classes: A dictionary mapping BeautifulSoup
     classes like Tag and NavigableString, to other classes you'd
     like to be instantiated instead as the parse tree is
     built. This is useful for subclassing Tag or NavigableString
     to modify default behavior.

    :param kwargs: For backwards compatibility purposes, the
     constructor accepts certain keyword arguments used in
     Beautiful Soup 3. None of these arguments do anything in
     Beautiful Soup 4; they will result in a warning and then be
     ignored.

     Apart from this, any keyword arguments passed into the
     BeautifulSoup constructor are propagated to the TreeBuilder
     constructor. This makes it possible to configure a
     TreeBuilder by passing in arguments, not just by saying which
     one to use.

    :raise FeatureNotFound: If no tree builder matches `features`.
    :raise TypeError: If `markup` is neither a string, a bytestring,
     a file-like object, nor anything else with a length.
    :raise ParserRejectedMarkup: If the builder rejected every
     candidate version of the markup.
    """
    # Beautiful Soup 3 arguments: warn, then drop them so they are not
    # forwarded to the TreeBuilder constructor.
    if ' convertEntities ' in kwargs:
        del kwargs[' convertEntities ']
        warnings.warn(
            " BS4 does not respect the convertEntities argument to the "
            " BeautifulSoup constructor. Entities are always converted "
            " to Unicode characters. ")

    if ' markupMassage ' in kwargs:
        del kwargs[' markupMassage ']
        warnings.warn(
            " BS4 does not respect the markupMassage argument to the "
            " BeautifulSoup constructor. The tree builder is responsible "
            " for any necessary markup massage. ")

    if ' smartQuotesTo ' in kwargs:
        del kwargs[' smartQuotesTo ']
        warnings.warn(
            " BS4 does not respect the smartQuotesTo argument to the "
            " BeautifulSoup constructor. Smart quotes are always converted "
            " to Unicode characters. ")

    if ' selfClosingTags ' in kwargs:
        del kwargs[' selfClosingTags ']
        warnings.warn(
            " BS4 does not respect the selfClosingTags argument to the "
            " BeautifulSoup constructor. The tree builder is responsible "
            " for understanding self-closing tags. ")

    if ' isHTML ' in kwargs:
        del kwargs[' isHTML ']
        warnings.warn(
            " BS4 does not respect the isHTML argument to the "
            " BeautifulSoup constructor. Suggest you use "
            " features= ' lxml ' for HTML and features= ' lxml-xml ' for "
            " XML. ")

    def deprecated_argument(old_name, new_name):
        """Pop a renamed keyword argument, warning about the old name."""
        if old_name in kwargs:
            warnings.warn(
                ' The " %s " argument to the BeautifulSoup constructor '
                ' has been renamed to " %s . " ' % (old_name, new_name),
                DeprecationWarning,
                # Frames: this helper -> __init__ -> the caller. Point the
                # warning at the caller so default filters can show it.
                stacklevel=3
            )
            return kwargs.pop(old_name)
        return None

    parse_only = parse_only or deprecated_argument(
        " parseOnlyThese ", " parse_only ")

    from_encoding = from_encoding or deprecated_argument(
        " fromEncoding ", " from_encoding ")

    if from_encoding and isinstance(markup, str):
        warnings.warn(" You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored. ")
        from_encoding = None

    self.element_classes = element_classes or dict()

    # We need this information to track whether or not the builder
    # was specified well enough that we can omit the 'you need to
    # specify a parser' warning.
    original_builder = builder
    original_features = features

    if isinstance(builder, type):
        # A builder class was passed in; it needs to be instantiated.
        builder_class = builder
        builder = None
    elif builder is None:
        if isinstance(features, str):
            features = [features]
        if features is None or len(features) == 0:
            features = self.DEFAULT_BUILDER_FEATURES
        builder_class = builder_registry.lookup(*features)
        if builder_class is None:
            raise FeatureNotFound(
                " Couldn ' t find a tree builder with the features you "
                " requested: %s . Do you need to install a parser library? "
                % " , ".join(features))

    # At this point either we have a TreeBuilder instance in
    # builder, or we have a builder_class that we can instantiate
    # with the remaining **kwargs.
    if builder is None:
        builder = builder_class(**kwargs)
        if not original_builder and not (
                original_features == builder.NAME or
                original_features in builder.ALTERNATE_NAMES
        ) and markup:
            # The user did not tell us which TreeBuilder to use,
            # and we had to guess. Issue a warning.
            if builder.is_xml:
                markup_type = " XML "
            else:
                markup_type = " HTML "

            # This code adapted from warnings.py so that we get the same line
            # of code as our warnings.warn() call gets, even if the answer is wrong
            # (as it may be in a multithreading situation).
            caller = None
            try:
                caller = sys._getframe(1)
            except ValueError:
                pass
            if caller:
                caller_globals = caller.f_globals
                line_number = caller.f_lineno
            else:
                caller_globals = sys.__dict__
                line_number = 1
            filename = caller_globals.get(' __file__ ')
            if filename:
                fnl = filename.lower()
                if fnl.endswith((" .pyc ", " .pyo ")):
                    filename = filename[:-1]
            if filename:
                # If there is no filename at all, the user is most likely in a REPL,
                # and the warning is not necessary.
                values = dict(
                    filename=filename,
                    line_number=line_number,
                    parser=builder.NAME,
                    markup_type=markup_type
                )
                warnings.warn(
                    self.NO_PARSER_SPECIFIED_WARNING % values,
                    GuessedAtParserWarning, stacklevel=2
                )
    else:
        if kwargs:
            warnings.warn(" Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`. ")

    self.builder = builder
    self.is_xml = builder.is_xml
    self.known_xml = self.is_xml
    self._namespaces = dict()
    self.parse_only = parse_only

    if hasattr(markup, ' read '):        # It's a file-type object.
        markup = markup.read()
    elif not isinstance(markup, (bytes, str)) and not hasattr(markup, '__len__'):
        # Fail with a clear message instead of the opaque TypeError
        # that len() would raise on the next branch.
        raise TypeError(
            "Incoming markup is of an invalid type: %r. Markup must be "
            "a string, a bytestring, or an open filehandle." % (markup,)
        )
    elif len(markup) <= 256 and (
            (isinstance(markup, bytes) and b' < ' not in markup)
            or (isinstance(markup, str) and ' < ' not in markup)
    ):
        # Issue warnings for a couple beginner problems
        # involving passing non-markup to Beautiful Soup.
        # Beautiful Soup will still parse the input as markup,
        # since that is sometimes the intended behavior.
        if not self._markup_is_url(markup):
            self._markup_resembles_filename(markup)

    # Try each candidate (markup, encoding) combination offered by the
    # builder until one of them parses without being rejected.
    rejections = []
    success = False
    for (self.markup, self.original_encoding, self.declared_html_encoding,
         self.contains_replacement_characters) in (
             self.builder.prepare_markup(
                 markup, from_encoding, exclude_encodings=exclude_encodings)):
        self.reset()
        self.builder.initialize_soup(self)
        try:
            self._feed()
            success = True
            break
        except ParserRejectedMarkup as e:
            rejections.append(e)

    if not success:
        other_exceptions = [str(e) for e in rejections]
        raise ParserRejectedMarkup(
            " The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help. \n \n Original exception(s) from parser: \n " + " \n ".join(other_exceptions)
        )

    # Clear out the markup and remove the builder's circular
    # reference to this object.
    self.markup = None
    self.builder.soup = None
def __copy__(self):
    """Duplicate this object by serializing the document and re-parsing it.

    The tree is encoded, then fed to a new object of the same class that
    reuses this object's builder.

    :return: The newly parsed copy.
    """
    serialized = self.encode(' utf-8 ')
    duplicate = type(self)(
        serialized, builder=self.builder, from_encoding=' utf-8 '
    )

    # The duplicate was parsed from the re-encoded bytes, which may not
    # be in the encoding of the original markup. Carry over the
    # original object's .original_encoding so that it is reported
    # instead.
    duplicate.original_encoding = self.original_encoding
    return duplicate
def __getstate__(self):
    """Return a copy of the instance dict that is suitable for pickling.

    A tree builder frequently can't be pickled, so the builder entry is
    replaced with None when the builder reports itself as not picklable.
    """
    state = dict(self.__dict__)
    stored_builder = state.get(' builder ')
    if stored_builder is not None and not self.builder.picklable:
        state[' builder '] = None
    return state
2022-11-07 18:06:49 +00:00
2022-01-24 04:07:52 +00:00
@classmethod
def _decode_markup(cls, markup):
    """Return `markup` as text so it's safe to send into warnings.warn.

    A bytestring is decoded with the 'replace' error handler; any other
    value is handed back unchanged.

    TODO: warnings.warn had this problem back in 2010 but it might not
    anymore.
    """
    if not isinstance(markup, bytes):
        return markup
    return markup.decode(' utf-8 ', ' replace ')
@classmethod
def _markup_is_url(cls, markup):
    """Issue a warning if the incoming markup looks like a URL.

    :param markup: A string or bytestring. Any other type is never
        considered a URL.
    :return: Whether or not the markup resembles a URL
        closely enough to justify a warning.
    """
    if isinstance(markup, bytes):
        blank = b' '
        url_prefixes = (b" http: ", b" https: ")
    elif isinstance(markup, str):
        blank = ' '
        url_prefixes = (" http: ", " https: ")
    else:
        return False

    # Only warn when the input starts with one of the prefixes and does
    # not contain the blank separator.
    if not markup.startswith(url_prefixes) or blank in markup:
        return False

    warnings.warn(
        ' The input looks more like a URL than markup. You may want to use '
        ' an HTTP client like requests to get the document behind '
        ' the URL, and feed that document to Beautiful Soup. ',
        MarkupResemblesLocatorWarning
    )
    return True
@classmethod
def _markup_resembles_filename(cls, markup):
    """Issue a warning if the incoming markup resembles a filename.

    The markup counts as filename-like when it contains one of the path
    characters, or when its lowercased form ends with one of the known
    extensions.

    :param markup: A bytestring or string.
    :return: Whether or not the markup resembles a filename
        closely enough to justify a warning.
    """
    separators = ' / \\ '
    suffixes = [' .html ', ' .htm ', ' .xml ', ' .xhtml ', ' .txt ']
    if isinstance(markup, bytes):
        # Compare bytes against bytes.
        separators = separators.encode(" utf8 ")
        suffixes = [suffix.encode(' utf8 ') for suffix in suffixes]

    looks_like_file = any(char in markup for char in separators)
    if not looks_like_file:
        lowered = markup.lower()
        looks_like_file = any(lowered.endswith(suffix) for suffix in suffixes)

    if not looks_like_file:
        return False

    warnings.warn(
        ' The input looks more like a filename than markup. You may '
        ' want to open this file and pass the filehandle into '
        ' Beautiful Soup. ',
        MarkupResemblesLocatorWarning
    )
    return True
2020-01-30 01:07:26 +00:00
def _feed(self):
    """Parse the markup stored in self.markup into the tree.

    The builder is reset and fed the markup; it is expected to call back
    into this object to create the Tag and NavigableString objects.
    """
    tree_builder = self.builder
    tree_builder.reset()
    tree_builder.feed(self.markup)

    # Flush any text still being collected, then close every tag that
    # is still open until only the document root remains.
    self.endData()
    root_name = self.ROOT_TAG_NAME
    while self.currentTag.name != root_name:
        self.popTag()
def reset(self):
    """Reset this object to a state as though it had never parsed any
    markup.
    """
    # Re-run Tag's initializer on this object, naming it ROOT_TAG_NAME.
    Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
    self.hidden = 1
    self.builder.reset()
    # Text chunks appended by handle_data() and flushed by endData().
    self.current_data = []
    self.currentTag = None
    # Stack of currently open tags; pushTag()/popTag() maintain it.
    self.tagStack = []
    # Number of open tags per tag name; _popToTag() consults it to stop
    # early when no tag with the requested name is open.
    self.open_tag_counter = Counter()
    self.preserve_whitespace_tag_stack = []
    self.string_container_stack = []
    # Must come last: pushTag() uses the stacks and counter set up above.
    # The document itself becomes the first entry on the tag stack.
    self.pushTag(self)
2022-01-24 04:07:52 +00:00
def new_tag(self, name, namespace=None, nsprefix=None, attrs=None,
            sourceline=None, sourcepos=None, **kwattrs):
    """Create a new Tag associated with this BeautifulSoup object.

    :param name: The name of the new Tag.
    :param namespace: The URI of the new Tag's XML namespace, if any.
    :param nsprefix: The prefix for the new Tag's XML namespace, if any.
    :param attrs: A dictionary of this Tag's attribute values; can
        be used instead of `kwattrs` for attributes like 'class'
        that are reserved words in Python. Entries here override
        same-named entries in `kwattrs`.
    :param sourceline: The line number where this tag was
        (purportedly) found in its source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was (purportedly) found.
    :param kwattrs: Keyword arguments for the new Tag's attribute values.
    :return: An instance of Tag, or of the class registered for Tag in
        `element_classes`.
    """
    # `attrs` defaults to None rather than a shared mutable dict.
    if attrs is not None:
        kwattrs.update(attrs)
    tag_class = self.element_classes.get(Tag, Tag)
    return tag_class(
        None, self.builder, name, namespace, nsprefix, kwattrs,
        sourceline=sourceline, sourcepos=sourcepos
    )
2020-01-30 01:07:26 +00:00
2022-01-24 04:07:52 +00:00
def string_container(self, base_class=None):
    """Pick the class used to hold a string in the parse tree.

    :param base_class: The requested string class; NavigableString is
        used when this is not given.
    :return: The class to instantiate, after applying `element_classes`
        overrides and the builder's per-tag string containers.
    """
    requested = base_class or NavigableString

    # There may be a general override of the requested class.
    resolved = self.element_classes.get(requested, requested)

    # On top of that, we may be inside a tag that needs a special
    # container class. That only applies to plain NavigableString.
    if resolved is not NavigableString or not self.string_container_stack:
        return resolved
    enclosing_tag = self.string_container_stack[-1]
    return self.builder.string_containers.get(enclosing_tag.name, resolved)
def new_string(self, s, subclass=None):
    """Create a new string object associated with this BeautifulSoup
    object.

    :param s: The value to wrap.
    :param subclass: Requested container class, resolved through
        string_container().
    :return: An instance of the resolved container class built from `s`.
    """
    return self.string_container(subclass)(s)
def insert_before(self, *args):
    """Always raise: nothing can be placed before the document itself.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.

    :raise NotImplementedError: Unconditionally.
    """
    message = " BeautifulSoup objects don ' t support insert_before(). "
    raise NotImplementedError(message)
2022-01-24 04:07:52 +00:00
def insert_after(self, *args):
    """Always raise: nothing can be placed after the document itself.

    This method is part of the PageElement API, but `BeautifulSoup`
    doesn't implement it because there is nothing before or after it
    in the parse tree.

    :raise NotImplementedError: Unconditionally.
    """
    message = " BeautifulSoup objects don ' t support insert_after(). "
    raise NotImplementedError(message)
def popTag(self):
    """Internal method called by _popToTag when a tag is closed.

    :return: The tag that is current once the pop is done (the new top
        of the tag stack, if the stack is not empty).
    """
    closed = self.tagStack.pop()

    counter = self.open_tag_counter
    if closed.name in counter:
        counter[closed.name] -= 1

    # If the closed tag is on top of one of the bookkeeping stacks,
    # remove it from there as well.
    for stack in (self.preserve_whitespace_tag_stack,
                  self.string_container_stack):
        if stack and closed == stack[-1]:
            stack.pop()

    if self.tagStack:
        self.currentTag = self.tagStack[-1]
    return self.currentTag
def pushTag(self, tag):
    """Internal method called by handle_starttag when a tag is opened.

    :param tag: The tag to make current. It is appended to the contents
        of the previously current tag, if there is one.
    """
    parent = self.currentTag
    if parent is not None:
        parent.contents.append(tag)
    self.tagStack.append(tag)
    self.currentTag = self.tagStack[-1]

    tag_name = tag.name
    if tag_name != self.ROOT_TAG_NAME:
        self.open_tag_counter[tag_name] += 1

    # Track tags that the builder marks as whitespace-preserving or as
    # needing a special string container.
    tree_builder = self.builder
    if tag_name in tree_builder.preserve_whitespace_tags:
        self.preserve_whitespace_tag_stack.append(tag)
    if tag_name in tree_builder.string_containers:
        self.string_container_stack.append(tag)
2020-01-30 01:07:26 +00:00
2022-01-24 04:07:52 +00:00
def endData(self, containerClass=None):
    """Method called by the TreeBuilder when the end of a data segment
    occurs.

    Joins the collected data chunks into one string, wraps it in a
    container class and adds it to the tree, unless an active
    `parse_only` filter rejects it.

    :param containerClass: Requested string class, resolved through
        string_container().
    """
    if not self.current_data:
        return

    text = ' '.join(self.current_data)

    # If whitespace is not preserved, and this string contains
    # nothing but ASCII spaces, replace it with a single space
    # or newline.
    if not self.preserve_whitespace_tag_stack:
        if all(char in self.ASCII_SPACES for char in text):
            text = ' \n ' if ' \n ' in text else ' '

    # Reset the data collector.
    self.current_data = []

    # Should we add this string to the tree at all?
    strainer = self.parse_only
    if strainer and len(self.tagStack) <= 1 and (
            not strainer.text or not strainer.search(text)):
        return

    containerClass = self.string_container(containerClass)
    self.object_was_parsed(containerClass(text))
def object_was_parsed(self, o, parent=None, most_recent_element=None):
    """Method called by the TreeBuilder to integrate an object into the parse tree.

    :param o: The object to add to the tree.
    :param parent: The element to append `o` to. Defaults to the
        current tag.
    :param most_recent_element: The element to use as `o`'s previous
        element. Defaults to self._most_recent_element.
    """
    if parent is None:
        parent = self.currentTag
    if most_recent_element is not None:
        previous_element = most_recent_element
    else:
        previous_element = self._most_recent_element

    next_element = previous_sibling = next_sibling = None
    if isinstance(o, Tag):
        # A Tag may already carry links; keep them rather than
        # overwriting them with None.
        next_element = o.next_element
        next_sibling = o.next_sibling
        previous_sibling = o.previous_sibling
        if previous_element is None:
            previous_element = o.previous_element

    # Record, before `o` is attached, whether the parent already had a
    # next element.
    fix = parent.next_element is not None

    o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)

    self._most_recent_element = o
    parent.contents.append(o)

    # Check if we are inserting into an already parsed node.
    if fix:
        self._linkage_fixer(parent)
def _linkage_fixer(self, el):
    """Make sure linkage of this fragment is sound.

    :param el: The element whose most recently appended child (the last
        entry of `el.contents`) needs its element/sibling links repaired.
    """
    first = el.contents[0]
    child = el.contents[-1]
    descendant = child

    if child is first and el.parent is not None:
        # Parent should be linked to first child
        el.next_element = child
        # We are no longer linked to whatever this element is
        prev_el = child.previous_element
        if prev_el is not None and prev_el is not el:
            prev_el.next_element = None
        # First child should be linked to the parent, and no previous siblings.
        child.previous_element = el
        child.previous_sibling = None

    # We have no sibling as we've been appended as the last.
    child.next_sibling = None

    # This index is a tag, dig deeper for a "last descendant"
    if isinstance(child, Tag) and child.contents:
        descendant = child._last_descendant(False)

    # As the final step, link last descendant. It should be linked
    # to the parent's next sibling (if found), else walk up the chain
    # and find a parent with a sibling. It should have no next sibling.
    descendant.next_element = None
    descendant.next_sibling = None
    target = el
    while True:
        if target is None:
            break
        elif target.next_sibling is not None:
            descendant.next_element = target.next_sibling
            # NOTE(review): this back-link points at `child`, not at
            # `descendant`; confirm that is intended when the child has
            # descendants of its own.
            target.next_sibling.previous_element = child
            break
        target = target.parent
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
    """Pops the tag stack up to and including the most recent
    instance of the given tag.

    If there are no open tags with the given name, nothing will be
    popped.

    :param name: Pop up to the most recent tag with this name.
    :param nsprefix: The namespace prefix that goes with `name`.
    :param inclusivePop: If this is false, pops the tag stack up
      to but *not* including the most recent instance of the
      given tag.
    :return: The value returned by the last popTag() call made here
      (popTag() returns the tag that is current after the pop), or
      None if nothing was popped.
    """
    if name == self.ROOT_TAG_NAME:
        # The BeautifulSoup object itself can never be popped.
        return

    most_recently_popped = None

    stack_size = len(self.tagStack)
    # Walk from the top of the stack downwards; index 0 is never visited.
    for i in range(stack_size - 1, 0, -1):
        # Stop as soon as no tag with this name is open.
        if not self.open_tag_counter.get(name):
            break
        t = self.tagStack[i]
        if (name == t.name and nsprefix == t.prefix):
            if inclusivePop:
                most_recently_popped = self.popTag()
            break
        most_recently_popped = self.popTag()

    return most_recently_popped
2022-01-24 04:07:52 +00:00
def handle_starttag(self, name, namespace, nsprefix, attrs, sourceline=None,
                    sourcepos=None, namespaces=None):
    """Called by the tree builder when a new tag is encountered.

    :param name: Name of the tag.
    :param namespace: Namespace of the tag; passed through to the Tag
        constructor.
    :param nsprefix: Namespace prefix for the tag.
    :param attrs: A dictionary of attribute values.
    :param sourceline: The line number where this tag was found in its
        source document.
    :param sourcepos: The character position within `sourceline` where this
        tag was found.
    :param namespaces: A dictionary of all namespace prefix mappings
        currently in scope in the document.

    :return: The new tag, after it has been pushed onto the tag stack.

    If this method returns None, the tag was rejected by an active
    SoupStrainer. You should proceed as if the tag had not occurred
    in the document. For instance, if this was a self-closing tag,
    don't call handle_endtag.
    """
    # Flush any text collected before this tag started.
    self.endData()

    # With an active SoupStrainer, a tag opened directly under the
    # document root is dropped when the strainer is text-based or does
    # not match it.
    if (self.parse_only and len(self.tagStack) <= 1
        and (self.parse_only.text
             or not self.parse_only.search_tag(name, attrs))):
        return None

    tag = self.element_classes.get(Tag, Tag)(
        self, self.builder, name, namespace, nsprefix, attrs,
        self.currentTag, self._most_recent_element,
        sourceline=sourceline, sourcepos=sourcepos,
        namespaces=namespaces
    )
    if tag is None:
        return tag
    if self._most_recent_element is not None:
        self._most_recent_element.next_element = tag
    self._most_recent_element = tag
    self.pushTag(tag)
    return tag
def handle_endtag(self, name, nsprefix=None):
    """Called by the tree builder when an ending tag is encountered.

    Flushes any pending text, then pops the tag stack back to the most
    recent open tag with the given name and prefix.

    :param name: Name of the tag.
    :param nsprefix: Namespace prefix for the tag.
    """
    # Pending text belongs inside the tag that is being closed, so it
    # has to be flushed before the stack is popped.
    self.endData()
    self._popToTag(name, nsprefix)
2022-11-07 18:06:49 +00:00
2020-01-30 01:07:26 +00:00
def handle_data(self, data):
    """Called by the tree builder when a chunk of textual data is encountered.

    The chunk is only queued here; endData() later joins the queued
    chunks into a single string.

    :param data: The chunk of text to queue.
    """
    pending_chunks = self.current_data
    pending_chunks.append(data)
2022-01-24 04:07:52 +00:00
2020-01-30 01:07:26 +00:00
def decode(self, pretty_print=False,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter=" minimal "):
    """Returns a string or Unicode representation of the parse tree
    as an HTML or XML document.

    :param pretty_print: If this is True, indentation will be used to
        make the document more readable.
    :param eventual_encoding: The encoding of the final document.
        If this is None, the document will be a Unicode string.
    :param formatter: Passed through unchanged to Tag.decode().
    :return: The document as a string. For XML documents it starts
        with an XML declaration.
    """
    if self.is_xml:
        # Print the XML declaration
        encoding_part = ' '
        if eventual_encoding in PYTHON_SPECIFIC_ENCODINGS:
            # This is a special Python encoding; it can't actually
            # go into an XML document because it means nothing
            # outside of Python.
            eventual_encoding = None
        if eventual_encoding is not None:
            encoding_part = ' encoding= " %s " ' % eventual_encoding
        prefix = ' <?xml version= " 1.0 " %s ?> \n ' % encoding_part
    else:
        prefix = ' '

    # Tag.decode() takes an indent level: None turns pretty-printing
    # off, 0 starts it at the left margin.
    indent_level = 0 if pretty_print else None
    return prefix + super(BeautifulSoup, self).decode(
        indent_level, eventual_encoding, formatter)
2022-01-24 04:07:52 +00:00
# Aliases to make it easier to get started quickly, e.g. 'from bs4 import _soup'.
# Both names refer to the BeautifulSoup class itself.
_s = BeautifulSoup
_soup = BeautifulSoup
class BeautifulStoneSoup(BeautifulSoup):
    """Deprecated interface to an XML parser."""

    def __init__(self, *args, **kwargs):
        """Build a BeautifulSoup object that always uses the XML features.

        Any `features` value supplied by the caller is overwritten.
        A DeprecationWarning is issued on every instantiation.

        :param args: Positional arguments for the BeautifulSoup constructor.
        :param kwargs: Keyword arguments for the BeautifulSoup constructor.
        """
        kwargs[' features '] = ' xml '
        warnings.warn(
            ' The BeautifulStoneSoup class is deprecated. Instead of using '
            ' it, pass features= " xml " into the BeautifulSoup constructor. ',
            DeprecationWarning,
            # Attribute the warning to the code that instantiated this
            # class, not to this module.
            stacklevel=2
        )
        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
class StopParsing(Exception):
    """Raised by a TreeBuilder when it is unable to continue parsing."""
class FeatureNotFound(ValueError):
    """Raised by the BeautifulSoup constructor when no parser offering the
    requested features can be found.
    """
2022-01-24 04:07:52 +00:00
# When this file is executed as a script, behave as an HTML
# pretty-printer: parse standard input and print the prettified result.
if __name__ == ' __main__ ':
    import sys
    soup = BeautifulSoup(sys.stdin)
    prettified = soup.prettify()
    print(prettified)