From 69eeb9b49dd914be5a60c486f442200133df9c69 Mon Sep 17 00:00:00 2001
From: Brian Hartvigsen
Date: Tue, 23 Aug 2016 00:01:41 -0600
Subject: [PATCH] Put libs/ in sys.path

Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory
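For review convenience, the runtime side of this is a one-line path tweak
made before any vendored package is imported. A minimal sketch of the
pattern (the real change is the Mylar.py hunk below; the `beautifulsoup4`
package name in the pip example is only illustrative):

    import os
    import sys

    # Make the bundled packages under lib/ importable as top-level names,
    # after the script's own directory but ahead of site-packages.
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), 'lib'))

    import cherrypy  # now resolves to lib/cherrypy
    import bs4       # now resolves to lib/bs4

A vendored copy is refreshed the same way it was created, e.g.
`pip install --upgrade --target=lib beautifulsoup4`.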
---
 Mylar.py | 9 +-
 bs4/__init__.py | 355 ---
 bs4/builder/_html5lib.py | 222 --
 bs4/tests/test_html5lib.py | 58 -
 comictagger.py | 2 +-
 lib/apscheduler/__init__.py | 2 +-
 lib/apscheduler/events.py | 0
 lib/apscheduler/job.py | 2 +-
 lib/apscheduler/jobstores/__init__.py | 0
 lib/apscheduler/jobstores/base.py | 0
 lib/apscheduler/jobstores/mongodb_store.py | 4 +-
 lib/apscheduler/jobstores/ram_store.py | 2 +-
 lib/apscheduler/jobstores/shelve_store.py | 6 +-
 lib/apscheduler/jobstores/sqlalchemy_store.py | 4 +-
 lib/apscheduler/scheduler.py | 12 +-
 lib/apscheduler/threadpool.py | 0
 lib/apscheduler/triggers/__init__.py | 6 +-
 lib/apscheduler/triggers/cron/__init__.py | 4 +-
 lib/apscheduler/triggers/cron/expressions.py | 2 +-
 lib/apscheduler/triggers/cron/fields.py | 2 +-
 lib/apscheduler/triggers/interval.py | 2 +-
 lib/apscheduler/triggers/simple.py | 2 +-
 lib/apscheduler/util.py | 70 +-
 lib/bs4/__init__.py | 529 ++++
 {bs4 => lib/bs4}/builder/__init__.py | 30 +-
 lib/bs4/builder/_html5lib.py | 356 +++
 {bs4 => lib/bs4}/builder/_htmlparser.py | 73 +-
 lib/bs4/builder/_lxml.py | 258 ++
 {bs4 => lib/bs4}/dammit.py | 392 +--
 lib/bs4/diagnose.py | 219 ++
 {bs4 => lib/bs4}/element.py | 756 +++--
 {bs4 => lib/bs4}/testing.py | 193 +-
 {bs4 => lib/bs4}/tests/__init__.py | 0
 .../bs4}/tests/test_builder_registry.py | 14 +-
 {bs4 => lib/bs4}/tests/test_docs.py | 0
 lib/bs4/tests/test_html5lib.py | 109 +
 {bs4 => lib/bs4}/tests/test_htmlparser.py | 13 +
 {bs4 => lib/bs4}/tests/test_lxml.py | 39 +-
 {bs4 => lib/bs4}/tests/test_soup.py | 219 +-
 {bs4 => lib/bs4}/tests/test_tree.py | 423 ++-
 {cherrypy => lib/cherrypy}/LICENSE.txt | 0
 {cherrypy => lib/cherrypy}/__init__.py | 0
 {cherrypy => lib/cherrypy}/_cpchecker.py | 0
 {cherrypy => lib/cherrypy}/_cpcompat.py | 0
 {cherrypy => lib/cherrypy}/_cpconfig.py | 0
 {cherrypy => lib/cherrypy}/_cpdispatch.py | 0
 {cherrypy => lib/cherrypy}/_cperror.py | 0
 {cherrypy => lib/cherrypy}/_cplogging.py | 0
 {cherrypy => lib/cherrypy}/_cpmodpy.py | 0
 .../cherrypy}/_cpnative_server.py | 0
 {cherrypy => lib/cherrypy}/_cpreqbody.py | 0
 {cherrypy => lib/cherrypy}/_cprequest.py | 0
 {cherrypy => lib/cherrypy}/_cpserver.py | 0
 .../cherrypy}/_cpthreadinglocal.py | 0
 {cherrypy => lib/cherrypy}/_cptools.py | 0
 {cherrypy => lib/cherrypy}/_cptree.py | 0
 {cherrypy => lib/cherrypy}/_cpwsgi.py | 0
 {cherrypy => lib/cherrypy}/_cpwsgi_server.py | 0
 {cherrypy => lib/cherrypy}/cherryd | 0
 {cherrypy => lib/cherrypy}/favicon.ico | Bin
 {cherrypy => lib/cherrypy}/lib/__init__.py | 0
 {cherrypy => lib/cherrypy}/lib/auth.py | 0
 {cherrypy => lib/cherrypy}/lib/auth_basic.py | 0
 {cherrypy => lib/cherrypy}/lib/auth_digest.py | 0
 {cherrypy => lib/cherrypy}/lib/caching.py | 0
 {cherrypy => lib/cherrypy}/lib/covercp.py | 0
 {cherrypy => lib/cherrypy}/lib/cpstats.py | 0
 {cherrypy => lib/cherrypy}/lib/cptools.py | 0
 {cherrypy => lib/cherrypy}/lib/encoding.py | 0
 {cherrypy => lib/cherrypy}/lib/http.py | 0
 {cherrypy => lib/cherrypy}/lib/httpauth.py | 0
 {cherrypy => lib/cherrypy}/lib/httputil.py | 0
 {cherrypy => lib/cherrypy}/lib/jsontools.py | 0
 {cherrypy => lib/cherrypy}/lib/profiler.py | 0
 {cherrypy => lib/cherrypy}/lib/reprconf.py | 0
 {cherrypy => lib/cherrypy}/lib/sessions.py | 0
 {cherrypy => lib/cherrypy}/lib/static.py | 0
 {cherrypy => lib/cherrypy}/lib/xmlrpc.py | 0
 .../cherrypy}/process/__init__.py | 0
 {cherrypy => lib/cherrypy}/process/plugins.py | 0
 {cherrypy => lib/cherrypy}/process/servers.py | 0
 {cherrypy => lib/cherrypy}/process/win32.py | 0
 {cherrypy => lib/cherrypy}/process/wspbus.py | 0
 .../cherrypy}/scaffold/__init__.py | 0
 .../cherrypy}/scaffold/apache-fcgi.conf | 0
 .../cherrypy}/scaffold/example.conf | 0
 {cherrypy => lib/cherrypy}/scaffold/site.conf | 0
 .../static/made_with_cherrypy_small.png | Bin
 {cherrypy => lib/cherrypy}/test/__init__.py | 0
 .../cherrypy}/test/_test_decorators.py | 0
 .../cherrypy}/test/_test_states_demo.py | 0
 {cherrypy => lib/cherrypy}/test/benchmark.py | 0
 .../cherrypy}/test/checkerdemo.py | 0
 {cherrypy => lib/cherrypy}/test/fastcgi.conf | 0
 {cherrypy => lib/cherrypy}/test/fcgi.conf | 0
 {cherrypy => lib/cherrypy}/test/helper.py | 0
 {cherrypy => lib/cherrypy}/test/logtest.py | 0
 {cherrypy => lib/cherrypy}/test/modfastcgi.py | 0
 {cherrypy => lib/cherrypy}/test/modfcgid.py | 0
 {cherrypy => lib/cherrypy}/test/modpy.py | 0
 {cherrypy => lib/cherrypy}/test/modwsgi.py | 0
 .../cherrypy}/test/native-server.ini | 0
 .../cherrypy}/test/sessiondemo.py | 0
 .../cherrypy}/test/static/dirback.jpg | Bin
 .../cherrypy}/test/static/index.html | 0
 {cherrypy => lib/cherrypy}/test/style.css | 0
 {cherrypy => lib/cherrypy}/test/test.pem | 0
 .../cherrypy}/test/test_auth_basic.py | 0
 .../cherrypy}/test/test_auth_digest.py | 0
 {cherrypy => lib/cherrypy}/test/test_bus.py | 0
 .../cherrypy}/test/test_caching.py | 0
 .../cherrypy}/test/test_config.py | 0
 .../cherrypy}/test/test_config_server.py | 0
 {cherrypy => lib/cherrypy}/test/test_conn.py | 0
 {cherrypy => lib/cherrypy}/test/test_core.py | 0
 .../test/test_dynamicobjectmapping.py | 0
 .../cherrypy}/test/test_encoding.py | 0
 {cherrypy => lib/cherrypy}/test/test_etags.py | 0
 {cherrypy => lib/cherrypy}/test/test_http.py | 0
 .../cherrypy}/test/test_httpauth.py | 0
 .../cherrypy}/test/test_httplib.py | 0
 {cherrypy => lib/cherrypy}/test/test_json.py | 0
 .../cherrypy}/test/test_logging.py | 0
 {cherrypy => lib/cherrypy}/test/test_mime.py | 0
 .../cherrypy}/test/test_misc_tools.py | 0
 .../cherrypy}/test/test_objectmapping.py | 0
 {cherrypy => lib/cherrypy}/test/test_proxy.py | 0
 .../cherrypy}/test/test_refleaks.py | 0
 .../cherrypy}/test/test_request_obj.py | 0
 .../cherrypy}/test/test_routes.py | 0
 .../cherrypy}/test/test_session.py | 0
 .../test/test_sessionauthenticate.py | 0
 .../cherrypy}/test/test_states.py | 0
 .../cherrypy}/test/test_static.py | 0
 {cherrypy => lib/cherrypy}/test/test_tools.py | 0
 .../cherrypy}/test/test_tutorials.py | 0
 .../cherrypy}/test/test_virtualhost.py | 0
 .../cherrypy}/test/test_wsgi_ns.py | 0
 .../cherrypy}/test/test_wsgi_vhost.py | 0
 .../cherrypy}/test/test_wsgiapps.py | 0
 .../cherrypy}/test/test_xmlrpc.py | 0
 {cherrypy => lib/cherrypy}/test/webtest.py | 0
 .../cherrypy}/wsgiserver/__init__.py | 0
 .../cherrypy}/wsgiserver/ssl_builtin.py | 0
 .../cherrypy}/wsgiserver/ssl_pyopenssl.py | 0
 lib/comictaggerlib/settings.py | 68 +-
 lib/httplib2/__init__.py | 735 ++++-
 lib/httplib2/iri2uri.py | 0
 lib/httplib2/socks.py | 438 +++
 {mako => lib/mako}/__init__.py | 0
 {mako => lib/mako}/_ast_util.py | 0
 {mako => lib/mako}/ast.py | 0
 {mako => lib/mako}/cache.py | 0
 {mako => lib/mako}/codegen.py | 0
 {mako => lib/mako}/exceptions.py | 0
 lib/{pystun/tests => mako/ext}/__init__.py | 0
 {mako => lib/mako}/ext/autohandler.py | 0
 {mako => lib/mako}/ext/babelplugin.py | 0
 {mako => lib/mako}/ext/preprocessors.py | 0
 {mako => lib/mako}/ext/pygmentplugin.py | 0
 {mako => lib/mako}/ext/turbogears.py | 0
 {mako => lib/mako}/filters.py | 0
 {mako => lib/mako}/lexer.py | 0
 {mako => lib/mako}/lookup.py | 0
 {mako => lib/mako}/parsetree.py | 0
 {mako => lib/mako}/pygen.py | 0
 {mako => lib/mako}/pyparser.py | 0
 {mako => lib/mako}/runtime.py | 0
 {mako => lib/mako}/template.py | 0
 {mako => lib/mako}/util.py | 0
 lib/markupsafe/__init__.py | 298 ++
 lib/markupsafe/_compat.py | 26 +
 lib/markupsafe/_constants.py | 267 ++
 lib/markupsafe/_native.py | 46 +
 lib/markupsafe/_speedups.c | 239 ++
 lib/markupsafe/tests.py | 179 ++
 lib/pystun/README.rst | 93 -
 lib/requests/LICENSE | 13 -
 lib/requests/NOTICE | 54 -
 lib/requests/README.rst | 87 -
 lib/requests/__init__.py | 2 +-
 lib/requests/adapters.py | 0
 lib/requests/api.py | 0
 lib/requests/auth.py | 2 +-
 lib/requests/cacert.pem | 0
 lib/requests/certs.py | 0
 lib/requests/compat.py | 0
 lib/requests/cookies.py | 9 +-
 lib/requests/exceptions.py | 0
 lib/requests/hooks.py | 0
 lib/requests/models.py | 60 +-
 lib/requests/packages/README.rst | 8 -
 lib/requests/packages/__init__.py | 0
 lib/requests/packages/chardet/__init__.py | 0
 lib/requests/packages/chardet/big5freq.py | 0
 lib/requests/packages/chardet/big5prober.py | 0
 lib/requests/packages/chardet/chardetect.py | 0
 .../packages/chardet/chardistribution.py | 0
 .../packages/chardet/charsetgroupprober.py | 0
 .../packages/chardet/charsetprober.py | 0
 .../packages/chardet/codingstatemachine.py | 0
 lib/requests/packages/chardet/compat.py | 0
 lib/requests/packages/chardet/constants.py | 0
 lib/requests/packages/chardet/cp949prober.py | 0
 lib/requests/packages/chardet/escprober.py | 0
 lib/requests/packages/chardet/escsm.py | 0
 lib/requests/packages/chardet/eucjpprober.py | 0
 lib/requests/packages/chardet/euckrfreq.py | 0
 lib/requests/packages/chardet/euckrprober.py | 0
 lib/requests/packages/chardet/euctwfreq.py | 0
 lib/requests/packages/chardet/euctwprober.py | 0
 lib/requests/packages/chardet/gb2312freq.py | 0
 lib/requests/packages/chardet/gb2312prober.py | 0
 lib/requests/packages/chardet/hebrewprober.py | 0
 lib/requests/packages/chardet/jisfreq.py | 0
 lib/requests/packages/chardet/jpcntx.py | 0
 .../packages/chardet/langbulgarianmodel.py | 0
 .../packages/chardet/langcyrillicmodel.py | 0
 .../packages/chardet/langgreekmodel.py | 0
 .../packages/chardet/langhebrewmodel.py | 0
 .../packages/chardet/langhungarianmodel.py | 0
 .../packages/chardet/langthaimodel.py | 0
 lib/requests/packages/chardet/latin1prober.py | 0
 .../packages/chardet/mbcharsetprober.py | 0
 .../packages/chardet/mbcsgroupprober.py | 0
 lib/requests/packages/chardet/mbcssm.py | 0
 .../packages/chardet/sbcharsetprober.py | 0
 .../packages/chardet/sbcsgroupprober.py | 0
 lib/requests/packages/chardet/sjisprober.py | 0
 .../packages/chardet/universaldetector.py | 0
 lib/requests/packages/chardet/utf8prober.py | 0
 lib/requests/packages/urllib3/__init__.py | 0
 lib/requests/packages/urllib3/_collections.py | 0
 lib/requests/packages/urllib3/connection.py | 0
 .../packages/urllib3/connectionpool.py | 0
 .../packages/urllib3/contrib/__init__.py | 0
 .../packages/urllib3/contrib/ntlmpool.py | 0
 .../packages/urllib3/contrib/pyopenssl.py | 0
 lib/requests/packages/urllib3/exceptions.py | 0
 lib/requests/packages/urllib3/fields.py | 0
 lib/requests/packages/urllib3/filepost.py | 0
 .../packages/urllib3/packages/__init__.py | 0
 .../packages/urllib3/packages/ordered_dict.py | 0
 lib/requests/packages/urllib3/packages/six.py | 0
 .../packages/ssl_match_hostname/__init__.py | 0
 .../ssl_match_hostname/_implementation.py | 0
 lib/requests/packages/urllib3/poolmanager.py | 0
 lib/requests/packages/urllib3/request.py | 0
 lib/requests/packages/urllib3/response.py | 0
 .../packages/urllib3/util/__init__.py | 0
 .../packages/urllib3/util/connection.py | 0
 lib/requests/packages/urllib3/util/request.py | 0
 .../packages/urllib3/util/response.py | 0
 lib/requests/packages/urllib3/util/retry.py | 0
 lib/requests/packages/urllib3/util/ssl_.py | 0
 lib/requests/packages/urllib3/util/timeout.py | 0
 lib/requests/packages/urllib3/util/url.py | 0
 lib/requests/sessions.py | 0
 lib/requests/status_codes.py | 0
 lib/requests/structures.py | 0
 lib/requests/utils.py | 0
 lib/simplejson/__init__.py | 0
 lib/simplejson/_speedups.c | 2726 -----------------
 lib/simplejson/decoder.py | 4 +-
 lib/simplejson/encoder.py | 4 +-
 lib/simplejson/ordered_dict.py | 0
 lib/simplejson/scanner.py | 2 +-
 lib/simplejson/tests/__init__.py | 67 +
 lib/simplejson/tests/test_bigint_as_string.py | 55 +
 lib/simplejson/tests/test_check_circular.py | 30 +
 lib/simplejson/tests/test_decimal.py | 55 +
 lib/simplejson/tests/test_decode.py | 83 +
 lib/simplejson/tests/test_default.py | 9 +
 lib/simplejson/tests/test_dump.py | 67 +
 .../tests/test_encode_basestring_ascii.py | 46 +
 lib/simplejson/tests/test_encode_for_html.py | 32 +
 lib/simplejson/tests/test_errors.py | 34 +
 lib/simplejson/tests/test_fail.py | 91 +
 lib/simplejson/tests/test_float.py | 19 +
 lib/simplejson/tests/test_indent.py | 86 +
 lib/simplejson/tests/test_namedtuple.py | 121 +
 lib/simplejson/tests/test_pass1.py | 76 +
 lib/simplejson/tests/test_pass2.py | 14 +
 lib/simplejson/tests/test_pass3.py | 20 +
 lib/simplejson/tests/test_recursion.py | 67 +
 lib/simplejson/tests/test_scanstring.py | 117 +
 lib/simplejson/tests/test_separators.py | 42 +
 lib/simplejson/tests/test_speedups.py | 20 +
 lib/simplejson/tests/test_tuple.py | 49 +
 lib/simplejson/tests/test_unicode.py | 109 +
 lib/simplejson/tool.py | 2 +-
 lib/{transmissionrpc => }/six.py | 0
 lib/{pystun => stun}/__init__.py | 0
 lib/{pystun => stun}/cli.py | 0
 {mako/ext => lib/tests}/__init__.py | 0
 lib/{pystun => }/tests/test_cli.py | 0
 lib/transmissionrpc/__init__.py | 36 +-
 lib/transmissionrpc/client.py | 1862 +++++------
 lib/transmissionrpc/constants.py | 590 ++--
 lib/transmissionrpc/error.py | 108 +-
 lib/transmissionrpc/httphandler.py | 164 +-
 lib/transmissionrpc/session.py | 222 +-
 lib/transmissionrpc/torrent.py | 958 +++---
 lib/transmissionrpc/utils.py | 414 +--
 mylar/__init__.py | 4 +-
 mylar/auth32p.py | 4 +-
 mylar/helpers.py | 2 +-
 mylar/rsscheck.py | 2 +-
 mylar/search.py | 8 +-
 mylar/torrent/clients/rtorrent.py | 2 +-
 mylar/torrent/clients/transmission.py | 2 +-
 311 files changed, 8861 insertions(+), 6577 deletions(-)
 delete mode 100644 bs4/__init__.py
 delete mode 100644 bs4/builder/_html5lib.py
 delete mode 100644 bs4/tests/test_html5lib.py
 mode change 100755 => 100644 lib/apscheduler/__init__.py
 mode change 100755 => 100644 lib/apscheduler/events.py
 mode change 100755 => 100644 lib/apscheduler/job.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/__init__.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/base.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/mongodb_store.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/ram_store.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/shelve_store.py
 mode change 100755 => 100644 lib/apscheduler/jobstores/sqlalchemy_store.py
 mode change 100755 => 100644 lib/apscheduler/scheduler.py
 mode change 100755 => 100644 lib/apscheduler/threadpool.py
 mode change 100755 => 100644 lib/apscheduler/triggers/__init__.py
 mode change 100755 => 100644 lib/apscheduler/triggers/cron/__init__.py
 mode change 100755 => 100644 lib/apscheduler/triggers/cron/expressions.py
 mode change 100755 => 100644 lib/apscheduler/triggers/cron/fields.py
 mode change 100755 => 100644 lib/apscheduler/triggers/interval.py
 mode change 100755 => 100644 lib/apscheduler/triggers/simple.py
 mode change 100755 => 100644 lib/apscheduler/util.py
 create mode 100644 lib/bs4/__init__.py
 rename {bs4 => lib/bs4}/builder/__init__.py (92%)
 create mode 100644 lib/bs4/builder/_html5lib.py
 rename {bs4 => lib/bs4}/builder/_htmlparser.py (79%)
 create mode 100644 lib/bs4/builder/_lxml.py
 rename {bs4 => lib/bs4}/dammit.py (72%)
 create mode 100644 lib/bs4/diagnose.py
 rename {bs4 => lib/bs4}/element.py (64%)
 rename {bs4 => lib/bs4}/testing.py (74%)
 rename {bs4 => lib/bs4}/tests/__init__.py (100%)
 rename {bs4 => lib/bs4}/tests/test_builder_registry.py (93%)
 rename {bs4 => lib/bs4}/tests/test_docs.py (100%)
 create mode 100644 lib/bs4/tests/test_html5lib.py
 rename {bs4 => lib/bs4}/tests/test_htmlparser.py (61%)
 rename {bs4 => lib/bs4}/tests/test_lxml.py (69%)
 rename {bs4 => lib/bs4}/tests/test_soup.py (62%)
 rename {bs4 => lib/bs4}/tests/test_tree.py (80%)
 rename {cherrypy => lib/cherrypy}/LICENSE.txt (100%)
 rename {cherrypy => lib/cherrypy}/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpchecker.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpcompat.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpconfig.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpdispatch.py (100%)
 rename {cherrypy => lib/cherrypy}/_cperror.py (100%)
 rename {cherrypy => lib/cherrypy}/_cplogging.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpmodpy.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpnative_server.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpreqbody.py (100%)
 rename {cherrypy => lib/cherrypy}/_cprequest.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpserver.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpthreadinglocal.py (100%)
 rename {cherrypy => lib/cherrypy}/_cptools.py (100%)
 rename {cherrypy => lib/cherrypy}/_cptree.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpwsgi.py (100%)
 rename {cherrypy => lib/cherrypy}/_cpwsgi_server.py (100%)
 rename {cherrypy => lib/cherrypy}/cherryd (100%)
 rename {cherrypy => lib/cherrypy}/favicon.ico (100%)
 rename {cherrypy => lib/cherrypy}/lib/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/auth.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/auth_basic.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/auth_digest.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/caching.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/covercp.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/cpstats.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/cptools.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/encoding.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/http.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/httpauth.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/httputil.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/jsontools.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/profiler.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/reprconf.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/sessions.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/static.py (100%)
 rename {cherrypy => lib/cherrypy}/lib/xmlrpc.py (100%)
 rename {cherrypy => lib/cherrypy}/process/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/process/plugins.py (100%)
 rename {cherrypy => lib/cherrypy}/process/servers.py (100%)
 rename {cherrypy => lib/cherrypy}/process/win32.py (100%)
 rename {cherrypy => lib/cherrypy}/process/wspbus.py (100%)
 rename {cherrypy => lib/cherrypy}/scaffold/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/scaffold/apache-fcgi.conf (100%)
 rename {cherrypy => lib/cherrypy}/scaffold/example.conf (100%)
 rename {cherrypy => lib/cherrypy}/scaffold/site.conf (100%)
 rename {cherrypy => lib/cherrypy}/scaffold/static/made_with_cherrypy_small.png (100%)
 rename {cherrypy => lib/cherrypy}/test/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/test/_test_decorators.py (100%)
 rename {cherrypy => lib/cherrypy}/test/_test_states_demo.py (100%)
 rename {cherrypy => lib/cherrypy}/test/benchmark.py (100%)
 rename {cherrypy => lib/cherrypy}/test/checkerdemo.py (100%)
 rename {cherrypy => lib/cherrypy}/test/fastcgi.conf (100%)
 rename {cherrypy => lib/cherrypy}/test/fcgi.conf (100%)
 rename {cherrypy => lib/cherrypy}/test/helper.py (100%)
 rename {cherrypy => lib/cherrypy}/test/logtest.py (100%)
 rename {cherrypy => lib/cherrypy}/test/modfastcgi.py (100%)
 rename {cherrypy => lib/cherrypy}/test/modfcgid.py (100%)
 rename {cherrypy => lib/cherrypy}/test/modpy.py (100%)
 rename {cherrypy => lib/cherrypy}/test/modwsgi.py (100%)
 rename {cherrypy => lib/cherrypy}/test/native-server.ini (100%)
 rename {cherrypy => lib/cherrypy}/test/sessiondemo.py (100%)
 rename {cherrypy => lib/cherrypy}/test/static/dirback.jpg (100%)
 rename {cherrypy => lib/cherrypy}/test/static/index.html (100%)
 rename {cherrypy => lib/cherrypy}/test/style.css (100%)
 rename {cherrypy => lib/cherrypy}/test/test.pem (100%)
 rename {cherrypy => lib/cherrypy}/test/test_auth_basic.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_auth_digest.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_bus.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_caching.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_config.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_config_server.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_conn.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_core.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_dynamicobjectmapping.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_encoding.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_etags.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_http.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_httpauth.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_httplib.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_json.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_logging.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_mime.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_misc_tools.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_objectmapping.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_proxy.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_refleaks.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_request_obj.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_routes.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_session.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_sessionauthenticate.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_states.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_static.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_tools.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_tutorials.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_virtualhost.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_wsgi_ns.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_wsgi_vhost.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_wsgiapps.py (100%)
 rename {cherrypy => lib/cherrypy}/test/test_xmlrpc.py (100%)
 rename {cherrypy => lib/cherrypy}/test/webtest.py (100%)
 rename {cherrypy => lib/cherrypy}/wsgiserver/__init__.py (100%)
 rename {cherrypy => lib/cherrypy}/wsgiserver/ssl_builtin.py (100%)
 rename {cherrypy => lib/cherrypy}/wsgiserver/ssl_pyopenssl.py (100%)
 mode change 100755 => 100644 lib/httplib2/__init__.py
 mode change 100755 => 100644 lib/httplib2/iri2uri.py
 create mode 100644 lib/httplib2/socks.py
 rename {mako => lib/mako}/__init__.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/_ast_util.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/ast.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/cache.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/codegen.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/exceptions.py (100%)
 mode change 100755 => 100644
 rename lib/{pystun/tests => mako/ext}/__init__.py (100%)
 rename {mako => lib/mako}/ext/autohandler.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/ext/babelplugin.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/ext/preprocessors.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/ext/pygmentplugin.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/ext/turbogears.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/filters.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/lexer.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/lookup.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/parsetree.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/pygen.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/pyparser.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/runtime.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/template.py (100%)
 mode change 100755 => 100644
 rename {mako => lib/mako}/util.py (100%)
 mode change 100755 => 100644
 create mode 100644 lib/markupsafe/__init__.py
 create mode 100644 lib/markupsafe/_compat.py
 create mode 100644 lib/markupsafe/_constants.py
 create mode 100644 lib/markupsafe/_native.py
 create mode 100644 lib/markupsafe/_speedups.c
 create mode 100644 lib/markupsafe/tests.py
 delete mode 100644 lib/pystun/README.rst
 delete mode 100755 lib/requests/LICENSE
 delete mode 100755 lib/requests/NOTICE
 delete mode 100755 lib/requests/README.rst
 mode change 100755 => 100644 lib/requests/__init__.py
 mode change 100755 => 100644 lib/requests/adapters.py
 mode change 100755 => 100644 lib/requests/api.py
 mode change 100755 => 100644 lib/requests/auth.py
 mode change 100755 => 100644 lib/requests/cacert.pem
 mode change 100755 => 100644 lib/requests/certs.py
 mode change 100755 => 100644 lib/requests/compat.py
 mode change 100755 => 100644 lib/requests/cookies.py
 mode change 100755 => 100644 lib/requests/exceptions.py
 mode change 100755 => 100644 lib/requests/hooks.py
 mode change 100755 => 100644 lib/requests/models.py
 delete mode 100755 lib/requests/packages/README.rst
 mode change 100755 => 100644 lib/requests/packages/__init__.py
 mode change 100755 => 100644 lib/requests/packages/chardet/__init__.py
 mode change 100755 => 100644 lib/requests/packages/chardet/big5freq.py
 mode change 100755 => 100644 lib/requests/packages/chardet/big5prober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/chardetect.py
 mode change 100755 => 100644 lib/requests/packages/chardet/chardistribution.py
 mode change 100755 => 100644 lib/requests/packages/chardet/charsetgroupprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/charsetprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/codingstatemachine.py
 mode change 100755 => 100644 lib/requests/packages/chardet/compat.py
 mode change 100755 => 100644 lib/requests/packages/chardet/constants.py
 mode change 100755 => 100644 lib/requests/packages/chardet/cp949prober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/escprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/escsm.py
 mode change 100755 => 100644 lib/requests/packages/chardet/eucjpprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/euckrfreq.py
 mode change 100755 => 100644 lib/requests/packages/chardet/euckrprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/euctwfreq.py
 mode change 100755 => 100644 lib/requests/packages/chardet/euctwprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/gb2312freq.py
 mode change 100755 => 100644 lib/requests/packages/chardet/gb2312prober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/hebrewprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/jisfreq.py
 mode change 100755 => 100644 lib/requests/packages/chardet/jpcntx.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langbulgarianmodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langcyrillicmodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langgreekmodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langhebrewmodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langhungarianmodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/langthaimodel.py
 mode change 100755 => 100644 lib/requests/packages/chardet/latin1prober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/mbcharsetprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/mbcsgroupprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/mbcssm.py
 mode change 100755 => 100644 lib/requests/packages/chardet/sbcharsetprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/sbcsgroupprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/sjisprober.py
 mode change 100755 => 100644 lib/requests/packages/chardet/universaldetector.py
 mode change 100755 => 100644 lib/requests/packages/chardet/utf8prober.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/__init__.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/_collections.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/connection.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/connectionpool.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/contrib/__init__.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/contrib/ntlmpool.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/contrib/pyopenssl.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/exceptions.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/fields.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/filepost.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/packages/__init__.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/packages/ordered_dict.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/packages/six.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/packages/ssl_match_hostname/__init__.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/packages/ssl_match_hostname/_implementation.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/poolmanager.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/request.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/response.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/__init__.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/connection.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/request.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/response.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/retry.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/ssl_.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/timeout.py
 mode change 100755 => 100644 lib/requests/packages/urllib3/util/url.py
 mode change 100755 => 100644 lib/requests/sessions.py
 mode change 100755 => 100644 lib/requests/status_codes.py
 mode change 100755 => 100644 lib/requests/structures.py
 mode change 100755 => 100644 lib/requests/utils.py
 mode change 100755 => 100644 lib/simplejson/__init__.py
 delete mode 100755 lib/simplejson/_speedups.c
 mode change 100755 => 100644 lib/simplejson/decoder.py
 mode change 100755 => 100644 lib/simplejson/encoder.py
 mode change 100755 => 100644 lib/simplejson/ordered_dict.py
 mode change 100755 => 100644 lib/simplejson/scanner.py
 create mode 100644 lib/simplejson/tests/__init__.py
 create mode 100644 lib/simplejson/tests/test_bigint_as_string.py
 create mode 100644 lib/simplejson/tests/test_check_circular.py
 create mode 100644 lib/simplejson/tests/test_decimal.py
 create mode 100644 lib/simplejson/tests/test_decode.py
 create mode 100644 lib/simplejson/tests/test_default.py
 create mode 100644 lib/simplejson/tests/test_dump.py
 create mode 100644 lib/simplejson/tests/test_encode_basestring_ascii.py
 create mode 100644 lib/simplejson/tests/test_encode_for_html.py
 create mode 100644 lib/simplejson/tests/test_errors.py
 create mode 100644 lib/simplejson/tests/test_fail.py
 create mode 100644 lib/simplejson/tests/test_float.py
 create mode 100644 lib/simplejson/tests/test_indent.py
 create mode 100644 lib/simplejson/tests/test_namedtuple.py
 create mode 100644 lib/simplejson/tests/test_pass1.py
 create mode 100644 lib/simplejson/tests/test_pass2.py
 create mode 100644 lib/simplejson/tests/test_pass3.py
 create mode 100644 lib/simplejson/tests/test_recursion.py
 create mode 100644 lib/simplejson/tests/test_scanstring.py
 create mode 100644 lib/simplejson/tests/test_separators.py
 create mode 100644 lib/simplejson/tests/test_speedups.py
 create mode 100644 lib/simplejson/tests/test_tuple.py
 create mode 100644 lib/simplejson/tests/test_unicode.py
 mode change 100755 => 100644 lib/simplejson/tool.py
 rename lib/{transmissionrpc => }/six.py (100%)
 rename lib/{pystun => stun}/__init__.py (100%)
 rename lib/{pystun => stun}/cli.py (100%)
 rename {mako/ext => lib/tests}/__init__.py (100%)
 mode change 100755 => 100644
 rename lib/{pystun => }/tests/test_cli.py (100%)

diff --git a/Mylar.py b/Mylar.py
index f49d1ffb..ab6e7f15 100644
--- a/Mylar.py
+++ b/Mylar.py
@@ -21,16 +21,14 @@
 import time
 import threading
 import signal
 
-from lib.configobj import ConfigObj
+sys.path.insert(1, os.path.join(os.path.dirname(__file__), 'lib'))
 
 import mylar
 
 from mylar import webstart, logger, filechecker, versioncheck
 
-try:
-    import argparse
-except ImportError:
-    import lib.argparse as argparse
+import argparse
+
 if ( sys.platform == 'win32' and sys.executable.split( '\\' )[-1] == 'pythonw.exe'):
     sys.stdout = open(os.devnull, "w")
@@ -198,6 +196,7 @@ def main():
             i += 1
 
+    from configobj import ConfigObj
     mylar.CFG = ConfigObj(mylar.CONFIG_FILE, encoding='utf-8')
 
     # Rename the main thread
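A note on the hunk above: the new path entry goes in at index 1 rather than
0, so sys.path[0] (the directory containing Mylar.py) still takes precedence
while everything under lib/ outranks any copies installed in site-packages.
A quick, illustrative way to confirm which copy wins (not part of the patch):

    import cherrypy
    print(cherrypy.__file__)  # should point into Mylar's lib/ directory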
diff --git a/bs4/__init__.py b/bs4/__init__.py
deleted file mode 100644
index d0fd1ff1..00000000
--- a/bs4/__init__.py
+++ /dev/null
@@ -1,355 +0,0 @@
-"""Beautiful Soup
-Elixir and Tonic
-"The Screen-Scraper's Friend"
-http://www.crummy.com/software/BeautifulSoup/
-
-Beautiful Soup uses a pluggable XML or HTML parser to parse a
-(possibly invalid) document into a tree representation. Beautiful Soup
-provides provides methods and Pythonic idioms that make it easy to
-navigate, search, and modify the parse tree.
-
-Beautiful Soup works with Python 2.6 and up. It works better if lxml
-and/or html5lib is installed.
-
-For more than you ever wanted to know about Beautiful Soup, see the
-documentation:
-http://www.crummy.com/software/BeautifulSoup/bs4/doc/
-"""
-
-__author__ = "Leonard Richardson (leonardr@segfault.org)"
-__version__ = "4.1.1"
-__copyright__ = "Copyright (c) 2004-2012 Leonard Richardson"
-__license__ = "MIT"
-
-__all__ = ['BeautifulSoup']
-
-import re
-import warnings
-
-from .builder import builder_registry
-from .dammit import UnicodeDammit
-from .element import (
-    CData,
-    Comment,
-    DEFAULT_OUTPUT_ENCODING,
-    Declaration,
-    Doctype,
-    NavigableString,
-    PageElement,
-    ProcessingInstruction,
-    ResultSet,
-    SoupStrainer,
-    Tag,
-    )
-
-# The very first thing we do is give a useful error if someone is
-# running this code under Python 3 without converting it.
-syntax_error = u'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work. You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
-
-class BeautifulSoup(Tag):
-    """
-    This class defines the basic interface called by the tree builders.
-
-    These methods will be called by the parser:
-      reset()
-      feed(markup)
-
-    The tree builder may call these methods from its feed() implementation:
-      handle_starttag(name, attrs) # See note about return value
-      handle_endtag(name)
-      handle_data(data) # Appends to the current data node
-      endData(containerClass=NavigableString) # Ends the current data node
-
-    No matter how complicated the underlying parser is, you should be
-    able to build a tree using 'start tag' events, 'end tag' events,
-    'data' events, and "done with data" events.
-
-    If you encounter an empty-element tag (aka a self-closing tag,
-    like HTML's <br> tag), call handle_starttag and then
-    handle_endtag.
-    """
-    ROOT_TAG_NAME = u'[document]'
-
-    # If the end-user gives no indication which tree builder they
-    # want, look for one with these features.
-    DEFAULT_BUILDER_FEATURES = ['html', 'fast']
-
-    # Used when determining whether a text node is all whitespace and
-    # can be replaced with a single space. A text node that contains
-    # fancy Unicode spaces (usually non-breaking) should be left
-    # alone.
-    STRIP_ASCII_SPACES = {9: None, 10: None, 12: None, 13: None, 32: None, }
-
-    def __init__(self, markup="", features=None, builder=None,
-                 parse_only=None, from_encoding=None, **kwargs):
-        """The Soup object is initialized as the 'root tag', and the
-        provided markup (which can be a string or a file-like object)
-        is fed into the underlying parser."""
-
-        if 'convertEntities' in kwargs:
-            warnings.warn(
-                "BS4 does not respect the convertEntities argument to the "
-                "BeautifulSoup constructor. Entities are always converted "
-                "to Unicode characters.")
-
-        if 'markupMassage' in kwargs:
-            del kwargs['markupMassage']
-            warnings.warn(
-                "BS4 does not respect the markupMassage argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for any necessary markup massage.")
-
-        if 'smartQuotesTo' in kwargs:
-            del kwargs['smartQuotesTo']
-            warnings.warn(
-                "BS4 does not respect the smartQuotesTo argument to the "
-                "BeautifulSoup constructor. Smart quotes are always converted "
-                "to Unicode characters.")
-
-        if 'selfClosingTags' in kwargs:
-            del kwargs['selfClosingTags']
-            warnings.warn(
-                "BS4 does not respect the selfClosingTags argument to the "
-                "BeautifulSoup constructor. The tree builder is responsible "
-                "for understanding self-closing tags.")
-
-        if 'isHTML' in kwargs:
-            del kwargs['isHTML']
-            warnings.warn(
-                "BS4 does not respect the isHTML argument to the "
-                "BeautifulSoup constructor. You can pass in features='html' "
-                "or features='xml' to get a builder capable of handling "
-                "one or the other.")
-
-        def deprecated_argument(old_name, new_name):
-            if old_name in kwargs:
-                warnings.warn(
-                    'The "%s" argument to the BeautifulSoup constructor '
-                    'has been renamed to "%s."' % (old_name, new_name))
-                value = kwargs[old_name]
-                del kwargs[old_name]
-                return value
-            return None
-
-        parse_only = parse_only or deprecated_argument(
-            "parseOnlyThese", "parse_only")
-
-        from_encoding = from_encoding or deprecated_argument(
-            "fromEncoding", "from_encoding")
-
-        if len(kwargs) > 0:
-            arg = kwargs.keys().pop()
-            raise TypeError(
-                "__init__() got an unexpected keyword argument '%s'" % arg)
-
-        if builder is None:
-            if isinstance(features, basestring):
-                features = [features]
-            if features is None or len(features) == 0:
-                features = self.DEFAULT_BUILDER_FEATURES
-            builder_class = builder_registry.lookup(*features)
-            if builder_class is None:
-                raise ValueError(
-                    "Couldn't find a tree builder with the features you "
-                    "requested: %s. Do you need to install a parser library?"
-                    % ",".join(features))
-            builder = builder_class()
-        self.builder = builder
-        self.is_xml = builder.is_xml
-        self.builder.soup = self
-
-        self.parse_only = parse_only
-
-        self.reset()
-
-        if hasattr(markup, 'read'):        # It's a file-type object.
-            markup = markup.read()
-        (self.markup, self.original_encoding, self.declared_html_encoding,
-         self.contains_replacement_characters) = (
-            self.builder.prepare_markup(markup, from_encoding))
-
-        try:
-            self._feed()
-        except StopParsing:
-            pass
-
-        # Clear out the markup and remove the builder's circular
-        # reference to this object.
-        self.markup = None
-        self.builder.soup = None
-
-    def _feed(self):
-        # Convert the document to Unicode.
-        self.builder.reset()
-
-        self.builder.feed(self.markup)
-        # Close out any unfinished strings and close all the open tags.
-        self.endData()
-        while self.currentTag.name != self.ROOT_TAG_NAME:
-            self.popTag()
-
-    def reset(self):
-        Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
-        self.hidden = 1
-        self.builder.reset()
-        self.currentData = []
-        self.currentTag = None
-        self.tagStack = []
-        self.pushTag(self)
-
-    def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
-        """Create a new tag associated with this soup."""
-        return Tag(None, self.builder, name, namespace, nsprefix, attrs)
-
-    def new_string(self, s):
-        """Create a new NavigableString associated with this soup."""
-        navigable = NavigableString(s)
-        navigable.setup()
-        return navigable
-
-    def insert_before(self, successor):
-        raise ValueError("BeautifulSoup objects don't support insert_before().")
-
-    def insert_after(self, successor):
-        raise ValueError("BeautifulSoup objects don't support insert_after().")
-
-    def popTag(self):
-        tag = self.tagStack.pop()
-        #print "Pop", tag.name
-        if self.tagStack:
-            self.currentTag = self.tagStack[-1]
-        return self.currentTag
-
-    def pushTag(self, tag):
-        #print "Push", tag.name
-        if self.currentTag:
-            self.currentTag.contents.append(tag)
-        self.tagStack.append(tag)
-        self.currentTag = self.tagStack[-1]
-
-    def endData(self, containerClass=NavigableString):
-        if self.currentData:
-            currentData = u''.join(self.currentData)
-            if (currentData.translate(self.STRIP_ASCII_SPACES) == '' and
-                not set([tag.name for tag in self.tagStack]).intersection(
-                    self.builder.preserve_whitespace_tags)):
-                if '\n' in currentData:
-                    currentData = '\n'
-                else:
-                    currentData = ' '
-            self.currentData = []
-            if self.parse_only and len(self.tagStack) <= 1 and \
-                   (not self.parse_only.text or \
-                    not self.parse_only.search(currentData)):
-                return
-            o = containerClass(currentData)
-            self.object_was_parsed(o)
-
-    def object_was_parsed(self, o):
-        """Add an object to the parse tree."""
-        o.setup(self.currentTag, self.previous_element)
-        if self.previous_element:
-            self.previous_element.next_element = o
-        self.previous_element = o
-        self.currentTag.contents.append(o)
-
-    def _popToTag(self, name, nsprefix=None, inclusivePop=True):
-        """Pops the tag stack up to and including the most recent
-        instance of the given tag. If inclusivePop is false, pops the tag
-        stack up to but *not* including the most recent instqance of
-        the given tag."""
-        #print "Popping to %s" % name
-        if name == self.ROOT_TAG_NAME:
-            return
-
-        numPops = 0
-        mostRecentTag = None
-
-        for i in range(len(self.tagStack) - 1, 0, -1):
-            if (name == self.tagStack[i].name
-                and nsprefix == self.tagStack[i].prefix):
-                numPops = len(self.tagStack) - i
-                break
-        if not inclusivePop:
-            numPops = numPops - 1
-
-        for i in range(0, numPops):
-            mostRecentTag = self.popTag()
-        return mostRecentTag
-
-    def handle_starttag(self, name, namespace, nsprefix, attrs):
-        """Push a start tag on to the stack.
-
-        If this method returns None, the tag was rejected by the
-        SoupStrainer. You should proceed as if the tag had not occured
-        in the document. For instance, if this was a self-closing tag,
-        don't call handle_endtag.
-        """
-
-        # print "Start tag %s: %s" % (name, attrs)
-        self.endData()
-
-        if (self.parse_only and len(self.tagStack) <= 1
-            and (self.parse_only.text
-                 or not self.parse_only.search_tag(name, attrs))):
-            return None
-
-        tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
-                  self.currentTag, self.previous_element)
-        if tag is None:
-            return tag
-        if self.previous_element:
-            self.previous_element.next_element = tag
-        self.previous_element = tag
-        self.pushTag(tag)
-        return tag
-
-    def handle_endtag(self, name, nsprefix=None):
-        #print "End tag: " + name
-        self.endData()
-        self._popToTag(name, nsprefix)
-
-    def handle_data(self, data):
-        self.currentData.append(data)
-
-    def decode(self, pretty_print=False,
-               eventual_encoding=DEFAULT_OUTPUT_ENCODING,
-               formatter="minimal"):
-        """Returns a string or Unicode representation of this document.
-        To get Unicode, pass None for encoding."""
-
-        if self.is_xml:
-            # Print the XML declaration
-            encoding_part = ''
-            if eventual_encoding != None:
-                encoding_part = ' encoding="%s"' % eventual_encoding
-            prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
-        else:
-            prefix = u''
-        if not pretty_print:
-            indent_level = None
-        else:
-            indent_level = 0
-        return prefix + super(BeautifulSoup, self).decode(
-            indent_level, eventual_encoding, formatter)
-
-class BeautifulStoneSoup(BeautifulSoup):
-    """Deprecated interface to an XML parser."""
-
-    def __init__(self, *args, **kwargs):
-        kwargs['features'] = 'xml'
-        warnings.warn(
-            'The BeautifulStoneSoup class is deprecated. Instead of using '
-            'it, pass features="xml" into the BeautifulSoup constructor.')
-        super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
-
-
-class StopParsing(Exception):
-    pass
-
-
-#By default, act as an HTML pretty-printer.
-if __name__ == '__main__':
-    import sys
-    soup = BeautifulSoup(sys.stdin)
-    print soup.prettify()
diff --git a/bs4/builder/_html5lib.py b/bs4/builder/_html5lib.py
deleted file mode 100644
index 6001e386..00000000
--- a/bs4/builder/_html5lib.py
+++ /dev/null
@@ -1,222 +0,0 @@
-__all__ = [
-    'HTML5TreeBuilder',
-    ]
-
-import warnings
-from bs4.builder import (
-    PERMISSIVE,
-    HTML,
-    HTML_5,
-    HTMLTreeBuilder,
-    )
-from bs4.element import NamespacedAttribute
-import html5lib
-from html5lib.constants import namespaces
-from bs4.element import (
-    Comment,
-    Doctype,
-    NavigableString,
-    Tag,
-    )
-
-class HTML5TreeBuilder(HTMLTreeBuilder):
-    """Use html5lib to build a tree."""
-
-    features = ['html5lib', PERMISSIVE, HTML_5, HTML]
-
-    def prepare_markup(self, markup, user_specified_encoding):
-        # Store the user-specified encoding for use later on.
-        self.user_specified_encoding = user_specified_encoding
-        return markup, None, None, False
-
-    # These methods are defined by Beautiful Soup.
-    def feed(self, markup):
-        if self.soup.parse_only is not None:
-            warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
-        parser = html5lib.HTMLParser(tree=self.create_treebuilder)
-        doc = parser.parse(markup, encoding=self.user_specified_encoding)
-
-        # Set the character encoding detected by the tokenizer.
-        if isinstance(markup, unicode):
-            # We need to special-case this because html5lib sets
-            # charEncoding to UTF-8 if it gets Unicode input.
-            doc.original_encoding = None
-        else:
-            doc.original_encoding = parser.tokenizer.stream.charEncoding[0]
-
-    def create_treebuilder(self, namespaceHTMLElements):
-        self.underlying_builder = TreeBuilderForHtml5lib(
-            self.soup, namespaceHTMLElements)
-        return self.underlying_builder
-
-    def test_fragment_to_document(self, fragment):
-        """See `TreeBuilder`."""
-        return u'<html><head></head><body>%s</body></html>' % fragment
-
-
-class TreeBuilderForHtml5lib(html5lib.treebuilders._base.TreeBuilder):
-
-    def __init__(self, soup, namespaceHTMLElements):
-        self.soup = soup
-        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
-
-    def documentClass(self):
-        self.soup.reset()
-        return Element(self.soup, self.soup, None)
-
-    def insertDoctype(self, token):
-        name = token["name"]
-        publicId = token["publicId"]
-        systemId = token["systemId"]
-
-        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
-        self.soup.object_was_parsed(doctype)
-
-    def elementClass(self, name, namespace):
-        tag = self.soup.new_tag(name, namespace)
-        return Element(tag, self.soup, namespace)
-
-    def commentClass(self, data):
-        return TextNode(Comment(data), self.soup)
-
-    def fragmentClass(self):
-        self.soup = BeautifulSoup("")
-        self.soup.name = "[document_fragment]"
-        return Element(self.soup, self.soup, None)
-
-    def appendChild(self, node):
-        # XXX This code is not covered by the BS4 tests.
-        self.soup.append(node.element)
-
-    def getDocument(self):
-        return self.soup
-
-    def getFragment(self):
-        return html5lib.treebuilders._base.TreeBuilder.getFragment(self).element
-
-class AttrList(object):
-    def __init__(self, element):
-        self.element = element
-        self.attrs = dict(self.element.attrs)
-    def __iter__(self):
-        return list(self.attrs.items()).__iter__()
-    def __setitem__(self, name, value):
-        "set attr", name, value
-        self.element[name] = value
-    def items(self):
-        return list(self.attrs.items())
-    def keys(self):
-        return list(self.attrs.keys())
-    def __len__(self):
-        return len(self.attrs)
-    def __getitem__(self, name):
-        return self.attrs[name]
-    def __contains__(self, name):
-        return name in list(self.attrs.keys())
-
-
-class Element(html5lib.treebuilders._base.Node):
-    def __init__(self, element, soup, namespace):
-        html5lib.treebuilders._base.Node.__init__(self, element.name)
-        self.element = element
-        self.soup = soup
-        self.namespace = namespace
-
-    def appendChild(self, node):
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[-1].__class__ == NavigableString):
-            # Concatenate new text onto old text node
-            # XXX This has O(n^2) performance, for input like
-            # "a</a>a</a>a</a>..."
-            old_element = self.element.contents[-1]
-            new_element = self.soup.new_string(old_element + node.element)
-            old_element.replace_with(new_element)
-        else:
-            self.element.append(node.element)
-            node.parent = self
-
-    def getAttributes(self):
-        return AttrList(self.element)
-
-    def setAttributes(self, attributes):
-        if attributes is not None and len(attributes) > 0:
-
-            converted_attributes = []
-            for name, value in list(attributes.items()):
-                if isinstance(name, tuple):
-                    new_name = NamespacedAttribute(*name)
-                    del attributes[name]
-                    attributes[new_name] = value
-
-            self.soup.builder._replace_cdata_list_attribute_values(
-                self.name, attributes)
-            for name, value in attributes.items():
-                self.element[name] = value
-
-            # The attributes may contain variables that need substitution.
-            # Call set_up_substitutions manually.
-            #
-            # The Tag constructor called this method when the Tag was created,
-            # but we just set/changed the attributes, so call it again.
-            self.soup.builder.set_up_substitutions(self.element)
-    attributes = property(getAttributes, setAttributes)
-
-    def insertText(self, data, insertBefore=None):
-        text = TextNode(self.soup.new_string(data), self.soup)
-        if insertBefore:
-            self.insertBefore(text, insertBefore)
-        else:
-            self.appendChild(text)
-
-    def insertBefore(self, node, refNode):
-        index = self.element.index(refNode.element)
-        if (node.element.__class__ == NavigableString and self.element.contents
-            and self.element.contents[index-1].__class__ == NavigableString):
-            # (See comments in appendChild)
-            old_node = self.element.contents[index-1]
-            new_str = self.soup.new_string(old_node + node.element)
-            old_node.replace_with(new_str)
-        else:
-            self.element.insert(index, node.element)
-            node.parent = self
-
-    def removeChild(self, node):
-        node.element.extract()
-
-    def reparentChildren(self, newParent):
-        while self.element.contents:
-            child = self.element.contents[0]
-            child.extract()
-            if isinstance(child, Tag):
-                newParent.appendChild(
-                    Element(child, self.soup, namespaces["html"]))
-            else:
-                newParent.appendChild(
-                    TextNode(child, self.soup))
-
-    def cloneNode(self):
-        tag = self.soup.new_tag(self.element.name, self.namespace)
-        node = Element(tag, self.soup, self.namespace)
-        for key,value in self.attributes:
-            node.attributes[key] = value
-        return node
-
-    def hasContent(self):
-        return self.element.contents
-
-    def getNameTuple(self):
-        if self.namespace == None:
-            return namespaces["html"], self.name
-        else:
-            return self.namespace, self.name
-
-    nameTuple = property(getNameTuple)
-
-class TextNode(Element):
-    def __init__(self, element, soup):
-        html5lib.treebuilders._base.Node.__init__(self, None)
-        self.element = element
-        self.soup = soup
-
-    def cloneNode(self):
-        raise NotImplementedError
diff --git a/bs4/tests/test_html5lib.py b/bs4/tests/test_html5lib.py
deleted file mode 100644
index f195f7d0..00000000
--- a/bs4/tests/test_html5lib.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""Tests to ensure that the html5lib tree builder generates good trees."""
-
-import warnings
-
-try:
-    from bs4.builder import HTML5TreeBuilder
-    HTML5LIB_PRESENT = True
-except ImportError, e:
-    HTML5LIB_PRESENT = False
-from bs4.element import SoupStrainer
-from bs4.testing import (
-    HTML5TreeBuilderSmokeTest,
-    SoupTest,
-    skipIf,
-)
-
-@skipIf(
-    not HTML5LIB_PRESENT,
-    "html5lib seems not to be present, not testing its tree builder.")
-class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
-    """See ``HTML5TreeBuilderSmokeTest``."""
-
-    @property
-    def default_builder(self):
-        return HTML5TreeBuilder()
-
-    def test_soupstrainer(self):
-        # The html5lib tree builder does not support SoupStrainers.
-        strainer = SoupStrainer("b")
-        markup = "<p>A <b>bold</b> statement.</p>"
-        with warnings.catch_warnings(record=True) as w:
-            soup = self.soup(markup, parse_only=strainer)
-        self.assertEqual(
-            soup.decode(), self.document_for(markup))
-
-        self.assertTrue(
-            "the html5lib tree builder doesn't support parse_only" in
-            str(w[0].message))
-
-    def test_correctly_nested_tables(self):
-        """html5lib inserts <tbody> tags where other parsers don't."""
-        markup = ('<table id="1">'
-                  '<tr>'
-                  "<td>Here's another table:"
-                  '<table id="2">'
-                  '<tr><td>foo</td></tr>'
-                  '</table></td>')
-
-        self.assertSoupEquals(
-            markup,
-            '<table id="1"><tr><td>Here\'s another table:'
-            '<table id="2"><tr><td>foo</td></tr></table>'
-            '</td></tr></table>')
-
-        self.assertSoupEquals(
-            "<table><thead><tr><td>Foo</td></tr></thead>"
-            "<tbody><tr><td>Bar</td></tr></tbody>"
-            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
") diff --git a/comictagger.py b/comictagger.py index 1932b46f..35250929 100755 --- a/comictagger.py +++ b/comictagger.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from lib.comictaggerlib.main import ctmain +from comictaggerlib.main import ctmain if __name__ == '__main__': ctmain() diff --git a/lib/apscheduler/__init__.py b/lib/apscheduler/__init__.py old mode 100755 new mode 100644 index 6b502147..11e93a1d --- a/lib/apscheduler/__init__.py +++ b/lib/apscheduler/__init__.py @@ -1,3 +1,3 @@ -version_info = (2, 0, 0, 'rc', 2) +version_info = (2, 0, 0) version = '.'.join(str(n) for n in version_info[:3]) release = version + ''.join(str(n) for n in version_info[3:]) diff --git a/lib/apscheduler/events.py b/lib/apscheduler/events.py old mode 100755 new mode 100644 diff --git a/lib/apscheduler/job.py b/lib/apscheduler/job.py old mode 100755 new mode 100644 index c863bc3b..868e7234 --- a/lib/apscheduler/job.py +++ b/lib/apscheduler/job.py @@ -5,7 +5,7 @@ Jobs represent scheduled tasks. from threading import Lock from datetime import timedelta -from lib.apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\ +from apscheduler.util import to_unicode, ref_to_obj, get_callable_name,\ obj_to_ref diff --git a/lib/apscheduler/jobstores/__init__.py b/lib/apscheduler/jobstores/__init__.py old mode 100755 new mode 100644 diff --git a/lib/apscheduler/jobstores/base.py b/lib/apscheduler/jobstores/base.py old mode 100755 new mode 100644 diff --git a/lib/apscheduler/jobstores/mongodb_store.py b/lib/apscheduler/jobstores/mongodb_store.py old mode 100755 new mode 100644 index d1093860..3f522c25 --- a/lib/apscheduler/jobstores/mongodb_store.py +++ b/lib/apscheduler/jobstores/mongodb_store.py @@ -3,8 +3,8 @@ Stores jobs in a MongoDB database. """ import logging -from lib.apscheduler.jobstores.base import JobStore -from lib.apscheduler.job import Job +from apscheduler.jobstores.base import JobStore +from apscheduler.job import Job try: import cPickle as pickle diff --git a/lib/apscheduler/jobstores/ram_store.py b/lib/apscheduler/jobstores/ram_store.py old mode 100755 new mode 100644 index 1c3c667e..85091fe8 --- a/lib/apscheduler/jobstores/ram_store.py +++ b/lib/apscheduler/jobstores/ram_store.py @@ -2,7 +2,7 @@ Stores jobs in an array in RAM. Provides no persistence support. """ -from lib.apscheduler.jobstores.base import JobStore +from apscheduler.jobstores.base import JobStore class RAMJobStore(JobStore): diff --git a/lib/apscheduler/jobstores/shelve_store.py b/lib/apscheduler/jobstores/shelve_store.py old mode 100755 new mode 100644 index f29d53cb..87c95f8f --- a/lib/apscheduler/jobstores/shelve_store.py +++ b/lib/apscheduler/jobstores/shelve_store.py @@ -7,9 +7,9 @@ import pickle import random import logging -from lib.apscheduler.jobstores.base import JobStore -from lib.apscheduler.job import Job -from lib.apscheduler.util import itervalues +from apscheduler.jobstores.base import JobStore +from apscheduler.job import Job +from apscheduler.util import itervalues logger = logging.getLogger(__name__) diff --git a/lib/apscheduler/jobstores/sqlalchemy_store.py b/lib/apscheduler/jobstores/sqlalchemy_store.py old mode 100755 new mode 100644 index c0fee127..8ece7e24 --- a/lib/apscheduler/jobstores/sqlalchemy_store.py +++ b/lib/apscheduler/jobstores/sqlalchemy_store.py @@ -4,8 +4,8 @@ Stores jobs in a database table using SQLAlchemy. 
 """
 import pickle
 import logging
 
-from lib.apscheduler.jobstores.base import JobStore
-from lib.apscheduler.job import Job
+from apscheduler.jobstores.base import JobStore
+from apscheduler.job import Job
 
 try:
     from sqlalchemy import *
diff --git a/lib/apscheduler/scheduler.py b/lib/apscheduler/scheduler.py
old mode 100755
new mode 100644
index 461cfea4..ee08ad8b
--- a/lib/apscheduler/scheduler.py
+++ b/lib/apscheduler/scheduler.py
@@ -9,12 +9,12 @@ from logging import getLogger
 import os
 import sys
 
-from lib.apscheduler.util import *
-from lib.apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
-from lib.apscheduler.jobstores.ram_store import RAMJobStore
-from lib.apscheduler.job import Job, MaxInstancesReachedError
-from lib.apscheduler.events import *
-from lib.apscheduler.threadpool import ThreadPool
+from apscheduler.util import *
+from apscheduler.triggers import SimpleTrigger, IntervalTrigger, CronTrigger
+from apscheduler.jobstores.ram_store import RAMJobStore
+from apscheduler.job import Job, MaxInstancesReachedError
+from apscheduler.events import *
+from apscheduler.threadpool import ThreadPool
 
 logger = getLogger(__name__)
 
diff --git a/lib/apscheduler/threadpool.py b/lib/apscheduler/threadpool.py
old mode 100755
new mode 100644
diff --git a/lib/apscheduler/triggers/__init__.py b/lib/apscheduler/triggers/__init__.py
old mode 100755
new mode 100644
index a40ece20..74a97884
--- a/lib/apscheduler/triggers/__init__.py
+++ b/lib/apscheduler/triggers/__init__.py
@@ -1,3 +1,3 @@
-from lib.apscheduler.triggers.cron import CronTrigger
-from lib.apscheduler.triggers.interval import IntervalTrigger
-from lib.apscheduler.triggers.simple import SimpleTrigger
+from apscheduler.triggers.cron import CronTrigger
+from apscheduler.triggers.interval import IntervalTrigger
+from apscheduler.triggers.simple import SimpleTrigger
diff --git a/lib/apscheduler/triggers/cron/__init__.py b/lib/apscheduler/triggers/cron/__init__.py
old mode 100755
new mode 100644
index 665d2dae..3f8d9a8f
--- a/lib/apscheduler/triggers/cron/__init__.py
+++ b/lib/apscheduler/triggers/cron/__init__.py
@@ -1,7 +1,7 @@
 from datetime import date, datetime
 
-from lib.apscheduler.triggers.cron.fields import *
-from lib.apscheduler.util import datetime_ceil, convert_to_datetime
+from apscheduler.triggers.cron.fields import *
+from apscheduler.util import datetime_ceil, convert_to_datetime
 
 
 class CronTrigger(object):
diff --git a/lib/apscheduler/triggers/cron/expressions.py b/lib/apscheduler/triggers/cron/expressions.py
old mode 100755
new mode 100644
index 646d6f3c..018c7a30
--- a/lib/apscheduler/triggers/cron/expressions.py
+++ b/lib/apscheduler/triggers/cron/expressions.py
@@ -5,7 +5,7 @@ This module contains the expressions applicable for CronTrigger's fields.
 from calendar import monthrange
 import re
 
-from lib.apscheduler.util import asint
+from apscheduler.util import asint
 
 __all__ = ('AllExpression', 'RangeExpression', 'WeekdayRangeExpression',
            'WeekdayPositionExpression')
diff --git a/lib/apscheduler/triggers/cron/fields.py b/lib/apscheduler/triggers/cron/fields.py
old mode 100755
new mode 100644
index 24cb1b31..ef970cc9
--- a/lib/apscheduler/triggers/cron/fields.py
+++ b/lib/apscheduler/triggers/cron/fields.py
@@ -5,7 +5,7 @@ fields.
 """
from calendar import monthrange -from lib.apscheduler.triggers.cron.expressions import * +from apscheduler.triggers.cron.expressions import * __all__ = ('MIN_VALUES', 'MAX_VALUES', 'DEFAULT_VALUES', 'BaseField', 'WeekField', 'DayOfMonthField', 'DayOfWeekField') diff --git a/lib/apscheduler/triggers/interval.py b/lib/apscheduler/triggers/interval.py old mode 100755 new mode 100644 index a7c1ee11..dd16d777 --- a/lib/apscheduler/triggers/interval.py +++ b/lib/apscheduler/triggers/interval.py @@ -1,7 +1,7 @@ from datetime import datetime, timedelta from math import ceil -from lib.apscheduler.util import convert_to_datetime, timedelta_seconds +from apscheduler.util import convert_to_datetime, timedelta_seconds class IntervalTrigger(object): diff --git a/lib/apscheduler/triggers/simple.py b/lib/apscheduler/triggers/simple.py old mode 100755 new mode 100644 index 702ed78b..ea61b3f1 --- a/lib/apscheduler/triggers/simple.py +++ b/lib/apscheduler/triggers/simple.py @@ -1,4 +1,4 @@ -from lib.apscheduler.util import convert_to_datetime +from apscheduler.util import convert_to_datetime class SimpleTrigger(object): diff --git a/lib/apscheduler/util.py b/lib/apscheduler/util.py old mode 100755 new mode 100644 index af28ae49..a49aaed8 --- a/lib/apscheduler/util.py +++ b/lib/apscheduler/util.py @@ -6,6 +6,7 @@ from datetime import date, datetime, timedelta from time import mktime import re import sys +from types import MethodType __all__ = ('asint', 'asbool', 'convert_to_datetime', 'timedelta_seconds', 'time_difference', 'datetime_ceil', 'combine_opts', @@ -108,7 +109,7 @@ def datetime_ceil(dateval): """ if dateval.microsecond > 0: return dateval + timedelta(seconds=1, - microseconds=-dateval.microsecond) + microseconds= -dateval.microsecond) return dateval @@ -137,41 +138,64 @@ def get_callable_name(func): """ Returns the best available display name for the given function/callable. """ - name = func.__module__ - if hasattr(func, '__self__') and func.__self__: - name += '.' + func.__self__.__name__ - elif hasattr(func, 'im_self') and func.im_self: # py2.4, 2.5 - name += '.' + func.im_self.__name__ - if hasattr(func, '__name__'): - name += '.' + func.__name__ - return name + f_self = getattr(func, '__self__', None) or getattr(func, 'im_self', None) + + if f_self and hasattr(func, '__name__'): + if isinstance(f_self, type): + # class method + return '%s.%s' % (f_self.__name__, func.__name__) + # bound method + return '%s.%s' % (f_self.__class__.__name__, func.__name__) + + if hasattr(func, '__call__'): + if hasattr(func, '__name__'): + # function, unbound method or a class with a __call__ method + return func.__name__ + # instance of a class with a __call__ method + return func.__class__.__name__ + + raise TypeError('Unable to determine a name for %s -- ' + 'maybe it is not a callable?' % repr(func)) def obj_to_ref(obj): """ Returns the path to the given object. """ - ref = '%s:%s' % (obj.__module__, obj.__name__) + ref = '%s:%s' % (obj.__module__, get_callable_name(obj)) try: obj2 = ref_to_obj(ref) - except AttributeError: - pass - else: - if obj2 == obj: - return ref - - raise ValueError('Only module level objects are supported') + if obj != obj2: + raise ValueError + except Exception: + raise ValueError('Cannot determine the reference to %s' % repr(obj)) + + return ref def ref_to_obj(ref): """ Returns the object pointed to by ``ref``. 
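For example (an illustrative call, not from the upstream docs): ref_to_obj('logging.handlers:RotatingFileHandler') imports logging.handlers and returns the RotatingFileHandler class, and obj_to_ref() turns the class back into the same 'module:object' string.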
""" + if not isinstance(ref, basestring): + raise TypeError('References must be strings') + if not ':' in ref: + raise ValueError('Invalid reference') + modulename, rest = ref.split(':', 1) - obj = __import__(modulename) - for name in modulename.split('.')[1:] + rest.split('.'): - obj = getattr(obj, name) - return obj + try: + obj = __import__(modulename) + except ImportError: + raise LookupError('Error resolving reference %s: ' + 'could not import module' % ref) + + try: + for name in modulename.split('.')[1:] + rest.split('.'): + obj = getattr(obj, name) + return obj + except Exception: + raise LookupError('Error resolving reference %s: ' + 'error looking up object' % ref) def maybe_ref(ref): @@ -191,14 +215,16 @@ def to_unicode(string, encoding='ascii'): """ if hasattr(string, 'decode'): return string.decode(encoding, 'ignore') - return string + return string # pragma: nocover if sys.version_info < (3, 0): # pragma: nocover iteritems = lambda d: d.iteritems() itervalues = lambda d: d.itervalues() xrange = xrange + basestring = basestring else: # pragma: nocover iteritems = lambda d: d.items() itervalues = lambda d: d.values() xrange = range + basestring = str diff --git a/lib/bs4/__init__.py b/lib/bs4/__init__.py new file mode 100644 index 00000000..aa818ae4 --- /dev/null +++ b/lib/bs4/__init__.py @@ -0,0 +1,529 @@ +"""Beautiful Soup +Elixir and Tonic +"The Screen-Scraper's Friend" +http://www.crummy.com/software/BeautifulSoup/ + +Beautiful Soup uses a pluggable XML or HTML parser to parse a +(possibly invalid) document into a tree representation. Beautiful Soup +provides methods and Pythonic idioms that make it easy to navigate, +search, and modify the parse tree. + +Beautiful Soup works with Python 2.7 and up. It works better if lxml +and/or html5lib is installed. + +For more than you ever wanted to know about Beautiful Soup, see the +documentation: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + +""" + +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +__author__ = "Leonard Richardson (leonardr@segfault.org)" +__version__ = "4.5.1" +__copyright__ = "Copyright (c) 2004-2016 Leonard Richardson" +__license__ = "MIT" + +__all__ = ['BeautifulSoup'] + +import os +import re +import traceback +import warnings + +from .builder import builder_registry, ParserRejectedMarkup +from .dammit import UnicodeDammit +from .element import ( + CData, + Comment, + DEFAULT_OUTPUT_ENCODING, + Declaration, + Doctype, + NavigableString, + PageElement, + ProcessingInstruction, + ResultSet, + SoupStrainer, + Tag, + ) + +# The very first thing we do is give a useful error if someone is +# running this code under Python 3 without converting it. +'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).' + +class BeautifulSoup(Tag): + """ + This class defines the basic interface called by the tree builders. 
+ + These methods will be called by the parser: + reset() + feed(markup) + + The tree builder may call these methods from its feed() implementation: + handle_starttag(name, attrs) # See note about return value + handle_endtag(name) + handle_data(data) # Appends to the current data node + endData(containerClass=NavigableString) # Ends the current data node + + No matter how complicated the underlying parser is, you should be + able to build a tree using 'start tag' events, 'end tag' events, + 'data' events, and "done with data" events. + + If you encounter an empty-element tag (aka a self-closing tag, + like HTML's <br>
tag), call handle_starttag and then + handle_endtag. + """ + ROOT_TAG_NAME = u'[document]' + + # If the end-user gives no indication which tree builder they + # want, look for one with these features. + DEFAULT_BUILDER_FEATURES = ['html', 'fast'] + + ASCII_SPACES = '\x20\x0a\x09\x0c\x0d' + + NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup([your markup])\n\nto this:\n\n BeautifulSoup([your markup], \"%(parser)s\")\n" + + def __init__(self, markup="", features=None, builder=None, + parse_only=None, from_encoding=None, exclude_encodings=None, + **kwargs): + """The Soup object is initialized as the 'root tag', and the + provided markup (which can be a string or a file-like object) + is fed into the underlying parser.""" + + if 'convertEntities' in kwargs: + warnings.warn( + "BS4 does not respect the convertEntities argument to the " + "BeautifulSoup constructor. Entities are always converted " + "to Unicode characters.") + + if 'markupMassage' in kwargs: + del kwargs['markupMassage'] + warnings.warn( + "BS4 does not respect the markupMassage argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for any necessary markup massage.") + + if 'smartQuotesTo' in kwargs: + del kwargs['smartQuotesTo'] + warnings.warn( + "BS4 does not respect the smartQuotesTo argument to the " + "BeautifulSoup constructor. Smart quotes are always converted " + "to Unicode characters.") + + if 'selfClosingTags' in kwargs: + del kwargs['selfClosingTags'] + warnings.warn( + "BS4 does not respect the selfClosingTags argument to the " + "BeautifulSoup constructor. The tree builder is responsible " + "for understanding self-closing tags.") + + if 'isHTML' in kwargs: + del kwargs['isHTML'] + warnings.warn( + "BS4 does not respect the isHTML argument to the " + "BeautifulSoup constructor. Suggest you use " + "features='lxml' for HTML and features='lxml-xml' for " + "XML.") + + def deprecated_argument(old_name, new_name): + if old_name in kwargs: + warnings.warn( + 'The "%s" argument to the BeautifulSoup constructor ' + 'has been renamed to "%s."' % (old_name, new_name)) + value = kwargs[old_name] + del kwargs[old_name] + return value + return None + + parse_only = parse_only or deprecated_argument( + "parseOnlyThese", "parse_only") + + from_encoding = from_encoding or deprecated_argument( + "fromEncoding", "from_encoding") + + if from_encoding and isinstance(markup, unicode): + warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.") + from_encoding = None + + if len(kwargs) > 0: + arg = kwargs.keys().pop() + raise TypeError( + "__init__() got an unexpected keyword argument '%s'" % arg) + + if builder is None: + original_features = features + if isinstance(features, basestring): + features = [features] + if features is None or len(features) == 0: + features = self.DEFAULT_BUILDER_FEATURES + builder_class = builder_registry.lookup(*features) + if builder_class is None: + raise FeatureNotFound( + "Couldn't find a tree builder with the features you " + "requested: %s. 
Do you need to install a parser library?" + % ",".join(features)) + builder = builder_class() + if not (original_features == builder.NAME or + original_features in builder.ALTERNATE_NAMES): + if builder.is_xml: + markup_type = "XML" + else: + markup_type = "HTML" + + caller = traceback.extract_stack()[0] + filename = caller[0] + line_number = caller[1] + warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict( + filename=filename, + line_number=line_number, + parser=builder.NAME, + markup_type=markup_type)) + + self.builder = builder + self.is_xml = builder.is_xml + self.known_xml = self.is_xml + self.builder.soup = self + + self.parse_only = parse_only + + if hasattr(markup, 'read'): # It's a file-type object. + markup = markup.read() + elif len(markup) <= 256 and ( + (isinstance(markup, bytes) and not b'<' in markup) + or (isinstance(markup, unicode) and not u'<' in markup) + ): + # Print out warnings for a couple beginner problems + # involving passing non-markup to Beautiful Soup. + # Beautiful Soup will still parse the input as markup, + # just in case that's what the user really wants. + if (isinstance(markup, unicode) + and not os.path.supports_unicode_filenames): + possible_filename = markup.encode("utf8") + else: + possible_filename = markup + is_file = False + try: + is_file = os.path.exists(possible_filename) + except Exception, e: + # This is almost certainly a problem involving + # characters not valid in filenames on this + # system. Just let it go. + pass + if is_file: + if isinstance(markup, unicode): + markup = markup.encode("utf8") + warnings.warn( + '"%s" looks like a filename, not markup. You should' + 'probably open this file and pass the filehandle into' + 'Beautiful Soup.' % markup) + self._check_markup_is_url(markup) + + for (self.markup, self.original_encoding, self.declared_html_encoding, + self.contains_replacement_characters) in ( + self.builder.prepare_markup( + markup, from_encoding, exclude_encodings=exclude_encodings)): + self.reset() + try: + self._feed() + break + except ParserRejectedMarkup: + pass + + # Clear out the markup and remove the builder's circular + # reference to this object. + self.markup = None + self.builder.soup = None + + def __copy__(self): + copy = type(self)( + self.encode('utf-8'), builder=self.builder, from_encoding='utf-8' + ) + + # Although we encoded the tree to UTF-8, that may not have + # been the encoding of the original markup. Set the copy's + # .original_encoding to reflect the original object's + # .original_encoding. + copy.original_encoding = self.original_encoding + return copy + + def __getstate__(self): + # Frequently a tree builder can't be pickled. + d = dict(self.__dict__) + if 'builder' in d and not self.builder.picklable: + d['builder'] = None + return d + + @staticmethod + def _check_markup_is_url(markup): + """ + Check if markup looks like it's actually a url and raise a warning + if so. Markup can be unicode or str (py2) / bytes (py3). + """ + if isinstance(markup, bytes): + space = b' ' + cant_start_with = (b"http:", b"https:") + elif isinstance(markup, unicode): + space = u' ' + cant_start_with = (u"http:", u"https:") + else: + return + + if any(markup.startswith(prefix) for prefix in cant_start_with): + if not space in markup: + if isinstance(markup, bytes): + decoded_markup = markup.decode('utf-8', 'replace') + else: + decoded_markup = markup + warnings.warn( + '"%s" looks like a URL. Beautiful Soup is not an' + ' HTTP client. 
You should probably use an HTTP client like' + ' requests to get the document behind the URL, and feed' + ' that document to Beautiful Soup.' % decoded_markup + ) + + def _feed(self): + # Convert the document to Unicode. + self.builder.reset() + + self.builder.feed(self.markup) + # Close out any unfinished strings and close all the open tags. + self.endData() + while self.currentTag.name != self.ROOT_TAG_NAME: + self.popTag() + + def reset(self): + Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME) + self.hidden = 1 + self.builder.reset() + self.current_data = [] + self.currentTag = None + self.tagStack = [] + self.preserve_whitespace_tag_stack = [] + self.pushTag(self) + + def new_tag(self, name, namespace=None, nsprefix=None, **attrs): + """Create a new tag associated with this soup.""" + return Tag(None, self.builder, name, namespace, nsprefix, attrs) + + def new_string(self, s, subclass=NavigableString): + """Create a new NavigableString associated with this soup.""" + return subclass(s) + + def insert_before(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_before().") + + def insert_after(self, successor): + raise NotImplementedError("BeautifulSoup objects don't support insert_after().") + + def popTag(self): + tag = self.tagStack.pop() + if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]: + self.preserve_whitespace_tag_stack.pop() + #print "Pop", tag.name + if self.tagStack: + self.currentTag = self.tagStack[-1] + return self.currentTag + + def pushTag(self, tag): + #print "Push", tag.name + if self.currentTag: + self.currentTag.contents.append(tag) + self.tagStack.append(tag) + self.currentTag = self.tagStack[-1] + if tag.name in self.builder.preserve_whitespace_tags: + self.preserve_whitespace_tag_stack.append(tag) + + def endData(self, containerClass=NavigableString): + if self.current_data: + current_data = u''.join(self.current_data) + # If whitespace is not preserved, and this string contains + # nothing but ASCII spaces, replace it with a single space + # or newline. + if not self.preserve_whitespace_tag_stack: + strippable = True + for i in current_data: + if i not in self.ASCII_SPACES: + strippable = False + break + if strippable: + if '\n' in current_data: + current_data = '\n' + else: + current_data = ' ' + + # Reset the data collector. + self.current_data = [] + + # Should we add this string to the tree at all? + if self.parse_only and len(self.tagStack) <= 1 and \ + (not self.parse_only.text or \ + not self.parse_only.search(current_data)): + return + + o = containerClass(current_data) + self.object_was_parsed(o) + + def object_was_parsed(self, o, parent=None, most_recent_element=None): + """Add an object to the parse tree.""" + parent = parent or self.currentTag + previous_element = most_recent_element or self._most_recent_element + + next_element = previous_sibling = next_sibling = None + if isinstance(o, Tag): + next_element = o.next_element + next_sibling = o.next_sibling + previous_sibling = o.previous_sibling + if not previous_element: + previous_element = o.previous_element + + o.setup(parent, previous_element, next_element, previous_sibling, next_sibling) + + self._most_recent_element = o + parent.contents.append(o) + + if parent.next_sibling: + # This node is being inserted into an element that has + # already been parsed. Deal with any dangling references. 
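+ # (An illustrative note, not part of the upstream source: with the + # html5lib builder, markup such as "<table><b>text</b></table>" gets + # <b> foster-parented in front of the <table>, so a node lands in a + # part of the tree that was parsed earlier and its neighbours' + # next/previous links must be recomputed below rather than taken + # from parse order.)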
+ index = len(parent.contents)-1 + while index >= 0: + if parent.contents[index] is o: + break + index -= 1 + else: + raise ValueError( + "Error building tree: supposedly %r was inserted " + "into %r after the fact, but I don't see it!" % ( + o, parent + ) + ) + if index == 0: + previous_element = parent + previous_sibling = None + else: + previous_element = previous_sibling = parent.contents[index-1] + if index == len(parent.contents)-1: + next_element = parent.next_sibling + next_sibling = None + else: + next_element = next_sibling = parent.contents[index+1] + + o.previous_element = previous_element + if previous_element: + previous_element.next_element = o + o.next_element = next_element + if next_element: + next_element.previous_element = o + o.next_sibling = next_sibling + if next_sibling: + next_sibling.previous_sibling = o + o.previous_sibling = previous_sibling + if previous_sibling: + previous_sibling.next_sibling = o + + def _popToTag(self, name, nsprefix=None, inclusivePop=True): + """Pops the tag stack up to and including the most recent + instance of the given tag. If inclusivePop is false, pops the tag + stack up to but *not* including the most recent instance of + the given tag.""" + #print "Popping to %s" % name + if name == self.ROOT_TAG_NAME: + # The BeautifulSoup object itself can never be popped. + return + + most_recently_popped = None + + stack_size = len(self.tagStack) + for i in range(stack_size - 1, 0, -1): + t = self.tagStack[i] + if (name == t.name and nsprefix == t.prefix): + if inclusivePop: + most_recently_popped = self.popTag() + break + most_recently_popped = self.popTag() + + return most_recently_popped + + def handle_starttag(self, name, namespace, nsprefix, attrs): + """Push a start tag on to the stack. + + If this method returns None, the tag was rejected by the + SoupStrainer. You should proceed as if the tag had not occurred + in the document. For instance, if this was a self-closing tag, + don't call handle_endtag. + """ + + # print "Start tag %s: %s" % (name, attrs) + self.endData() + + if (self.parse_only and len(self.tagStack) <= 1 + and (self.parse_only.text + or not self.parse_only.search_tag(name, attrs))): + return None + + tag = Tag(self, self.builder, name, namespace, nsprefix, attrs, + self.currentTag, self._most_recent_element) + if tag is None: + return tag + if self._most_recent_element: + self._most_recent_element.next_element = tag + self._most_recent_element = tag + self.pushTag(tag) + return tag + + def handle_endtag(self, name, nsprefix=None): + #print "End tag: " + name + self.endData() + self._popToTag(name, nsprefix) + + def handle_data(self, data): + self.current_data.append(data) + + def decode(self, pretty_print=False, + eventual_encoding=DEFAULT_OUTPUT_ENCODING, + formatter="minimal"): + """Returns a string or Unicode representation of this document.
+ To get Unicode, pass None for encoding.""" + + if self.is_xml: + # Print the XML declaration + encoding_part = '' + if eventual_encoding != None: + encoding_part = ' encoding="%s"' % eventual_encoding + prefix = u'<?xml version="1.0"%s?>\n' % encoding_part + else: + prefix = u'' + if not pretty_print: + indent_level = None + else: + indent_level = 0 + return prefix + super(BeautifulSoup, self).decode( + indent_level, eventual_encoding, formatter) + +# Alias to make it easier to type import: 'from bs4 import _soup' +_s = BeautifulSoup +_soup = BeautifulSoup + +class BeautifulStoneSoup(BeautifulSoup): + """Deprecated interface to an XML parser.""" + + def __init__(self, *args, **kwargs): + kwargs['features'] = 'xml' + warnings.warn( + 'The BeautifulStoneSoup class is deprecated. Instead of using ' + 'it, pass features="xml" into the BeautifulSoup constructor.') + super(BeautifulStoneSoup, self).__init__(*args, **kwargs) + + +class StopParsing(Exception): + pass + +class FeatureNotFound(ValueError): + pass + + +#By default, act as an HTML pretty-printer. +if __name__ == '__main__': + import sys + soup = BeautifulSoup(sys.stdin) + print soup.prettify() diff --git a/bs4/builder/__init__.py b/lib/bs4/builder/__init__.py similarity index 92% rename from bs4/builder/__init__.py rename to lib/bs4/builder/__init__.py index dc7deb93..601979bf 100644 --- a/bs4/builder/__init__.py +++ b/lib/bs4/builder/__init__.py @@ -1,9 +1,13 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + from collections import defaultdict import itertools import sys from bs4.element import ( CharsetMetaAttributeValue, ContentMetaAttributeValue, + HTMLAwareEntitySubstitution, whitespace_re ) @@ -80,9 +84,12 @@ builder_registry = TreeBuilderRegistry() class TreeBuilder(object): """Turn a document into a Beautiful Soup object tree.""" + NAME = "[Unknown tree builder]" + ALTERNATE_NAMES = [] features = [] is_xml = False + picklable = False preserve_whitespace_tags = set() empty_element_tags = None # A tag will be considered an empty-element # tag when and only when it has no contents. @@ -147,16 +154,18 @@ class TreeBuilder(object): Modifies its input in place. """ + if not attrs: + return attrs if self.cdata_list_attributes: universal = self.cdata_list_attributes.get('*', []) tag_specific = self.cdata_list_attributes.get( - tag_name.lower(), []) - for cdata_list_attr in itertools.chain(universal, tag_specific): - if cdata_list_attr in dict(attrs): - # Basically, we have a "class" attribute whose - # value is a whitespace-separated list of CSS - # classes. Split it into a list. - value = attrs[cdata_list_attr] + tag_name.lower(), None) + for attr in attrs.keys(): + if attr in universal or (tag_specific and attr in tag_specific): + # We have a "class"-type attribute whose string + # value is a whitespace-separated list of + # values. Split it into a list. + value = attrs[attr] if isinstance(value, basestring): values = whitespace_re.split(value) else: @@ -167,7 +176,7 @@ class TreeBuilder(object): # leave the value alone rather than trying to # split it again. values = value - attrs[cdata_list_attr] = values + attrs[attr] = values return attrs class SAXTreeBuilder(TreeBuilder): @@ -222,7 +231,7 @@ class HTMLTreeBuilder(TreeBuilder): Such as which tags are empty-element tags.
""" - preserve_whitespace_tags = set(['pre', 'textarea']) + preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags empty_element_tags = set(['br' , 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base']) @@ -296,6 +305,9 @@ def register_treebuilders_from(module): # Register the builder while we're at it. this_module.builder_registry.register(obj) +class ParserRejectedMarkup(Exception): + pass + # Builders are registered in reverse order of priority, so that custom # builder registrations will take precedence. In general, we want lxml # to take precedence over html5lib, because it's faster. And we only diff --git a/lib/bs4/builder/_html5lib.py b/lib/bs4/builder/_html5lib.py new file mode 100644 index 00000000..c46f8823 --- /dev/null +++ b/lib/bs4/builder/_html5lib.py @@ -0,0 +1,356 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +__all__ = [ + 'HTML5TreeBuilder', + ] + +import warnings +from bs4.builder import ( + PERMISSIVE, + HTML, + HTML_5, + HTMLTreeBuilder, + ) +from bs4.element import ( + NamespacedAttribute, + whitespace_re, +) +import html5lib +from html5lib.constants import namespaces +from bs4.element import ( + Comment, + Doctype, + NavigableString, + Tag, + ) + +try: + # Pre-0.99999999 + from html5lib.treebuilders import _base as treebuilder_base + new_html5lib = False +except ImportError, e: + # 0.99999999 and up + from html5lib.treebuilders import base as treebuilder_base + new_html5lib = True + +class HTML5TreeBuilder(HTMLTreeBuilder): + """Use html5lib to build a tree.""" + + NAME = "html5lib" + + features = [NAME, PERMISSIVE, HTML_5, HTML] + + def prepare_markup(self, markup, user_specified_encoding, + document_declared_encoding=None, exclude_encodings=None): + # Store the user-specified encoding for use later on. + self.user_specified_encoding = user_specified_encoding + + # document_declared_encoding and exclude_encodings aren't used + # ATM because the html5lib TreeBuilder doesn't use + # UnicodeDammit. + if exclude_encodings: + warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.") + yield (markup, None, None, False) + + # These methods are defined by Beautiful Soup. + def feed(self, markup): + if self.soup.parse_only is not None: + warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.") + parser = html5lib.HTMLParser(tree=self.create_treebuilder) + + extra_kwargs = dict() + if not isinstance(markup, unicode): + if new_html5lib: + extra_kwargs['override_encoding'] = self.user_specified_encoding + else: + extra_kwargs['encoding'] = self.user_specified_encoding + doc = parser.parse(markup, **extra_kwargs) + + # Set the character encoding detected by the tokenizer. + if isinstance(markup, unicode): + # We need to special-case this because html5lib sets + # charEncoding to UTF-8 if it gets Unicode input. + doc.original_encoding = None + else: + original_encoding = parser.tokenizer.stream.charEncoding[0] + if not isinstance(original_encoding, basestring): + # In 0.99999999 and up, the encoding is an html5lib + # Encoding object. We want to use a string for compatibility + # with other tree builders. 
+ original_encoding = original_encoding.name + doc.original_encoding = original_encoding + + def create_treebuilder(self, namespaceHTMLElements): + self.underlying_builder = TreeBuilderForHtml5lib( + self.soup, namespaceHTMLElements) + return self.underlying_builder + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><head></head><body>%s</body></html>' % fragment + + +class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): + + def __init__(self, soup, namespaceHTMLElements): + self.soup = soup + super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) + + def documentClass(self): + self.soup.reset() + return Element(self.soup, self.soup, None) + + def insertDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + + doctype = Doctype.for_name_and_ids(name, publicId, systemId) + self.soup.object_was_parsed(doctype) + + def elementClass(self, name, namespace): + tag = self.soup.new_tag(name, namespace) + return Element(tag, self.soup, namespace) + + def commentClass(self, data): + return TextNode(Comment(data), self.soup) + + def fragmentClass(self): + self.soup = BeautifulSoup("") + self.soup.name = "[document_fragment]" + return Element(self.soup, self.soup, None) + + def appendChild(self, node): + # XXX This code is not covered by the BS4 tests. + self.soup.append(node.element) + + def getDocument(self): + return self.soup + + def getFragment(self): + return treebuilder_base.TreeBuilder.getFragment(self).element + +class AttrList(object): + def __init__(self, element): + self.element = element + self.attrs = dict(self.element.attrs) + def __iter__(self): + return list(self.attrs.items()).__iter__() + def __setitem__(self, name, value): + # If this attribute is a multi-valued attribute for this element, + # turn its value into a list. + list_attr = HTML5TreeBuilder.cdata_list_attributes + if (name in list_attr['*'] + or (self.element.name in list_attr + and name in list_attr[self.element.name])): + # A node that is being cloned may have already undergone + # this procedure. + if not isinstance(value, list): + value = whitespace_re.split(value) + self.element[name] = value + def items(self): + return list(self.attrs.items()) + def keys(self): + return list(self.attrs.keys()) + def __len__(self): + return len(self.attrs) + def __getitem__(self, name): + return self.attrs[name] + def __contains__(self, name): + return name in list(self.attrs.keys()) + + +class Element(treebuilder_base.Node): + def __init__(self, element, soup, namespace): + treebuilder_base.Node.__init__(self, element.name) + self.element = element + self.soup = soup + self.namespace = namespace + + def appendChild(self, node): + string_child = child = None + if isinstance(node, basestring): + # Some other piece of code decided to pass in a string + # instead of creating a TextElement object to contain the + # string. + string_child = child = node + elif isinstance(node, Tag): + # Some other piece of code decided to pass in a Tag + # instead of creating an Element object to contain the + # Tag. + child = node + elif node.element.__class__ == NavigableString: + string_child = child = node.element + else: + child = node.element + + if not isinstance(child, basestring) and child.parent is not None: + node.element.extract() + + if (string_child and self.element.contents + and self.element.contents[-1].__class__ == NavigableString): + # We are appending a string onto another string. + # TODO This has O(n^2) performance, for input like + # "a</a>a</a>a</a>..."
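+ # (A sketch of the cost, added for illustration: each pass below + # builds a brand-new NavigableString from old_element plus + # string_child, so appending n short fragments one at a time copies + # O(n^2) characters in total.)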
+ old_element = self.element.contents[-1] + new_element = self.soup.new_string(old_element + string_child) + old_element.replace_with(new_element) + self.soup._most_recent_element = new_element + else: + if isinstance(node, basestring): + # Create a brand new NavigableString from this string. + child = self.soup.new_string(node) + + # Tell Beautiful Soup to act as if it parsed this element + # immediately after the parent's last descendant. (Or + # immediately after the parent, if it has no children.) + if self.element.contents: + most_recent_element = self.element._last_descendant(False) + elif self.element.next_element is not None: + # Something from further ahead in the parse tree is + # being inserted into this earlier element. This is + # very annoying because it means an expensive search + # for the last element in the tree. + most_recent_element = self.soup._last_descendant() + else: + most_recent_element = self.element + + self.soup.object_was_parsed( + child, parent=self.element, + most_recent_element=most_recent_element) + + def getAttributes(self): + return AttrList(self.element) + + def setAttributes(self, attributes): + + if attributes is not None and len(attributes) > 0: + + converted_attributes = [] + for name, value in list(attributes.items()): + if isinstance(name, tuple): + new_name = NamespacedAttribute(*name) + del attributes[name] + attributes[new_name] = value + + self.soup.builder._replace_cdata_list_attribute_values( + self.name, attributes) + for name, value in attributes.items(): + self.element[name] = value + + # The attributes may contain variables that need substitution. + # Call set_up_substitutions manually. + # + # The Tag constructor called this method when the Tag was created, + # but we just set/changed the attributes, so call it again. + self.soup.builder.set_up_substitutions(self.element) + attributes = property(getAttributes, setAttributes) + + def insertText(self, data, insertBefore=None): + if insertBefore: + text = TextNode(self.soup.new_string(data), self.soup) + self.insertBefore(data, insertBefore) + else: + self.appendChild(data) + + def insertBefore(self, node, refNode): + index = self.element.index(refNode.element) + if (node.element.__class__ == NavigableString and self.element.contents + and self.element.contents[index-1].__class__ == NavigableString): + # (See comments in appendChild) + old_node = self.element.contents[index-1] + new_str = self.soup.new_string(old_node + node.element) + old_node.replace_with(new_str) + else: + self.element.insert(index, node.element) + node.parent = self + + def removeChild(self, node): + node.element.extract() + + def reparentChildren(self, new_parent): + """Move all of this tag's children into another tag.""" + # print "MOVE", self.element.contents + # print "FROM", self.element + # print "TO", new_parent.element + element = self.element + new_parent_element = new_parent.element + # Determine what this tag's next_element will be once all the children + # are removed. + final_next_element = element.next_sibling + + new_parents_last_descendant = new_parent_element._last_descendant(False, False) + if len(new_parent_element.contents) > 0: + # The new parent already contains children. We will be + # appending this tag's children to the end. + new_parents_last_child = new_parent_element.contents[-1] + new_parents_last_descendant_next_element = new_parents_last_descendant.next_element + else: + # The new parent contains no children. 
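+ # (Illustrative case, an assumption rather than upstream commentary: + # html5lib's adoption-agency algorithm can reach this branch for + # misnested markup like "<b>1<p>2</b>3</p>", where the old <b>'s + # children are moved into a freshly created, still childless tag.)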
+ new_parents_last_child = None + new_parents_last_descendant_next_element = new_parent_element.next_element + + to_append = element.contents + append_after = new_parent_element.contents + if len(to_append) > 0: + # Set the first child's previous_element and previous_sibling + # to elements within the new parent + first_child = to_append[0] + if new_parents_last_descendant: + first_child.previous_element = new_parents_last_descendant + else: + first_child.previous_element = new_parent_element + first_child.previous_sibling = new_parents_last_child + if new_parents_last_descendant: + new_parents_last_descendant.next_element = first_child + else: + new_parent_element.next_element = first_child + if new_parents_last_child: + new_parents_last_child.next_sibling = first_child + + # Fix the last child's next_element and next_sibling + last_child = to_append[-1] + last_child.next_element = new_parents_last_descendant_next_element + if new_parents_last_descendant_next_element: + new_parents_last_descendant_next_element.previous_element = last_child + last_child.next_sibling = None + + for child in to_append: + child.parent = new_parent_element + new_parent_element.contents.append(child) + + # Now that this element has no children, change its .next_element. + element.contents = [] + element.next_element = final_next_element + + # print "DONE WITH MOVE" + # print "FROM", self.element + # print "TO", new_parent_element + + def cloneNode(self): + tag = self.soup.new_tag(self.element.name, self.namespace) + node = Element(tag, self.soup, self.namespace) + for key,value in self.attributes: + node.attributes[key] = value + return node + + def hasContent(self): + return self.element.contents + + def getNameTuple(self): + if self.namespace == None: + return namespaces["html"], self.name + else: + return self.namespace, self.name + + nameTuple = property(getNameTuple) + +class TextNode(Element): + def __init__(self, element, soup): + treebuilder_base.Node.__init__(self, None) + self.element = element + self.soup = soup + + def cloneNode(self): + raise NotImplementedError diff --git a/bs4/builder/_htmlparser.py b/lib/bs4/builder/_htmlparser.py similarity index 79% rename from bs4/builder/_htmlparser.py rename to lib/bs4/builder/_htmlparser.py index ede5cecb..823ca15a 100644 --- a/bs4/builder/_htmlparser.py +++ b/lib/bs4/builder/_htmlparser.py @@ -1,13 +1,22 @@ """Use the HTMLParser library to parse HTML files that aren't too bad.""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + __all__ = [ 'HTMLParserTreeBuilder', ] -from HTMLParser import ( - HTMLParser, - HTMLParseError, - ) +from HTMLParser import HTMLParser + +try: + from HTMLParser import HTMLParseError +except ImportError, e: + # HTMLParseError is removed in Python 3.5. Since it can never be + # thrown in 3.5, we can just define our own class as a placeholder. + class HTMLParseError(Exception): + pass + import sys import warnings @@ -19,10 +28,10 @@ import warnings # At the end of this file, we monkeypatch HTMLParser so that # strict=True works well on Python 3.2.2. 
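# (A worked example of the flags computed below, for illustration: # on CPython 2.7.6, sys.version_info[:3] is (2, 7, 6), so all three # CONSTRUCTOR_* flags come out False and HTMLParser is constructed # with no extra keyword arguments.)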
major, minor, release = sys.version_info[:3] -CONSTRUCTOR_TAKES_STRICT = ( - major > 3 - or (major == 3 and minor > 2) - or (major == 3 and minor == 2 and release >= 3)) +CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3 +CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3 +CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4 + from bs4.element import ( CData, @@ -45,7 +54,15 @@ HTMLPARSER = 'html.parser' class BeautifulSoupHTMLParser(HTMLParser): def handle_starttag(self, name, attrs): # XXX namespace - self.soup.handle_starttag(name, None, None, dict(attrs)) + attr_dict = {} + for key, value in attrs: + # Change None attribute values to the empty string + # for consistency with the other tree builders. + if value is None: + value = '' + attr_dict[key] = value + attrvalue = '""' + self.soup.handle_starttag(name, None, None, attr_dict) def handle_endtag(self, name): self.soup.handle_endtag(name) @@ -55,9 +72,12 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_charref(self, name): # XXX workaround for a bug in HTMLParser. Remove this once - # it's fixed. + # it's fixed in all supported versions. + # http://bugs.python.org/issue13633 if name.startswith('x'): real_name = int(name.lstrip('x'), 16) + elif name.startswith('X'): + real_name = int(name.lstrip('X'), 16) else: real_name = int(name) @@ -85,6 +105,9 @@ class BeautifulSoupHTMLParser(HTMLParser): self.soup.endData() if data.startswith("DOCTYPE "): data = data[len("DOCTYPE "):] + elif data == 'DOCTYPE': + # i.e. "<!DOCTYPE>" + data = '' self.soup.handle_data(data) self.soup.endData(Doctype) @@ -100,14 +123,6 @@ class BeautifulSoupHTMLParser(HTMLParser): def handle_pi(self, data): self.soup.endData() - if data.endswith("?") and data.lower().startswith("xml"): - # "An XHTML processing instruction using the trailing '?' - # will cause the '?' to be included in data." - HTMLParser - # docs. - # - # Strip the question mark so we don't end up with two - # question marks. - data = data[:-1] self.soup.handle_data(data) self.soup.endData(ProcessingInstruction) @@ -115,28 +130,34 @@ class BeautifulSoupHTMLParser(HTMLParser): class HTMLParserTreeBuilder(HTMLTreeBuilder): is_xml = False - features = [HTML, STRICT, HTMLPARSER] + picklable = True + NAME = HTMLPARSER + features = [NAME, HTML, STRICT] def __init__(self, *args, **kwargs): - if CONSTRUCTOR_TAKES_STRICT: + if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED: kwargs['strict'] = False + if CONSTRUCTOR_TAKES_CONVERT_CHARREFS: + kwargs['convert_charrefs'] = False self.parser_args = (args, kwargs) def prepare_markup(self, markup, user_specified_encoding=None, - document_declared_encoding=None): + document_declared_encoding=None, exclude_encodings=None): """ :return: A 4-tuple (markup, original encoding, encoding declared within markup, whether any characters had to be replaced with REPLACEMENT CHARACTER).
""" if isinstance(markup, unicode): - return markup, None, None, False + yield (markup, None, None, False) + return try_encodings = [user_specified_encoding, document_declared_encoding] - dammit = UnicodeDammit(markup, try_encodings, is_html=True) - return (dammit.markup, dammit.original_encoding, - dammit.declared_html_encoding, - dammit.contains_replacement_characters) + dammit = UnicodeDammit(markup, try_encodings, is_html=True, + exclude_encodings=exclude_encodings) + yield (dammit.markup, dammit.original_encoding, + dammit.declared_html_encoding, + dammit.contains_replacement_characters) def feed(self, markup): args, kwargs = self.parser_args diff --git a/lib/bs4/builder/_lxml.py b/lib/bs4/builder/_lxml.py new file mode 100644 index 00000000..d2ca2872 --- /dev/null +++ b/lib/bs4/builder/_lxml.py @@ -0,0 +1,258 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +__all__ = [ + 'LXMLTreeBuilderForXML', + 'LXMLTreeBuilder', + ] + +from io import BytesIO +from StringIO import StringIO +import collections +from lxml import etree +from bs4.element import ( + Comment, + Doctype, + NamespacedAttribute, + ProcessingInstruction, + XMLProcessingInstruction, +) +from bs4.builder import ( + FAST, + HTML, + HTMLTreeBuilder, + PERMISSIVE, + ParserRejectedMarkup, + TreeBuilder, + XML) +from bs4.dammit import EncodingDetector + +LXML = 'lxml' + +class LXMLTreeBuilderForXML(TreeBuilder): + DEFAULT_PARSER_CLASS = etree.XMLParser + + is_xml = True + processing_instruction_class = XMLProcessingInstruction + + NAME = "lxml-xml" + ALTERNATE_NAMES = ["xml"] + + # Well, it's permissive by XML parser standards. + features = [NAME, LXML, XML, FAST, PERMISSIVE] + + CHUNK_SIZE = 512 + + # This namespace mapping is specified in the XML Namespace + # standard. + DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"} + + def default_parser(self, encoding): + # This can either return a parser object or a class, which + # will be instantiated with default arguments. + if self._default_parser is not None: + return self._default_parser + return etree.XMLParser( + target=self, strip_cdata=False, recover=True, encoding=encoding) + + def parser_for(self, encoding): + # Use the default parser. + parser = self.default_parser(encoding) + + if isinstance(parser, collections.Callable): + # Instantiate the parser with default arguments + parser = parser(target=self, strip_cdata=False, encoding=encoding) + return parser + + def __init__(self, parser=None, empty_element_tags=None): + # TODO: Issue a warning if parser is present but not a + # callable, since that means there's no way to create new + # parsers for different encodings. + self._default_parser = parser + if empty_element_tags is not None: + self.empty_element_tags = set(empty_element_tags) + self.soup = None + self.nsmaps = [self.DEFAULT_NSMAPS] + + def _getNsTag(self, tag): + # Split the namespace URL out of a fully-qualified lxml tag + # name. Copied from lxml's src/lxml/sax.py. + if tag[0] == '{': + return tuple(tag[1:].split('}', 1)) + else: + return (None, tag) + + def prepare_markup(self, markup, user_specified_encoding=None, + exclude_encodings=None, + document_declared_encoding=None): + """ + :yield: A series of 4-tuples. + (markup, encoding, declared encoding, + has undergone character replacement) + + Each 4-tuple represents a strategy for parsing the document. 
+ """ + # Instead of using UnicodeDammit to convert the bytestring to + # Unicode using different encodings, use EncodingDetector to + # iterate over the encodings, and tell lxml to try to parse + # the document as each one in turn. + is_html = not self.is_xml + if is_html: + self.processing_instruction_class = ProcessingInstruction + else: + self.processing_instruction_class = XMLProcessingInstruction + + if isinstance(markup, unicode): + # We were given Unicode. Maybe lxml can parse Unicode on + # this system? + yield markup, None, document_declared_encoding, False + + if isinstance(markup, unicode): + # No, apparently not. Convert the Unicode to UTF-8 and + # tell lxml to parse it as UTF-8. + yield (markup.encode("utf8"), "utf8", + document_declared_encoding, False) + + try_encodings = [user_specified_encoding, document_declared_encoding] + detector = EncodingDetector( + markup, try_encodings, is_html, exclude_encodings) + for encoding in detector.encodings: + yield (detector.markup, encoding, document_declared_encoding, False) + + def feed(self, markup): + if isinstance(markup, bytes): + markup = BytesIO(markup) + elif isinstance(markup, unicode): + markup = StringIO(markup) + + # Call feed() at least once, even if the markup is empty, + # or the parser won't be initialized. + data = markup.read(self.CHUNK_SIZE) + try: + self.parser = self.parser_for(self.soup.original_encoding) + self.parser.feed(data) + while len(data) != 0: + # Now call feed() on the rest of the data, chunk by chunk. + data = markup.read(self.CHUNK_SIZE) + if len(data) != 0: + self.parser.feed(data) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) + + def close(self): + self.nsmaps = [self.DEFAULT_NSMAPS] + + def start(self, name, attrs, nsmap={}): + # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy. + attrs = dict(attrs) + nsprefix = None + # Invert each namespace map as it comes in. + if len(self.nsmaps) > 1: + # There are no new namespaces for this tag, but + # non-default namespaces are in play, so we need a + # separate tag stack to know when they end. + self.nsmaps.append(None) + elif len(nsmap) > 0: + # A new namespace mapping has come into play. + inverted_nsmap = dict((value, key) for key, value in nsmap.items()) + self.nsmaps.append(inverted_nsmap) + # Also treat the namespace mapping as a set of attributes on the + # tag, so we can recreate it later. + attrs = attrs.copy() + for prefix, namespace in nsmap.items(): + attribute = NamespacedAttribute( + "xmlns", prefix, "http://www.w3.org/2000/xmlns/") + attrs[attribute] = namespace + + # Namespaces are in play. Find any attributes that came in + # from lxml with namespaces attached to their names, and + # turn then into NamespacedAttribute objects. 
+ new_attrs = {} + for attr, value in attrs.items(): + namespace, attr = self._getNsTag(attr) + if namespace is None: + new_attrs[attr] = value + else: + nsprefix = self._prefix_for_namespace(namespace) + attr = NamespacedAttribute(nsprefix, attr, namespace) + new_attrs[attr] = value + attrs = new_attrs + + namespace, name = self._getNsTag(name) + nsprefix = self._prefix_for_namespace(namespace) + self.soup.handle_starttag(name, namespace, nsprefix, attrs) + + def _prefix_for_namespace(self, namespace): + """Find the currently active prefix for the given namespace.""" + if namespace is None: + return None + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + return inverted_nsmap[namespace] + return None + + def end(self, name): + self.soup.endData() + completed_tag = self.soup.tagStack[-1] + namespace, name = self._getNsTag(name) + nsprefix = None + if namespace is not None: + for inverted_nsmap in reversed(self.nsmaps): + if inverted_nsmap is not None and namespace in inverted_nsmap: + nsprefix = inverted_nsmap[namespace] + break + self.soup.handle_endtag(name, nsprefix) + if len(self.nsmaps) > 1: + # This tag, or one of its parents, introduced a namespace + # mapping, so pop it off the stack. + self.nsmaps.pop() + + def pi(self, target, data): + self.soup.endData() + self.soup.handle_data(target + ' ' + data) + self.soup.endData(self.processing_instruction_class) + + def data(self, content): + self.soup.handle_data(content) + + def doctype(self, name, pubid, system): + self.soup.endData() + doctype = Doctype.for_name_and_ids(name, pubid, system) + self.soup.object_was_parsed(doctype) + + def comment(self, content): + "Handle comments as Comment objects." + self.soup.endData() + self.soup.handle_data(content) + self.soup.endData(Comment) + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment + + +class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML): + + NAME = LXML + ALTERNATE_NAMES = ["lxml-html"] + + features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE] + is_xml = False + processing_instruction_class = ProcessingInstruction + + def default_parser(self, encoding): + return etree.HTMLParser + + def feed(self, markup): + encoding = self.soup.original_encoding + try: + self.parser = self.parser_for(encoding) + self.parser.feed(markup) + self.parser.close() + except (UnicodeDecodeError, LookupError, etree.ParserError), e: + raise ParserRejectedMarkup(str(e)) + + + def test_fragment_to_document(self, fragment): + """See `TreeBuilder`.""" + return u'<html><body>%s</body></html>' % fragment diff --git a/bs4/dammit.py b/lib/bs4/dammit.py similarity index 72% rename from bs4/dammit.py rename to lib/bs4/dammit.py index 58cad9ba..2bf67f7f 100644 --- a/bs4/dammit.py +++ b/lib/bs4/dammit.py @@ -1,27 +1,43 @@ # -*- coding: utf-8 -*- """Beautiful Soup bonus library: Unicode, Dammit -This class forces XML data into a standard format (usually to UTF-8 or -Unicode). It is heavily based on code from Mark Pilgrim's Universal -Feed Parser. It does not rewrite the XML or HTML to reflect a new -encoding; that's the tree builder's job. +This library converts a bytestream to Unicode through any means +necessary. It is heavily based on code from Mark Pilgrim's Universal +Feed Parser. It works best on XML and HTML, but it does not rewrite the +XML or HTML to reflect a new encoding; that's the tree builder's job.
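+ +A typical use (an illustrative sketch, not from the upstream docs): +UnicodeDammit(b"Sacr\xe9 bleu!", ["latin-1"]).unicode_markup decodes +the bytestring, and .original_encoding records 'latin-1'.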
""" +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +__license__ = "MIT" import codecs from htmlentitydefs import codepoint2name import re -import warnings +import logging +import string -# Autodetects character encodings. Very useful. -# Download from http://chardet.feedparser.org/ -# or 'apt-get install python-chardet' -# or 'easy_install chardet' +# Import a library to autodetect character encodings. +chardet_type = None try: - import chardet - #import chardet.constants - #chardet.constants._debug = 1 + # First try the fast C implementation. + # PyPI package: cchardet + import cchardet + def chardet_dammit(s): + return cchardet.detect(s)['encoding'] except ImportError: - chardet = None + try: + # Fall back to the pure Python implementation + # Debian package: python-chardet + # PyPI package: chardet + import chardet + def chardet_dammit(s): + return chardet.detect(s)['encoding'] + #import chardet.constants + #chardet.constants._debug = 1 + except ImportError: + # No chardet available. + def chardet_dammit(s): + return None # Available from http://cjkpython.i18n.org/. try: @@ -69,6 +85,8 @@ class EntitySubstitution(object): "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)" ")") + AMPERSAND_OR_BRACKET = re.compile("([<>&])") + @classmethod def _substitute_html_entity(cls, matchobj): entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0)) @@ -122,6 +140,28 @@ class EntitySubstitution(object): def substitute_xml(cls, value, make_quoted_attribute=False): """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign + will become <, the greater-than sign will become >, + and any ampersands will become &. If you want ampersands + that appear to be part of an entity definition to be left + alone, use substitute_xml_containing_entities() instead. + + :param make_quoted_attribute: If True, then the string will be + quoted, as befits an attribute value. + """ + # Escape angle brackets and ampersands. + value = cls.AMPERSAND_OR_BRACKET.sub( + cls._substitute_xml_entity, value) + + if make_quoted_attribute: + value = cls.quoted_attribute_value(value) + return value + + @classmethod + def substitute_xml_containing_entities( + cls, value, make_quoted_attribute=False): + """Substitute XML entities for special XML characters. + :param value: A string to be substituted. The less-than sign will become <, the greater-than sign will become >, and any ampersands that are not part of an entity defition will @@ -155,6 +195,133 @@ class EntitySubstitution(object): cls._substitute_html_entity, s) +class EncodingDetector: + """Suggests a number of possible encodings for a bytestring. + + Order of precedence: + + 1. Encodings you specifically tell EncodingDetector to try first + (the override_encodings argument to the constructor). + + 2. An encoding declared within the bytestring itself, either in an + XML declaration (if the bytestring is to be interpreted as an XML + document), or in a tag (if the bytestring is to be + interpreted as an HTML document.) + + 3. An encoding detected through textual analysis by chardet, + cchardet, or a similar external library. + + 4. UTF-8. + + 5. Windows-1252. 
+ """ + def __init__(self, markup, override_encodings=None, is_html=False, + exclude_encodings=None): + self.override_encodings = override_encodings or [] + exclude_encodings = exclude_encodings or [] + self.exclude_encodings = set([x.lower() for x in exclude_encodings]) + self.chardet_encoding = None + self.is_html = is_html + self.declared_encoding = None + + # First order of business: strip a byte-order mark. + self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup) + + def _usable(self, encoding, tried): + if encoding is not None: + encoding = encoding.lower() + if encoding in self.exclude_encodings: + return False + if encoding not in tried: + tried.add(encoding) + return True + return False + + @property + def encodings(self): + """Yield a number of encodings that might work for this markup.""" + tried = set() + for e in self.override_encodings: + if self._usable(e, tried): + yield e + + # Did the document originally start with a byte-order mark + # that indicated its encoding? + if self._usable(self.sniffed_encoding, tried): + yield self.sniffed_encoding + + # Look within the document for an XML or HTML encoding + # declaration. + if self.declared_encoding is None: + self.declared_encoding = self.find_declared_encoding( + self.markup, self.is_html) + if self._usable(self.declared_encoding, tried): + yield self.declared_encoding + + # Use third-party character set detection to guess at the + # encoding. + if self.chardet_encoding is None: + self.chardet_encoding = chardet_dammit(self.markup) + if self._usable(self.chardet_encoding, tried): + yield self.chardet_encoding + + # As a last-ditch effort, try utf-8 and windows-1252. + for e in ('utf-8', 'windows-1252'): + if self._usable(e, tried): + yield e + + @classmethod + def strip_byte_order_mark(cls, data): + """If a byte-order mark is present, strip it and return the encoding it implies.""" + encoding = None + if isinstance(data, unicode): + # Unicode data cannot have a byte-order mark. + return data, encoding + if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16be' + data = data[2:] + elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \ + and (data[2:4] != '\x00\x00'): + encoding = 'utf-16le' + data = data[2:] + elif data[:3] == b'\xef\xbb\xbf': + encoding = 'utf-8' + data = data[3:] + elif data[:4] == b'\x00\x00\xfe\xff': + encoding = 'utf-32be' + data = data[4:] + elif data[:4] == b'\xff\xfe\x00\x00': + encoding = 'utf-32le' + data = data[4:] + return data, encoding + + @classmethod + def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False): + """Given a document, tries to find its declared encoding. + + An XML encoding is declared at the beginning of the document. + + An HTML encoding is declared in a tag, hopefully near the + beginning of the document. 
+ """ + if search_entire_document: + xml_endpos = html_endpos = len(markup) + else: + xml_endpos = 1024 + html_endpos = max(2048, int(len(markup) * 0.05)) + + declared_encoding = None + declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos) + if not declared_encoding_match and is_html: + declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos) + if declared_encoding_match is not None: + declared_encoding = declared_encoding_match.groups()[0].decode( + 'ascii', 'replace') + if declared_encoding: + return declared_encoding.lower() + return None + class UnicodeDammit: """A class for detecting the encoding of a *ML document and converting it to a Unicode string. If the source encoding is @@ -175,66 +342,51 @@ class UnicodeDammit: ] def __init__(self, markup, override_encodings=[], - smart_quotes_to=None, is_html=False): - self.declared_html_encoding = None + smart_quotes_to=None, is_html=False, exclude_encodings=[]): self.smart_quotes_to = smart_quotes_to self.tried_encodings = [] self.contains_replacement_characters = False + self.is_html = is_html + self.log = logging.getLogger(__name__) + self.detector = EncodingDetector( + markup, override_encodings, is_html, exclude_encodings) - if markup == '' or isinstance(markup, unicode): + # Short-circuit if the data is in Unicode to begin with. + if isinstance(markup, unicode) or markup == '': self.markup = markup self.unicode_markup = unicode(markup) self.original_encoding = None return - new_markup, document_encoding, sniffed_encoding = \ - self._detectEncoding(markup, is_html) - self.markup = new_markup + # The encoding detector may have stripped a byte-order mark. + # Use the stripped markup from this point on. + self.markup = self.detector.markup u = None - if new_markup != markup: - # _detectEncoding modified the markup, then converted it to - # Unicode and then to UTF-8. So convert it from UTF-8. - u = self._convert_from("utf8") - self.original_encoding = sniffed_encoding + for encoding in self.detector.encodings: + markup = self.detector.markup + u = self._convert_from(encoding) + if u is not None: + break if not u: - for proposed_encoding in ( - override_encodings + [document_encoding, sniffed_encoding]): - if proposed_encoding is not None: - u = self._convert_from(proposed_encoding) - if u: - break + # None of the encodings worked. As an absolute last resort, + # try them again with character replacement. - # If no luck and we have auto-detection library, try that: - if not u and chardet and not isinstance(self.markup, unicode): - u = self._convert_from(chardet.detect(self.markup)['encoding']) - - # As a last resort, try utf-8 and windows-1252: - if not u: - for proposed_encoding in ("utf-8", "windows-1252"): - u = self._convert_from(proposed_encoding) - if u: - break - - # As an absolute last resort, try the encodings again with - # character replacement. - if not u: - for proposed_encoding in ( - override_encodings + [ - document_encoding, sniffed_encoding, "utf-8", "windows-1252"]): - if proposed_encoding != "ascii": - u = self._convert_from(proposed_encoding, "replace") + for encoding in self.detector.encodings: + if encoding != "ascii": + u = self._convert_from(encoding, "replace") if u is not None: - warnings.warn( - UnicodeWarning( + self.log.warning( "Some characters could not be decoded, and were " - "replaced with REPLACEMENT CHARACTER.")) + "replaced with REPLACEMENT CHARACTER." 
+                    )
                     self.contains_replacement_characters = True
                     break
 
-        # We could at this point force it to ASCII, but that would
-        # destroy so much data that I think giving up is better
+        # If none of that worked, we could at this point force it to
+        # ASCII, but that would destroy so much data that I think
+        # giving up is better.
         self.unicode_markup = u
         if not u:
             self.original_encoding = None
@@ -262,11 +414,10 @@ class UnicodeDammit:
             return None
         self.tried_encodings.append((proposed, errors))
         markup = self.markup
-
         # Convert smart quotes to HTML if coming from an encoding
         # that might have them.
         if (self.smart_quotes_to is not None
-            and proposed.lower() in self.ENCODINGS_WITH_SMART_QUOTES):
+            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
             smart_quotes_re = b"([\x80-\x9f])"
             smart_quotes_compiled = re.compile(smart_quotes_re)
             markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
@@ -287,99 +438,24 @@ class UnicodeDammit:
     def _to_unicode(self, data, encoding, errors="strict"):
         '''Given a string and its encoding, decodes the string into Unicode.
         %encoding is a string recognized by encodings.aliases'''
+        return unicode(data, encoding, errors)
 
-        # strip Byte Order Mark (if present)
-        if (len(data) >= 4) and (data[:2] == '\xfe\xff') \
-               and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16be'
-            data = data[2:]
-        elif (len(data) >= 4) and (data[:2] == '\xff\xfe') \
-                 and (data[2:4] != '\x00\x00'):
-            encoding = 'utf-16le'
-            data = data[2:]
-        elif data[:3] == '\xef\xbb\xbf':
-            encoding = 'utf-8'
-            data = data[3:]
-        elif data[:4] == '\x00\x00\xfe\xff':
-            encoding = 'utf-32be'
-            data = data[4:]
-        elif data[:4] == '\xff\xfe\x00\x00':
-            encoding = 'utf-32le'
-            data = data[4:]
-        newdata = unicode(data, encoding, errors)
-        return newdata
-
-    def _detectEncoding(self, xml_data, is_html=False):
-        """Given a document, tries to detect its XML encoding."""
-        xml_encoding = sniffed_xml_encoding = None
-        try:
-            if xml_data[:4] == b'\x4c\x6f\xa7\x94':
-                # EBCDIC
-                xml_data = self._ebcdic_to_ascii(xml_data)
-            elif xml_data[:4] == b'\x00\x3c\x00\x3f':
-                # UTF-16BE
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xfe\xff') \
-                     and (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16BE with BOM
-                sniffed_xml_encoding = 'utf-16be'
-                xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x3f\x00':
-                # UTF-16LE
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
-            elif (len(xml_data) >= 4) and (xml_data[:2] == b'\xff\xfe') and \
-                     (xml_data[2:4] != b'\x00\x00'):
-                # UTF-16LE with BOM
-                sniffed_xml_encoding = 'utf-16le'
-                xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\x00\x3c':
-                # UTF-32BE
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\x3c\x00\x00\x00':
-                # UTF-32LE
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
-            elif xml_data[:4] == b'\x00\x00\xfe\xff':
-                # UTF-32BE with BOM
-                sniffed_xml_encoding = 'utf-32be'
-                xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
-            elif xml_data[:4] == b'\xff\xfe\x00\x00':
-                # UTF-32LE with BOM
-                sniffed_xml_encoding = 'utf-32le'
-                xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
-            elif xml_data[:3] == b'\xef\xbb\xbf':
-                # UTF-8 with BOM
-                sniffed_xml_encoding = 'utf-8'
-                xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
-            else:
-                sniffed_xml_encoding = 'ascii'
-                pass
-        except:
-            xml_encoding_match = None
-        xml_encoding_match = xml_encoding_re.match(xml_data)
-        if not xml_encoding_match and is_html:
-            xml_encoding_match = html_meta_re.search(xml_data)
-        if xml_encoding_match is not None:
-            xml_encoding = xml_encoding_match.groups()[0].decode(
-                'ascii').lower()
-            if is_html:
-                self.declared_html_encoding = xml_encoding
-            if sniffed_xml_encoding and \
-               (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode',
-                                 'iso-10646-ucs-4', 'ucs-4', 'csucs4',
-                                 'utf-16', 'utf-32', 'utf_16', 'utf_32',
-                                 'utf16', 'u16')):
-                xml_encoding = sniffed_xml_encoding
-        return xml_data, xml_encoding, sniffed_xml_encoding
+    @property
+    def declared_html_encoding(self):
+        if not self.is_html:
+            return None
+        return self.detector.declared_encoding
 
     def find_codec(self, charset):
-        return self._codec(self.CHARSET_ALIASES.get(charset, charset)) \
-               or (charset and self._codec(charset.replace("-", ""))) \
-               or (charset and self._codec(charset.replace("-", "_"))) \
+        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
+               or (charset and self._codec(charset.replace("-", "")))
+               or (charset and self._codec(charset.replace("-", "_")))
+               or (charset and charset.lower())
                or charset
+            )
+        if value:
+            return value.lower()
+        return None
 
     def _codec(self, charset):
         if not charset:
@@ -392,32 +468,6 @@ class UnicodeDammit:
             pass
         return codec
 
-    EBCDIC_TO_ASCII_MAP = None
-
-    def _ebcdic_to_ascii(self, s):
-        c = self.__class__
-        if not c.EBCDIC_TO_ASCII_MAP:
-            emap = (0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15,
-                    16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31,
-                    128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7,
-                    144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26,
-                    32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33,
-                    38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94,
-                    45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63,
-                    186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34,
-                    195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,
-                    201,202,106,107,108,109,110,111,112,113,114,203,204,205,
-                    206,207,208,209,126,115,116,117,118,119,120,121,122,210,
-                    211,212,213,214,215,216,217,218,219,220,221,222,223,224,
-                    225,226,227,228,229,230,231,123,65,66,67,68,69,70,71,72,
-                    73,232,233,234,235,236,237,125,74,75,76,77,78,79,80,81,
-                    82,238,239,240,241,242,243,92,159,83,84,85,86,87,88,89,
-                    90,244,245,246,247,248,249,48,49,50,51,52,53,54,55,56,57,
-                    250,251,252,253,254,255)
-            import string
-            c.EBCDIC_TO_ASCII_MAP = string.maketrans(
-                ''.join(map(chr, list(range(256)))), ''.join(map(chr, emap)))
-        return s.translate(c.EBCDIC_TO_ASCII_MAP)
 
 # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
 MS_CHARS = {b'\x80': ('euro', '20AC'),
diff --git a/lib/bs4/diagnose.py b/lib/bs4/diagnose.py
new file mode 100644
index 00000000..8768332f
--- /dev/null
+++ b/lib/bs4/diagnose.py
@@ -0,0 +1,219 @@
+"""Diagnostic functions, mainly for use when doing tech support."""
+
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+__license__ = "MIT" + +import cProfile +from StringIO import StringIO +from HTMLParser import HTMLParser +import bs4 +from bs4 import BeautifulSoup, __version__ +from bs4.builder import builder_registry + +import os +import pstats +import random +import tempfile +import time +import traceback +import sys +import cProfile + +def diagnose(data): + """Diagnostic suite for isolating common problems.""" + print "Diagnostic running on Beautiful Soup %s" % __version__ + print "Python version %s" % sys.version + + basic_parsers = ["html.parser", "html5lib", "lxml"] + for name in basic_parsers: + for builder in builder_registry.builders: + if name in builder.features: + break + else: + basic_parsers.remove(name) + print ( + "I noticed that %s is not installed. Installing it may help." % + name) + + if 'lxml' in basic_parsers: + basic_parsers.append(["lxml", "xml"]) + try: + from lxml import etree + print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)) + except ImportError, e: + print ( + "lxml is not installed or couldn't be imported.") + + + if 'html5lib' in basic_parsers: + try: + import html5lib + print "Found html5lib version %s" % html5lib.__version__ + except ImportError, e: + print ( + "html5lib is not installed or couldn't be imported.") + + if hasattr(data, 'read'): + data = data.read() + elif os.path.exists(data): + print '"%s" looks like a filename. Reading data from the file.' % data + with open(data) as fp: + data = fp.read() + elif data.startswith("http:") or data.startswith("https:"): + print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data + print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup." + return + print + + for parser in basic_parsers: + print "Trying to parse your markup with %s" % parser + success = False + try: + soup = BeautifulSoup(data, parser) + success = True + except Exception, e: + print "%s could not parse the markup." % parser + traceback.print_exc() + if success: + print "Here's what %s did with the markup:" % parser + print soup.prettify() + + print "-" * 80 + +def lxml_trace(data, html=True, **kwargs): + """Print out the lxml events that occur during parsing. + + This lets you see how lxml parses a document when no Beautiful + Soup code is running. + """ + from lxml import etree + for event, element in etree.iterparse(StringIO(data), html=html, **kwargs): + print("%s, %4s, %s" % (event, element.tag, element.text)) + +class AnnouncingParser(HTMLParser): + """Announces HTMLParser parse events, without doing anything else.""" + + def _p(self, s): + print(s) + + def handle_starttag(self, name, attrs): + self._p("%s START" % name) + + def handle_endtag(self, name): + self._p("%s END" % name) + + def handle_data(self, data): + self._p("%s DATA" % data) + + def handle_charref(self, name): + self._p("%s CHARREF" % name) + + def handle_entityref(self, name): + self._p("%s ENTITYREF" % name) + + def handle_comment(self, data): + self._p("%s COMMENT" % data) + + def handle_decl(self, data): + self._p("%s DECL" % data) + + def unknown_decl(self, data): + self._p("%s UNKNOWN-DECL" % data) + + def handle_pi(self, data): + self._p("%s PI" % data) + +def htmlparser_trace(data): + """Print out the HTMLParser events that occur during parsing. + + This lets you see how HTMLParser parses a document when no + Beautiful Soup code is running. 
+ """ + parser = AnnouncingParser() + parser.feed(data) + +_vowels = "aeiou" +_consonants = "bcdfghjklmnpqrstvwxyz" + +def rword(length=5): + "Generate a random word-like string." + s = '' + for i in range(length): + if i % 2 == 0: + t = _consonants + else: + t = _vowels + s += random.choice(t) + return s + +def rsentence(length=4): + "Generate a random sentence-like string." + return " ".join(rword(random.randint(4,9)) for i in range(length)) + +def rdoc(num_elements=1000): + """Randomly generate an invalid HTML document.""" + tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table'] + elements = [] + for i in range(num_elements): + choice = random.randint(0,3) + if choice == 0: + # New tag. + tag_name = random.choice(tag_names) + elements.append("<%s>" % tag_name) + elif choice == 1: + elements.append(rsentence(random.randint(1,4))) + elif choice == 2: + # Close a tag. + tag_name = random.choice(tag_names) + elements.append("" % tag_name) + return "" + "\n".join(elements) + "" + +def benchmark_parsers(num_elements=100000): + """Very basic head-to-head performance benchmark.""" + print "Comparative parser benchmark on Beautiful Soup %s" % __version__ + data = rdoc(num_elements) + print "Generated a large invalid HTML document (%d bytes)." % len(data) + + for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]: + success = False + try: + a = time.time() + soup = BeautifulSoup(data, parser) + b = time.time() + success = True + except Exception, e: + print "%s could not parse the markup." % parser + traceback.print_exc() + if success: + print "BS4+%s parsed the markup in %.2fs." % (parser, b-a) + + from lxml import etree + a = time.time() + etree.HTML(data) + b = time.time() + print "Raw lxml parsed the markup in %.2fs." % (b-a) + + import html5lib + parser = html5lib.HTMLParser() + a = time.time() + parser.parse(data) + b = time.time() + print "Raw html5lib parsed the markup in %.2fs." % (b-a) + +def profile(num_elements=100000, parser="lxml"): + + filehandle = tempfile.NamedTemporaryFile() + filename = filehandle.name + + data = rdoc(num_elements) + vars = dict(bs4=bs4, data=data, parser=parser) + cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename) + + stats = pstats.Stats(filename) + # stats.strip_dirs() + stats.sort_stats("cumulative") + stats.print_stats('_html5lib|bs4', 50) + +if __name__ == '__main__': + diagnose(sys.stdin.read()) diff --git a/bs4/element.py b/lib/bs4/element.py similarity index 64% rename from bs4/element.py rename to lib/bs4/element.py index 4a4d3ed3..b100d18b 100644 --- a/bs4/element.py +++ b/lib/bs4/element.py @@ -1,5 +1,10 @@ +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. +__license__ = "MIT" + import collections import re +import shlex import sys import warnings from bs4.dammit import EntitySubstitution @@ -26,6 +31,9 @@ class NamespacedAttribute(unicode): def __new__(cls, prefix, name, namespace=None): if name is None: obj = unicode.__new__(cls, prefix) + elif prefix is None: + # Not really namespaced. + obj = unicode.__new__(cls, name) else: obj = unicode.__new__(cls, prefix + ":" + name) obj.prefix = prefix @@ -78,6 +86,42 @@ class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution): return match.group(1) + encoding return self.CHARSET_RE.sub(rewrite, self.original_value) +class HTMLAwareEntitySubstitution(EntitySubstitution): + + """Entity substitution rules that are aware of some HTML quirks. + + Specifically, the contents of + +Hello, world! 
+'''
+        soup = self.soup(html)
+        self.assertEqual("text/javascript", soup.find('script')['type'])
+
     def test_comment(self):
         # Comments are represented as Comment objects.
         markup = "<b>foo<!--foobar-->baz</b>"
@@ -159,10 +218,29 @@ class HTMLTreeBuilderSmokeTest(object):
         comment = soup.find(text="foobar")
         self.assertEqual(comment.__class__, Comment)
 
+        # The comment is properly integrated into the tree.
+        foo = soup.find(text="foo")
+        self.assertEqual(comment, foo.next_element)
+        baz = soup.find(text="baz")
+        self.assertEqual(comment, baz.previous_element)
+
     def test_preserved_whitespace_in_pre_and_textarea(self):
-        """Whitespace must be preserved in <pre> and <textarea>."""
-        self.assertSoupEquals("<pre>   </pre>")
-        self.assertSoupEquals("<textarea> woo  </textarea>")
+        """Whitespace must be preserved in <pre> and <textarea>."""
+        pre_markup = "<pre>   </pre>"
+        textarea_markup = "<textarea>  woo\nwoo  </textarea>"
+        self.assertSoupEquals(pre_markup)
+        self.assertSoupEquals(textarea_markup)
+
+        soup = self.soup(pre_markup)
+        self.assertEqual(soup.pre.prettify(), pre_markup)
+
+        soup = self.soup(textarea_markup)
+        self.assertEqual(soup.textarea.prettify(), textarea_markup)
+
+        soup = self.soup("<textarea></textarea>")
+        self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
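
For reference, the round-trip behavior pinned down by the assertions above can be reproduced outside the test suite. A minimal sketch, not part of the patch, assuming bs4 4.4 and its html.parser builder are installed:

    from bs4 import BeautifulSoup

    # Whitespace inside <pre> must survive parsing and pretty-printing;
    # the new assertions above check that prettify() round-trips it.
    soup = BeautifulSoup("<pre>   </pre>", "html.parser")
    assert soup.pre.prettify() == "<pre>   </pre>"
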
 
     def test_nested_inline_elements(self):
         """Inline elements can be nested indefinitely."""
@@ -210,6 +288,14 @@ class HTMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(["css"], soup.div.div['class'])
 
+    def test_multivalued_attribute_on_html(self):
+        # html5lib uses a different API to set the attributes ot the
+        #  tag. This has caused problems with multivalued
+        # attributes.
+        markup = ''
+        soup = self.soup(markup)
+        self.assertEqual(["a", "b"], soup.html['class'])
+
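
A quick illustration of the multi-valued attribute handling this test exercises (a sketch for context, not part of the patch, assuming bs4 is importable):

    from bs4 import BeautifulSoup

    # 'class' is a multi-valued HTML attribute, so Beautiful Soup exposes
    # it as a list of tokens rather than as a single string.
    soup = BeautifulSoup('<html class="a b"></html>', 'html.parser')
    assert soup.html['class'] == ['a', 'b']
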
     def test_angle_brackets_in_attribute_values_are_escaped(self):
         self.assertSoupEquals('<a b="<a>"></a>', '<a b="&lt;a&gt;"></a>')
 
@@ -217,12 +303,14 @@ class HTMLTreeBuilderSmokeTest(object):
         expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
         self.assertSoupEquals('<p id="pi&#241;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&#xf1;ata"></p>', expect)
+        self.assertSoupEquals('<p id="pi&#Xf1;ata"></p>', expect)
         self.assertSoupEquals('<p id="pi&ntilde;ata"></p>', expect)
 
     def test_entities_in_text_converted_to_unicode(self):
         expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
         self.assertSoupEquals("<p>pi&#241;ata</p>", expect)
         self.assertSoupEquals("<p>pi&#xf1;ata</p>", expect)
+        self.assertSoupEquals("<p>pi&#Xf1;ata</p>", expect)
         self.assertSoupEquals("<p>pi&ntilde;ata</p>", expect)
 
     def test_quot_entity_converted_to_quotation_mark(self):
@@ -235,6 +323,41 @@ class HTMLTreeBuilderSmokeTest(object):
         self.assertSoupEquals("&#10000000000000000;", expect)
         self.assertSoupEquals("&#x10000000000000000;", expect)
 
+    def test_multipart_strings(self):
+        "Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
+        soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
+        self.assertEqual("p", soup.h2.string.next_element.name)
+        self.assertEqual("p", soup.p.name)
+        self.assertConnectedness(soup)
+
+    def test_head_tag_between_head_and_body(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<html><head></head>
+  <link></link>
+  <body>foo</body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertNotEqual(None, soup.html.body)
+        self.assertConnectedness(soup)
+
+    def test_multiple_copies_of_a_tag(self):
+        "Prevent recurrence of a bug in the html5lib treebuilder."
+        content = """<!DOCTYPE html>
+<html>
+ <body>
+   <article id="a" >
+   <div><a href="1"></div>
+   <footer>
+     <a href="2"></a>
+   </footer>
+  </article>
+  </body>
+</html>
+"""
+        soup = self.soup(content)
+        self.assertConnectedness(soup.article)
+
     def test_basic_namespaces(self):
         """Parsers don't need to *understand* namespaces, but at the
         very least they should not choke on namespaces or lose
@@ -262,6 +385,14 @@ class HTMLTreeBuilderSmokeTest(object):
         # to detect any differences between them.
         #
 
+    def test_can_parse_unicode_document(self):
+        # A seemingly innocuous document... but it's in Unicode! And
+        # it contains characters that can't be represented in the
+        # encoding found in the declaration! The horror!
+        markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body></html>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
+
     def test_soupstrainer(self):
         """Parsers should be able to work with SoupStrainers."""
         strainer = SoupStrainer("b")
@@ -372,7 +503,9 @@ class HTMLTreeBuilderSmokeTest(object):
         hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
         soup = self.soup(
             hebrew_document, from_encoding="iso8859-8")
-        self.assertEqual(soup.original_encoding, 'iso8859-8')
+        # Some tree builders call it iso8859-8, others call it iso-8859-8.
+        # That's not a difference we really care about.
+        assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
         self.assertEqual(
             soup.encode('utf-8'),
             hebrew_document.decode("iso8859-8").encode("utf-8"))
@@ -436,11 +569,30 @@ class HTMLTreeBuilderSmokeTest(object):
 
 class XMLTreeBuilderSmokeTest(object):
 
+    def test_pickle_and_unpickle_identity(self):
+        # Pickling a tree, then unpickling it, yields a tree identical
+        # to the original.
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertEqual(loaded.__class__, BeautifulSoup)
+        self.assertEqual(loaded.decode(), tree.decode())
+
     def test_docstring_generated(self):
         soup = self.soup("<root/>")
         self.assertEqual(
             soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
 
+    def test_xml_declaration(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
+    def test_processing_instruction(self):
+        markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        self.assertEqual(markup, soup.encode("utf8"))
+
     def test_real_xhtml_document(self):
         """A real XHTML document should come out *exactly* the same as it went in."""
         markup = b"""<?xml version="1.0" encoding="utf-8"?>
@@ -453,6 +605,23 @@ class XMLTreeBuilderSmokeTest(object):
         self.assertEqual(
             soup.encode("utf-8"), markup)
 
+    def test_formatter_processes_script_tag_for_xml_documents(self):
+        doc = """
+  <script type="text/javascript">
+  </script>
+"""
+        soup = BeautifulSoup(doc, "lxml-xml")
+        # lxml would have stripped this while parsing, but we can add
+        # it later.
+        soup.script.string = 'console.log("< < hey > > ");'
+        encoded = soup.encode()
+        self.assertTrue(b"&lt; &lt; hey &gt; &gt;" in encoded)
+
+    def test_can_parse_unicode_document(self):
+        markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
+        soup = self.soup(markup)
+        self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
+
     def test_popping_namespaced_tag(self):
         markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
         soup = self.soup(markup)
@@ -490,6 +659,16 @@ class XMLTreeBuilderSmokeTest(object):
         soup = self.soup(markup)
         self.assertEqual(unicode(soup.p), markup)
 
+    def test_namespaced_attributes(self):
+        markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
+    def test_namespaced_attributes_xml_namespace(self):
+        markup = '<foo xml:lang="fr">bar</foo>'
+        soup = self.soup(markup)
+        self.assertEqual(unicode(soup.foo), markup)
+
 
 class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
     """Smoke test for a tree builder that supports HTML5."""
@@ -518,6 +697,12 @@ class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
         self.assertEqual(namespace, soup.math.namespace)
         self.assertEqual(namespace, soup.msqrt.namespace)
 
+    def test_xml_declaration_becomes_comment(self):
+        markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
+        soup = self.soup(markup)
+        self.assertTrue(isinstance(soup.contents[0], Comment))
+        self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
+        self.assertEqual("html", soup.contents[0].next_element.name)
 
 def skipIf(condition, reason):
     def nothing(test, *args, **kwargs):
diff --git a/bs4/tests/test_builder_registry.py b/lib/bs4/tests/test_builder_registry.py
similarity index 93%
rename from bs4/tests/test_builder_registry.py
rename to lib/bs4/tests/test_builder_registry.py
index 92ad10fb..90cad829 100644
--- a/bs4/tests/test_builder_registry.py
+++ b/lib/bs4/tests/test_builder_registry.py
@@ -1,6 +1,7 @@
 """Tests of the builder registry."""
 
 import unittest
+import warnings
 
 from bs4 import BeautifulSoup
 from bs4.builder import (
@@ -67,10 +68,15 @@ class BuiltInRegistryTest(unittest.TestCase):
                          HTMLParserTreeBuilder)
 
     def test_beautifulsoup_constructor_does_lookup(self):
-        # You can pass in a string.
-        BeautifulSoup("", features="html")
-        # Or a list of strings.
-        BeautifulSoup("", features=["html", "fast"])
+
+        with warnings.catch_warnings(record=True) as w:
+            # This will create a warning about not explicitly
+            # specifying a parser, but we'll ignore it.
+
+            # You can pass in a string.
+            BeautifulSoup("", features="html")
+            # Or a list of strings.
+            BeautifulSoup("", features=["html", "fast"])
 
         # You'll get an exception if BS can't find an appropriate
         # builder.
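
Several of the test changes above revolve around explicit parser selection, so a minimal sketch of the constructor lookup they exercise may help (illustrative only, not part of the patch):

    from bs4 import BeautifulSoup

    # Naming a concrete parser avoids the "no parser was explicitly
    # specified" warning that the updated tests catch and inspect.
    soup = BeautifulSoup("<a><b></b></a>", "html.parser")

    # A feature string or list lets the builder registry pick a parser.
    soup = BeautifulSoup("<a><b></b></a>", features=["html", "fast"])
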
diff --git a/bs4/tests/test_docs.py b/lib/bs4/tests/test_docs.py
similarity index 100%
rename from bs4/tests/test_docs.py
rename to lib/bs4/tests/test_docs.py
diff --git a/lib/bs4/tests/test_html5lib.py b/lib/bs4/tests/test_html5lib.py
new file mode 100644
index 00000000..8e3cba68
--- /dev/null
+++ b/lib/bs4/tests/test_html5lib.py
@@ -0,0 +1,109 @@
+"""Tests to ensure that the html5lib tree builder generates good trees."""
+
+import warnings
+
+try:
+    from bs4.builder import HTML5TreeBuilder
+    HTML5LIB_PRESENT = True
+except ImportError, e:
+    HTML5LIB_PRESENT = False
+
+from bs4.element import SoupStrainer
+from bs4.testing import (
+    HTML5TreeBuilderSmokeTest,
+    SoupTest,
+    skipIf,
+)
+
+@skipIf(
+    not HTML5LIB_PRESENT,
+    "html5lib seems not to be present, not testing its tree builder.")
+class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
+    """See ``HTML5TreeBuilderSmokeTest``."""
+
+    @property
+    def default_builder(self):
+        return HTML5TreeBuilder()
+
+    def test_soupstrainer(self):
+        # The html5lib tree builder does not support SoupStrainers.
+        strainer = SoupStrainer("b")
+        markup = "<p>A <b>bold</b> statement.</p>"
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup(markup, parse_only=strainer)
+        self.assertEqual(
+            soup.decode(), self.document_for(markup))
+
+        self.assertTrue(
+            "the html5lib tree builder doesn't support parse_only" in
+            str(w[0].message))
+
+    def test_correctly_nested_tables(self):
+        """html5lib inserts <tbody> tags where other parsers don't."""
+        markup = ('<table id="1">'
+                  '<tr>'
+                  "<td>Here's another table:"
+                  '<table id="2">'
+                  '<tr><td>foo</td></tr>'
+                  '</table></td>')
+
+        self.assertSoupEquals(
+            markup,
+            '<table id="1"><tbody><tr><td>Here\'s another table:'
+            '<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
+            '</td></tr></tbody></table>')
+
+        self.assertSoupEquals(
+            "<table><thead><tr><td>Foo</td></tr></thead>"
+            "<tbody><tr><td>Bar</td></tr></tbody>"
+            "<tfoot><tr><td>Baz</td></tr></tfoot></table>")
+
+    def test_xml_declaration_followed_by_doctype(self):
+        markup = '''<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE html>
+<html>
+  <head>
+  </head>
+  <body>
+   <p>foo</p>
+  </body>
+</html>'''
+        soup = self.soup(markup)
+        # Verify that we can reach the <p> tag; this means the tree is connected.
+        self.assertEqual(b"<p>foo</p>", soup.p.encode())
+
+    def test_reparented_markup(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
+        soup = self.soup(markup)
+        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+
+    def test_reparented_markup_ends_with_whitespace(self):
+        markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
+        soup = self.soup(markup)
+        self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
+        self.assertEqual(2, len(soup.find_all('p')))
+
+    def test_reparented_markup_containing_identical_whitespace_nodes(self):
+        """Verify that we keep the two whitespace nodes in this
+        document distinct when reparenting the adjacent <tbody> tags.
+        """
+        markup = '<table> <tbody><tbody><ims></tbody> </table>'
+        soup = self.soup(markup)
+        space1, space2 = soup.find_all(string=' ')
+        tbody1, tbody2 = soup.find_all('tbody')
+        assert space1.next_element is tbody1
+        assert tbody2.next_element is space2
+
+    def test_processing_instruction(self):
+        """Processing instructions become comments."""
+        markup = b"""<?PITarget PIContent?>"""
+        soup = self.soup(markup)
+        assert str(soup).startswith("<!--?PITarget PIContent?-->")
+
+    def test_cloned_multivalue_node(self):
+        markup = b"""<a class="my_class"><p></a>"""
+        soup = self.soup(markup)
+        a1, a2 = soup.find_all('a')
+        self.assertEqual(a1, a2)
+        assert a1 is not a2
diff --git a/bs4/tests/test_htmlparser.py b/lib/bs4/tests/test_htmlparser.py
similarity index 61%
rename from bs4/tests/test_htmlparser.py
rename to lib/bs4/tests/test_htmlparser.py
index bcb5ed23..b45e35f9 100644
--- a/bs4/tests/test_htmlparser.py
+++ b/lib/bs4/tests/test_htmlparser.py
@@ -1,6 +1,8 @@
 """Tests to ensure that the html.parser tree builder generates good
 trees."""
 
+from pdb import set_trace
+import pickle
 from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
 from bs4.builder import HTMLParserTreeBuilder
 
@@ -17,3 +19,14 @@ class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
     def test_namespaced_public_doctype(self):
         # html.parser can't handle namespaced doctypes, so skip this one.
         pass
+
+    def test_builder_is_pickled(self):
+        """Unlike most tree builders, HTMLParserTreeBuilder can be
+        restored after pickling.
+        """
+        tree = self.soup("<a><b>foo</a>")
+        dumped = pickle.dumps(tree, 2)
+        loaded = pickle.loads(dumped)
+        self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
+
+
diff --git a/bs4/tests/test_lxml.py b/lib/bs4/tests/test_lxml.py
similarity index 69%
rename from bs4/tests/test_lxml.py
rename to lib/bs4/tests/test_lxml.py
index 39e26bfb..a05870b9 100644
--- a/bs4/tests/test_lxml.py
+++ b/lib/bs4/tests/test_lxml.py
@@ -4,10 +4,15 @@ import re
 import warnings
 
 try:
-    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
+    import lxml.etree
     LXML_PRESENT = True
+    LXML_VERSION = lxml.etree.LXML_VERSION
 except ImportError, e:
     LXML_PRESENT = False
+    LXML_VERSION = (0,)
+
+if LXML_PRESENT:
+    from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
 
 from bs4 import (
     BeautifulSoup,
@@ -41,27 +46,24 @@ class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
         self.assertSoupEquals(
            "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
 
+    # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
+    # test if an old version of lxml is installed.
+
+    @skipIf(
+        not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
+        "Skipping doctype test for old version of lxml to avoid segfault.")
+    def test_empty_doctype(self):
+        soup = self.soup("<!DOCTYPE>")
+        doctype = soup.contents[0]
+        self.assertEqual("", doctype.strip())
+
     def test_beautifulstonesoup_is_xml_parser(self):
         # Make sure that the deprecated BSS class uses an xml builder
         # if one is installed.
-        with warnings.catch_warnings(record=False) as w:
+        with warnings.catch_warnings(record=True) as w:
             soup = BeautifulStoneSoup("<b />")
-        self.assertEqual(u"<b/>", unicode(soup.b))
-
-    def test_real_xhtml_document(self):
-        """lxml strips the XML definition from an XHTML doc, which is fine."""
-        markup = b"""<?xml version="1.0" encoding="utf-8"?>
-<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
-<html xmlns="http://www.w3.org/1999/xhtml">
-<head><title>Hello.</title></head>
-<body>Goodbye.</body>
-</html>"""
-        soup = self.soup(markup)
-        self.assertEqual(
-            soup.encode("utf-8").replace(b"\n", b''),
-            markup.replace(b'\n', b'').replace(
-                b'<?xml version="1.0" encoding="utf-8"?>', b''))
-
+        self.assertEqual(u"<b/>", unicode(soup.b))
+        self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
 
 @skipIf(
     not LXML_PRESENT,
     "lxml seems not to be present, not testing its XML tree builder.")
 class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
 
     @property
     def default_builder(self):
         return LXMLTreeBuilderForXML()
-
diff --git a/bs4/tests/test_soup.py b/lib/bs4/tests/test_soup.py
similarity index 62%
rename from bs4/tests/test_soup.py
rename to lib/bs4/tests/test_soup.py
index 23a664e7..f3e69edf 100644
--- a/bs4/tests/test_soup.py
+++ b/lib/bs4/tests/test_soup.py
@@ -1,7 +1,12 @@
 # -*- coding: utf-8 -*-
 """Tests of Beautiful Soup as a whole."""
 
+from pdb import set_trace
+import logging
 import unittest
+import sys
+import tempfile
+
 from bs4 import (
     BeautifulSoup,
     BeautifulStoneSoup,
@@ -13,7 +18,11 @@ from bs4.element import (
     NamespacedAttribute,
     )
 import bs4.dammit
-from bs4.dammit import EntitySubstitution, UnicodeDammit
+from bs4.dammit import (
+    EntitySubstitution,
+    UnicodeDammit,
+    EncodingDetector,
+)
 from bs4.testing import (
     SoupTest,
     skipIf,
@@ -26,7 +35,48 @@ try:
 except ImportError, e:
     LXML_PRESENT = False
 
-class TestDeprecatedConstructorArguments(SoupTest):
+PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
+
+class TestConstructor(SoupTest):
+
+    def test_short_unicode_input(self):
+        data = u"<h1>éé</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"éé", soup.h1.string)
+
+    def test_embedded_null(self):
+        data = u"<h1>foo\0bar</h1>"
+        soup = self.soup(data)
+        self.assertEqual(u"foo\0bar", soup.h1.string)
+
+    def test_exclude_encodings(self):
+        utf8_data = u"Räksmörgås".encode("utf-8")
+        soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual("windows-1252", soup.original_encoding)
+
+
+class TestWarnings(SoupTest):
+
+    def _assert_no_parser_specified(self, s, is_there=True):
+        v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
+        self.assertTrue(v)
+
+    def test_warning_if_no_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_warning_if_parser_specified_too_vague(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html")
+        msg = str(w[0].message)
+        self._assert_no_parser_specified(msg)
+
+    def test_no_warning_if_explicit_parser_specified(self):
+        with warnings.catch_warnings(record=True) as w:
+            soup = self.soup("<a><b></b></a>", "html.parser")
+        self.assertEqual([], w)
 
     def test_parseOnlyThese_renamed_to_parse_only(self):
         with warnings.catch_warnings(record=True) as w:
@@ -49,14 +99,52 @@ class TestDeprecatedConstructorArguments(SoupTest):
         self.assertRaises(
             TypeError, self.soup, "<a>", no_such_argument=True)
 
-    @skipIf(
-        not LXML_PRESENT,
-        "lxml not present, not testing BeautifulStoneSoup.")
-    def test_beautifulstonesoup(self):
+class TestWarnings(SoupTest):
+
+    def test_disk_file_warning(self):
+        filehandle = tempfile.NamedTemporaryFile()
+        filename = filehandle.name
+        try:
+            with warnings.catch_warnings(record=True) as w:
+                soup = self.soup(filename)
+            msg = str(w[0].message)
+            self.assertTrue("looks like a filename" in msg)
+        finally:
+            filehandle.close()
+
+        # The file no longer exists, so Beautiful Soup will no longer issue the warning.
         with warnings.catch_warnings(record=True) as w:
-            soup = BeautifulStoneSoup("<markup>")
-            self.assertTrue(isinstance(soup, BeautifulSoup))
-            self.assertTrue("BeautifulStoneSoup class is deprecated")
+            soup = self.soup(filename)
+        self.assertEqual(0, len(w))
+
+    def test_url_warning_with_bytes_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/")
+        # Be aware this isn't the only warning that can be raised during
+        # execution.
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_url(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            # note - this url must differ from the bytes one otherwise
+            # python's warnings system swallows the second warning
+            soup = self.soup(u"http://www.crummyunicode.com/")
+        self.assertTrue(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_bytes_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(b"http://www.crummybytes.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
+    def test_url_warning_with_unicode_and_space(self):
+        with warnings.catch_warnings(record=True) as warning_list:
+            soup = self.soup(u"http://www.crummyuncode.com/ is great")
+        self.assertFalse(any("looks like a URL" in str(w.message)
+            for w in warning_list))
+
 
 class TestSelectiveParsing(SoupTest):
 
@@ -120,9 +208,14 @@ class TestEntitySubstitution(unittest.TestCase):
     def test_xml_quoting_handles_ampersands(self):
         self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&amp;T")
 
-    def test_xml_quoting_ignores_ampersands_when_they_are_part_of_an_entity(self):
+    def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
         self.assertEqual(
             self.sub.substitute_xml("&Aacute;T&T"),
+            "&amp;Aacute;T&amp;T")
+
+    def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
+        self.assertEqual(
+            self.sub.substitute_xml_containing_entities("&Aacute;T&T"),
             "&Aacute;T&amp;T")
 
     def test_quotes_not_html_substituted(self):
@@ -137,22 +230,32 @@ class TestEncodingConversion(SoupTest):
 
     def setUp(self):
         super(TestEncodingConversion, self).setUp()
-        self.unicode_data = u"<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>"
+        self.unicode_data = u'<html><head></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
         self.utf8_data = self.unicode_data.encode("utf-8")
         # Just so you know what it looks like.
         self.assertEqual(
             self.utf8_data,
-            b"<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>")
+            b'<html><head></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
 
     def test_ascii_in_unicode_out(self):
         # ASCII input is converted to Unicode. The original_encoding
-        # attribute is set.
-        ascii = b"<foo>a</foo>"
-        soup_from_ascii = self.soup(ascii)
-        unicode_output = soup_from_ascii.decode()
-        self.assertTrue(isinstance(unicode_output, unicode))
-        self.assertEqual(unicode_output, self.document_for(ascii.decode()))
-        self.assertEqual(soup_from_ascii.original_encoding, "ascii")
+        # attribute is set to 'utf-8', a superset of ASCII.
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
+        try:
+            def noop(str):
+                return None
+            # Disable chardet, which will realize that the ASCII is ASCII.
+            bs4.dammit.chardet_dammit = noop
+            ascii = b"<foo>a</foo>"
+            soup_from_ascii = self.soup(ascii)
+            unicode_output = soup_from_ascii.decode()
+            self.assertTrue(isinstance(unicode_output, unicode))
+            self.assertEqual(unicode_output, self.document_for(ascii.decode()))
+            self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
+        finally:
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
 
     def test_unicode_in_unicode_out(self):
         # Unicode input is left alone. The original_encoding attribute
@@ -174,9 +277,20 @@ class TestEncodingConversion(SoupTest):
         soup_from_unicode = self.soup(self.unicode_data)
         self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
 
+    @skipIf(
+        PYTHON_3_PRE_3_2,
+        "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
+    def test_attribute_name_containing_unicode_characters(self):
+        markup = u'<div><a \N{CYRILLIC SMALL LETTER A}\N{CYRILLIC SMALL LETTER B}\N{CYRILLIC SMALL LETTER V}="value"/></div>'
+        self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
 
 class TestUnicodeDammit(unittest.TestCase):
-    """Standalone tests of Unicode, Dammit."""
+    """Standalone tests of UnicodeDammit."""
+
+    def test_unicode_input(self):
+        markup = u"I'm already Unicode! \N{SNOWMAN}"
+        dammit = UnicodeDammit(markup)
+        self.assertEqual(dammit.unicode_markup, markup)
 
     def test_smart_quotes_to_unicode(self):
         markup = b"<foo>\x91\x92\x93\x94</foo>"
@@ -203,33 +317,54 @@ class TestUnicodeDammit(unittest.TestCase):
             dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
 
     def test_detect_utf8(self):
-        utf8 = b"\xc3\xa9"
+        utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
         dammit = UnicodeDammit(utf8)
-        self.assertEqual(dammit.unicode_markup, u'\xe9')
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+        self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
+
     def test_convert_hebrew(self):
         hebrew = b"\xed\xe5\xec\xf9"
         dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding, 'iso-8859-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
         self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
 
     def test_dont_see_smart_quotes_where_there_are_none(self):
         utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
         dammit = UnicodeDammit(utf_8)
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
         self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
 
     def test_ignore_inappropriate_codecs(self):
         utf8_data = u"Räksmörgås".encode("utf-8")
         dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
-        self.assertEqual(dammit.original_encoding, 'utf-8')
+        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
 
     def test_ignore_invalid_codecs(self):
         utf8_data = u"Räksmörgås".encode("utf-8")
         for bad_encoding in ['.utf8', '...', 'utF---16.!']:
             dammit = UnicodeDammit(utf8_data, [bad_encoding])
-            self.assertEqual(dammit.original_encoding, 'utf-8')
+            self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
+
+    def test_exclude_encodings(self):
+        # This is UTF-8.
+        utf8_data = u"Räksmörgås".encode("utf-8")
+
+        # But if we exclude UTF-8 from consideration, the guess is
+        # Windows-1252.
+        dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
+        self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
+
+        # And if we exclude that, there is no valid guess at all.
+        dammit = UnicodeDammit(
+            utf8_data, exclude_encodings=["utf-8", "windows-1252"])
+        self.assertEqual(dammit.original_encoding, None)
+
+    def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
+        detected = EncodingDetector(
+            b'<?xml version="1.0" encoding="UTF-\xdb\x9d">')
+        encodings = list(detected.encodings)
+        assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
 
     def test_detect_html5_style_meta_tag(self):
 
@@ -261,26 +396,24 @@ class TestUnicodeDammit(unittest.TestCase):
         doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
 <html><b>\330\250\330\252\330\261</b>
 <i>\310\322\321\220\312\321\355\344</i></html>"""
-        chardet = bs4.dammit.chardet
+        chardet = bs4.dammit.chardet_dammit
+        logging.disable(logging.WARNING)
         try:
-            bs4.dammit.chardet = None
-            with warnings.catch_warnings(record=True) as w:
-                dammit = UnicodeDammit(doc)
-                self.assertEqual(True, dammit.contains_replacement_characters)
-                self.assertTrue(u"\ufffd" in dammit.unicode_markup)
+            def noop(str):
+                return None
+            bs4.dammit.chardet_dammit = noop
+            dammit = UnicodeDammit(doc)
+            self.assertEqual(True, dammit.contains_replacement_characters)
+            self.assertTrue(u"\ufffd" in dammit.unicode_markup)
 
-                soup = BeautifulSoup(doc, "html.parser")
-                self.assertTrue(soup.contains_replacement_characters)
-
-                msg = w[0].message
-                self.assertTrue(isinstance(msg, UnicodeWarning))
-                self.assertTrue("Some characters could not be decoded" in str(msg))
+            soup = BeautifulSoup(doc, "html.parser")
+            self.assertTrue(soup.contains_replacement_characters)
         finally:
-            bs4.dammit.chardet = chardet
+            logging.disable(logging.NOTSET)
+            bs4.dammit.chardet_dammit = chardet
 
-    def test_sniffed_xml_encoding(self):
-        # A document written in UTF-16LE will be converted by a different
-        # code path that sniffs the byte order markers.
+    def test_byte_order_mark_removed(self):
+        # A document written in UTF-16LE will have its byte order marker stripped.
         data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
         dammit = UnicodeDammit(data)
         self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
diff --git a/bs4/tests/test_tree.py b/lib/bs4/tests/test_tree.py
similarity index 80%
rename from bs4/tests/test_tree.py
rename to lib/bs4/tests/test_tree.py
index cc573ede..a4fe0b16 100644
--- a/bs4/tests/test_tree.py
+++ b/lib/bs4/tests/test_tree.py
@@ -9,6 +9,7 @@ same markup, but all Beautiful Soup trees can be traversed with the
 methods tested here.
 """
 
+from pdb import set_trace
 import copy
 import pickle
 import re
@@ -19,7 +20,10 @@ from bs4.builder import (
     HTMLParserTreeBuilder,
 )
 from bs4.element import (
+    PY3K,
     CData,
+    Comment,
+    Declaration,
     Doctype,
     NavigableString,
     SoupStrainer,
@@ -67,7 +71,23 @@ class TestFind(TreeTest):
 
     def test_unicode_text_find(self):
         soup = self.soup(u'<h1>Räksmörgås</h1>')
-        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')
+        self.assertEqual(soup.find(string=u'Räksmörgås'), u'Räksmörgås')
+
+    def test_unicode_attribute_find(self):
+        soup = self.soup(u'<h1 id="Räksmörgås">here it is</h1>')
+        str(soup)
+        self.assertEqual("here it is", soup.find(id=u'Räksmörgås').text)
+
+
+    def test_find_everything(self):
+        """Test an optimization that finds all tags."""
+        soup = self.soup("<a>foo</a><b>bar</b>")
+        self.assertEqual(2, len(soup.find_all()))
+
+    def test_find_everything_with_name(self):
+        """Test an optimization that finds all tags with a given name."""
+        soup = self.soup("<a>foo</a><b>bar</b><a>baz</a>")
+        self.assertEqual(2, len(soup.find_all('a')))
 
 class TestFindAll(TreeTest):
     """Basic tests of the find_all() method."""
@@ -76,6 +96,7 @@ class TestFindAll(TreeTest):
         """You can search the tree for text nodes."""
         soup = self.soup("<html>Foo<b>bar</b>\xbb</html>")
         # Exact match.
+        self.assertEqual(soup.find_all(string="bar"), [u"bar"])
         self.assertEqual(soup.find_all(text="bar"), [u"bar"])
         # Match any of a number of strings.
         self.assertEqual(
@@ -114,6 +135,19 @@ class TestFindAll(TreeTest):
         # recursion.
         self.assertEqual([], soup.find_all(l))
 
+    def test_find_all_resultset(self):
+        """All find_all calls return a ResultSet"""
+        soup = self.soup("<a></a>")
+        result = soup.find_all("a")
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(True)
+        self.assertTrue(hasattr(result, "source"))
+
+        result = soup.find_all(text="foo")
+        self.assertTrue(hasattr(result, "source"))
+
 
 class TestFindAllBasicNamespaces(TreeTest):
 
     def test_find_by_namespaced_name(self):
@@ -188,6 +222,17 @@ class TestFindAllByName(TreeTest):
         self.assertSelects(
             tree.find_all(id_matches_name), ["Match 1.", "Match 2."])
 
+    def test_find_with_multi_valued_attribute(self):
+        soup = self.soup(
+            "<div class='a b'>1</div><div class='a c'>2</div><div class='a d'>3</div>"
+        )
+        r1 = soup.find('div', 'a d');
+        r2 = soup.find('div', re.compile(r'a d'));
+        r3, r4 = soup.find_all('div', ['a b', 'a d']);
+        self.assertEqual('3', r1.string)
+        self.assertEqual('3', r2.string)
+        self.assertEqual('1', r3.string)
+        self.assertEqual('3', r4.string)
 
 class TestFindAllByAttribute(TreeTest):
 
@@ -228,18 +273,24 @@ class TestFindAllByAttribute(TreeTest):
         self.assertSelects(tree.find_all(attrs={'name' : 'name1'}),
                            ["Name match."])
 
-        # Passing class='class2' would cause a syntax error.
         self.assertSelects(tree.find_all(attrs={'class' : 'class2'}),
                            ["Class match."])
 
     def test_find_all_by_class(self):
-        # Passing in a string to 'attrs' will search the CSS class.
         tree = self.soup("""
                          <a class="1">Class 1.</a>
                          <a class="2">Class 2.</a>
                          <b class="1">Class 1.</b>
                          <c class="3 4">Class 3 and 4.</c>
                          """)
+
+        # Passing in the class_ keyword argument will search against
+        # the 'class' attribute.
+        self.assertSelects(tree.find_all('a', class_='1'), ['Class 1.'])
+        self.assertSelects(tree.find_all('c', class_='3'), ['Class 3 and 4.'])
+        self.assertSelects(tree.find_all('c', class_='4'), ['Class 3 and 4.'])
+
+        # Passing in a string to 'attrs' will also search the CSS class.
         self.assertSelects(tree.find_all('a', '1'), ['Class 1.'])
         self.assertSelects(tree.find_all(attrs='1'), ['Class 1.', 'Class 1.'])
         self.assertSelects(tree.find_all('c', '3'), ['Class 3 and 4.'])
@@ -248,18 +299,16 @@ class TestFindAllByAttribute(TreeTest):
 
     def test_find_by_class_when_multiple_classes_present(self):
         tree = self.soup("<gar class='foo bar'>Found it</gar>")
 
-        attrs = { 'class' : re.compile("o") }
-        f = tree.find_all("gar", attrs=attrs)
+        f = tree.find_all("gar", class_=re.compile("o"))
         self.assertSelects(f, ["Found it"])
 
-        f = tree.find_all("gar", re.compile("a"))
+        f = tree.find_all("gar", class_=re.compile("a"))
         self.assertSelects(f, ["Found it"])
 
-        # Since the class is not the string "foo bar", but the two
-        # strings "foo" and "bar", this will not find anything.
-        attrs = { 'class' : re.compile("o b") }
-        f = tree.find_all("gar", attrs=attrs)
-        self.assertSelects(f, [])
+        # If the search fails to match the individual strings "foo" and "bar",
+        # it will be tried against the combined string "foo bar".
+        f = tree.find_all("gar", class_=re.compile("o b"))
+        self.assertSelects(f, ["Found it"])
 
     def test_find_all_with_non_dictionary_for_attrs_finds_by_class(self):
         soup = self.soup("<a class='bar'>Found it</a>")
@@ -283,8 +332,9 @@ class TestFindAllByAttribute(TreeTest):
         self.assertEqual([a, a2], soup.find_all("a", "foo"))
         self.assertEqual([a], soup.find_all("a", "bar"))
 
-        # If you specify the attribute as a string that contains a
+        # If you specify the class as a string that contains a
         # space, only that specific value will be found.
+        self.assertEqual([a], soup.find_all("a", class_="foo bar"))
         self.assertEqual([a], soup.find_all("a", "foo bar"))
         self.assertEqual([], soup.find_all("a", "bar foo"))
 
@@ -296,7 +346,7 @@ class TestFindAllByAttribute(TreeTest):
         strainer = SoupStrainer(attrs={'id' : 'first'})
         self.assertSelects(tree.find_all(strainer), ['Match.'])
 
-    def test_find_all_with_missing_atribute(self):
+    def test_find_all_with_missing_attribute(self):
         # You can pass in None as the value of an attribute to find_all.
         # This will match tags that do not have that attribute set.
         tree = self.soup("""<a id="1">ID present.</a>
                             <a>No ID present.</a>
                             <a id="">ID is empty.</a>""")
@@ -420,6 +470,7 @@ class TestParentOperations(TreeTest):
 
     def test_find_parent(self):
         self.assertEqual(self.start.find_parent('ul')['id'], 'bottom')
+        self.assertEqual(self.start.find_parent('ul', id='top')['id'], 'top')
 
     def test_parent_of_text_element(self):
         text = self.tree.find(text="Start here")
@@ -658,7 +709,7 @@ class TestTagCreation(SoupTest):
 
     def test_tag_inherits_self_closing_rules_from_builder(self):
         if XML_BUILDER_PRESENT:
-            xml_soup = BeautifulSoup("", "xml")
+            xml_soup = BeautifulSoup("", "lxml-xml")
             xml_br = xml_soup.new_tag("br")
             xml_p = xml_soup.new_tag("p")
 
@@ -667,7 +718,7 @@ class TestTagCreation(SoupTest):
             self.assertEqual(b"<br/>", xml_br.encode())
             self.assertEqual(b"<p></p>", xml_p.encode())
 
-        html_soup = BeautifulSoup("", "html")
+        html_soup = BeautifulSoup("", "html.parser")
         html_br = html_soup.new_tag("br")
         html_p = html_soup.new_tag("p")
 
@@ -682,6 +733,12 @@ class TestTagCreation(SoupTest):
         self.assertEqual("foo", s)
         self.assertTrue(isinstance(s, NavigableString))
 
+    def test_new_string_can_create_navigablestring_subclass(self):
+        soup = self.soup("")
+        s = soup.new_string("foo", Comment)
+        self.assertEqual("foo", s)
+        self.assertTrue(isinstance(s, Comment))
+
 class TestTreeModification(SoupTest):
 
     def test_attribute_modification(self):
@@ -737,6 +794,14 @@ class TestTreeModification(SoupTest):
         new_a = a.unwrap()
         self.assertEqual(a, new_a)
 
+    def test_replace_with_and_unwrap_give_useful_exception_when_tag_has_no_parent(self):
+        soup = self.soup("<a><b>Foo</b></a><c>Bar</c>")
+        a = soup.a
+        a.extract()
+        self.assertEqual(None, a.parent)
+        self.assertRaises(ValueError, a.unwrap)
+        self.assertRaises(ValueError, a.replace_with, soup.c)
+
     def test_replace_tag_with_itself(self):
         text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
         soup = self.soup(text)
@@ -881,20 +946,20 @@ class TestTreeModification(SoupTest):
         self.assertEqual(
             soup.decode(), self.document_for("QUUX<b>bar</b>foo<a>BAZ</a>"))
 
-    def test_insert_after_raises_valueerror_if_after_has_no_meaning(self):
+    def test_insert_after_raises_exception_if_after_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_after, tag)
-        self.assertRaises(ValueError, soup.insert_after, tag)
+        self.assertRaises(NotImplementedError, soup.insert_after, tag)
        self.assertRaises(ValueError, tag.insert_after, tag)
 
-    def test_insert_before_raises_valueerror_if_before_has_no_meaning(self):
+    def test_insert_before_raises_notimplementederror_if_before_has_no_meaning(self):
        soup = self.soup("")
        tag = soup.new_tag("a")
        string = soup.new_string("")
        self.assertRaises(ValueError, string.insert_before, tag)
-        self.assertRaises(ValueError, soup.insert_before, tag)
+        self.assertRaises(NotImplementedError, soup.insert_before, tag)
        self.assertRaises(ValueError, tag.insert_before, tag)
 
     def test_replace_with(self):
@@ -1031,6 +1096,31 @@ class TestTreeModification(SoupTest):
         self.assertEqual(foo_2, soup.a.string)
         self.assertEqual(bar_2, soup.b.string)
 
+    def test_extract_multiples_of_same_tag(self):
+        soup = self.soup("""
+<html>
+<head>
+<script>foo</script>
+</head>
+<body>
+ <script>bar</script>
+ <a></a>
+</body>
+<script>baz</script>
+</html>""")
+        [soup.script.extract() for i in soup.find_all("script")]
+        self.assertEqual("<body>\n\n<a></a>\n</body>", unicode(soup.body))
+
+
+    def test_extract_works_when_element_is_surrounded_by_identical_strings(self):
+        soup = self.soup(
+            '<html>\n'
+            '<body>hi</body>\n'
+            '</html>')
+        soup.find('body').extract()
+        self.assertEqual(None, soup.find('body'))
+
+
     def test_clear(self):
         """Tag.clear()"""
         soup = self.soup("<p><a>String <em>Italicized</em></a> and another</p>")
@@ -1043,7 +1133,7 @@ class TestTreeModification(SoupTest):
         # clear using decompose()
         em = a.em
         a.clear(decompose=True)
-        self.assertFalse(hasattr(em, "contents"))
+        self.assertEqual(0, len(em.contents))
 
     def test_string_set(self):
         """Tag.string = 'string'"""
@@ -1161,6 +1251,19 @@ class TestElementObjects(SoupTest):
         self.assertEqual(soup.a.get_text(","), "a,r, , t ")
         self.assertEqual(soup.a.get_text(",", strip=True), "a,r,t")
 
+    def test_get_text_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(soup.get_text(), "foobar")
+
+        self.assertEqual(
+            soup.get_text(types=(NavigableString, Comment)), "fooIGNOREbar")
+        self.assertEqual(
+            soup.get_text(types=None), "fooIGNOREbar")
+
+    def test_all_strings_ignores_comments(self):
+        soup = self.soup("foo<!--IGNORE-->bar")
+        self.assertEqual(['foo', 'bar'], list(soup.strings))
+
 class TestCDAtaListAttributes(SoupTest):
 
     """Testing cdata-list attributes like 'class'.
@@ -1193,6 +1296,12 @@ class TestCDAtaListAttributes(SoupTest):
         # attribute for any other tag.
         self.assertEqual('ISO-8859-1 UTF-8', soup.a['accept-charset'])
 
+    def test_string_has_immutable_name_property(self):
+        string = self.soup("s").string
+        self.assertEqual(None, string.name)
+        def t():
+            string.name = 'foo'
+        self.assertRaises(AttributeError, t)
 
 class TestPersistence(SoupTest):
     "Testing features like pickle and deepcopy."
@@ -1230,6 +1339,13 @@ class TestPersistence(SoupTest):
         copied = copy.deepcopy(self.tree)
         self.assertEqual(copied.decode(), self.tree.decode())
 
+    def test_copy_preserves_encoding(self):
+        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
+        encoding = soup.original_encoding
+        copy = soup.__copy__()
+        self.assertEqual(u"<p> </p>", unicode(copy))
+        self.assertEqual(encoding, copy.original_encoding)
+
     def test_unicode_pickle(self):
         # A tree containing Unicode characters can be pickled.
         html = u"<b>\N{SNOWMAN}</b>"
@@ -1238,6 +1354,51 @@ class TestPersistence(SoupTest):
         loaded = pickle.loads(dumped)
         self.assertEqual(loaded.decode(), soup.decode())
 
+    def test_copy_navigablestring_is_not_attached_to_tree(self):
+        html = u"<b>Foo<a></a></b><b>Bar</b>"
+        soup = self.soup(html)
+        s1 = soup.find(string="Foo")
+        s2 = copy.copy(s1)
+        self.assertEqual(s1, s2)
+        self.assertEqual(None, s2.parent)
+        self.assertEqual(None, s2.next_element)
+        self.assertNotEqual(None, s1.next_sibling)
+        self.assertEqual(None, s2.next_sibling)
+        self.assertEqual(None, s2.previous_element)
+
+    def test_copy_navigablestring_subclass_has_same_type(self):
+        html = u"<b><!--Foo--></b>"
+        soup = self.soup(html)
+        s1 = soup.string
+        s2 = copy.copy(s1)
+        self.assertEqual(s1, s2)
+        self.assertTrue(isinstance(s2, Comment))
+
+    def test_copy_entire_soup(self):
+        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        soup_copy = copy.copy(soup)
+        self.assertEqual(soup, soup_copy)
+
+    def test_copy_tag_copies_contents(self):
+        html = u"<div><b>Foo<a></a></b><b>Bar</b></div>end"
+        soup = self.soup(html)
+        div = soup.div
+        div_copy = copy.copy(div)
+
+        # The two tags look the same, and evaluate to equal.
+        self.assertEqual(unicode(div), unicode(div_copy))
+        self.assertEqual(div, div_copy)
+
+        # But they're not the same object.
+        self.assertFalse(div is div_copy)
+
+        # And they don't have the same relation to the parse tree. The
+        # copy is not associated with a parse tree at all.
+        self.assertEqual(None, div_copy.parent)
+        self.assertEqual(None, div_copy.previous_element)
+        self.assertEqual(None, div_copy.find(string='Bar').next_element)
+        self.assertNotEqual(None, div.find(string='Bar').next_element)
 
 class TestSubstitutions(SoupTest):
 
@@ -1305,8 +1466,34 @@ class TestSubstitutions(SoupTest):
         expect_upper = u'<a href="HTTP://A.COM?A=B&C=É">E</a>'
         self.assertEqual(expect_upper, a.decode(formatter=lambda x: x.upper()))
 
+    def test_formatter_skips_script_tag_for_html_documents(self):
+        doc = """
+  <script type="text/javascript">
+   console.log("< < hey > > ");
+  </script>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_formatter_skips_style_tag_for_html_documents(self):
+        doc = """
+  <style type="text/css">
+   console.log("< < hey > > ");
+  </style>
+"""
+        encoded = BeautifulSoup(doc, 'html.parser').encode()
+        self.assertTrue(b"< < hey > >" in encoded)
+
+    def test_prettify_leaves_preformatted_text_alone(self):
+        soup = self.soup("<div>  foo  <pre>  \tbar\n  \n  </pre>  baz  </div>")
+        # Everything outside the <pre> tag is reformatted, but everything
+        # inside is left alone.
+        self.assertEqual(
+            u'<div>\n foo\n <pre>  \tbar\n  \n  </pre>\n baz\n</div>',
+            soup.div.prettify())
+
     def test_prettify_accepts_formatter(self):
-        soup = BeautifulSoup("<html><body>foo</body></html>")
+        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
         pretty = soup.prettify(formatter = lambda x: x.upper())
         self.assertTrue("FOO" in pretty)
 
@@ -1403,6 +1590,14 @@ class TestEncoding(SoupTest):
         self.assertEqual(
             u"\N{SNOWMAN}".encode("utf8"), soup.b.renderContents())
 
+    def test_repr(self):
+        html = u"<b>\N{SNOWMAN}</b>"
+        soup = self.soup(html)
+        if PY3K:
+            self.assertEqual(html, repr(soup))
+        else:
+            self.assertEqual(b'<b>\\u2603</b>', repr(soup))
+
 class TestNavigableStringSubclasses(SoupTest):
 
     def test_cdata(self):
@@ -1441,6 +1636,9 @@ class TestNavigableStringSubclasses(SoupTest):
         soup.insert(1, doctype)
         self.assertEqual(soup.encode(), b"<!DOCTYPE foo>\n")
 
+    def test_declaration(self):
+        d = Declaration("foo")
+        self.assertEqual("<?foo?>", d.output_ready())
 
 class TestSoupSelector(TreeTest):
 
@@ -1453,8 +1651,8 @@ class TestSoupSelector(TreeTest):
 </head>
 <body>
-
-
+<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
+
 <div id="main" class="fancy">
 <div id="inner">
 <h1 id="header1">An H1</h1>
 <p>Some text</p>
@@ -1471,8 +1669,18 @@ class TestSoupSelector(TreeTest):
 <a href="#" id="s2a1">span2a1</a>
 </span>
 <span class="span3"></span>
+<custom-dashed-tag class="dashed" id="dash2"/>
+<div data-tag="dashedvalue" id="data1"/>
 </span>
 </div>
+<x id="xid">
+<z id="zida"/>
+<z id="zidab"/>
+<z id="zidac"/>
+</x>
+<y id="yid">
+<z id="zidb"/>
+</y>
 <p lang="en" id="lang-en">English</p>
 <p lang="en-gb" id="lang-en-gb">English UK</p>
 <p lang="en-us" id="lang-en-us">English US</p>
@@ -1484,10 +1692,10 @@ class TestSoupSelector(TreeTest):
 """
 
     def setUp(self):
-        self.soup = BeautifulSoup(self.HTML)
+        self.soup = BeautifulSoup(self.HTML, 'html.parser')
 
-    def assertSelects(self, selector, expected_ids):
-        el_ids = [el['id'] for el in self.soup.select(selector)]
+    def assertSelects(self, selector, expected_ids, **kwargs):
+        el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
         el_ids.sort()
         expected_ids.sort()
         self.assertEqual(expected_ids, el_ids,
@@ -1510,23 +1718,52 @@ class TestSoupSelector(TreeTest):
 
     def test_one_tag_many(self):
         els = self.soup.select('div')
-        self.assertEqual(len(els), 3)
+        self.assertEqual(len(els), 4)
         for div in els:
             self.assertEqual(div.name, 'div')
 
+        el = self.soup.select_one('div')
+        self.assertEqual('main', el['id'])
+
+    def test_select_one_returns_none_if_no_match(self):
+        match = self.soup.select_one('nonexistenttag')
+        self.assertEqual(None, match)
+
+
     def test_tag_in_tag_one(self):
         els = self.soup.select('div div')
-        self.assertSelects('div div', ['inner'])
+        self.assertSelects('div div', ['inner', 'data1'])
 
     def test_tag_in_tag_many(self):
         for selector in ('html div', 'html body div', 'body div'):
-            self.assertSelects(selector, ['main', 'inner', 'footer'])
+            self.assertSelects(selector, ['data1', 'main', 'inner', 'footer'])
+
+
+    def test_limit(self):
+        self.assertSelects('html div', ['main'], limit=1)
+        self.assertSelects('html body div', ['inner', 'main'], limit=2)
+        self.assertSelects('body div', ['data1', 'main', 'inner', 'footer'],
+                           limit=10)
 
     def test_tag_no_match(self):
         self.assertEqual(len(self.soup.select('del')), 0)
 
     def test_invalid_tag(self):
-        self.assertEqual(len(self.soup.select('tag%t')), 0)
+        self.assertRaises(ValueError, self.soup.select, 'tag%t')
+
+    def test_select_dashed_tag_ids(self):
+        self.assertSelects('custom-dashed-tag', ['dash1', 'dash2'])
+
+    def test_select_dashed_by_id(self):
+        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
+        self.assertEqual(dashed[0].name, 'custom-dashed-tag')
+        self.assertEqual(dashed[0]['id'], 'dash2')
+
+    def test_dashed_tag_text(self):
+        self.assertEqual(self.soup.select('body > custom-dashed-tag')[0].text, u'Hello there.')
+
+    def test_select_dashed_matches_find_all(self):
+        self.assertEqual(self.soup.select('custom-dashed-tag'), self.soup.find_all('custom-dashed-tag'))
 
     def test_header_tags(self):
         self.assertSelectMultiple(
@@ -1559,7 +1796,7 @@ class TestSoupSelector(TreeTest):
         for el in els:
             self.assertEqual(el.name, 'p')
         self.assertEqual(els[1]['class'], ['onep'])
-        self.assertFalse(els[0].has_key('class'))
+        self.assertFalse(els[0].has_attr('class'))
 
     def test_a_bunch_of_emptys(self):
         for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
@@ -1579,6 +1816,9 @@ class TestSoupSelector(TreeTest):
         self.assertSelects('.s1 > a', ['s1a1', 's1a2'])
         self.assertSelects('.s1 > a span', ['s1a2s1'])
 
+    def test_child_selector_id(self):
+        self.assertSelects('.s1 > a#s1a2 span', ['s1a2s1'])
+
     def test_attribute_equals(self):
         self.assertSelectMultiple(
             ('p[class="onep"]', ['p1']),
@@ -1625,6 +1865,7 @@ class TestSoupSelector(TreeTest):
             ('[id^="m"]', ['me', 'main']),
             ('div[id^="m"]', ['main']),
             ('a[id^="m"]', ['me']),
+            ('div[data-tag^="dashed"]', ['data1'])
         )
 
     def test_attribute_endswith(self):
@@ -1632,8 +1873,8 @@ class TestSoupSelector(TreeTest):
             ('[href$=".css"]', ['l1']),
             ('link[href$=".css"]', ['l1']),
             ('link[id$="1"]', ['l1']),
-            ('[id$="1"]', ['l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1']),
-            ('div[id$="1"]', []),
+            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id$="1"]', ['data1']),
             ('[id$="noending"]', []),
         )
 
@@ -1646,7 +1887,6 @@ class TestSoupSelector(TreeTest):
             ('[rel*="notstyle"]', []),
             ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
-            ('a[href*="http://"]', ['bob', 'me']),
             ('[href*="http://"]', ['bob', 'me']),
             ('[id*="p"]', ['pmulti', 'p1']),
             ('div[id*="m"]', ['main']),
@@ -1655,8 +1895,8 @@ class TestSoupSelector(TreeTest):
             ('[href*=".css"]', ['l1']),
             ('link[href*=".css"]', ['l1']),
             ('link[id*="1"]', ['l1']),
-            ('[id*="1"]', ['l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1']),
-            ('div[id*="1"]', []),
+            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
+            ('div[id*="1"]', ['data1']),
             ('[id*="noending"]', []),
             # New for this test
             ('[href*="."]', ['bob', 'me', 'l1']),
@@ -1664,6 +1904,7 @@ class TestSoupSelector(TreeTest):
             ('link[href*="."]', ['l1']),
             ('div[id*="n"]', ['main', 'inner']),
             ('div[id*="nn"]', ['inner']),
+            ('div[data-tag*="edval"]', ['data1'])
         )
 
     def test_attribute_exact_or_hypen(self):
@@ -1683,8 +1924,52 @@ class TestSoupSelector(TreeTest):
             ('p[class]', ['p1', 'pmulti']),
             ('[blah]', []),
             ('p[blah]', []),
+            ('div[data-tag]', ['data1'])
         )
 
+    def test_quoted_space_in_selector_name(self):
+        html = """<div style="display: wrong">nope</div>
+<div style="display: right">yes</div>
+"""
+        soup = BeautifulSoup(html, 'html.parser')
+        [chosen] = soup.select('div[style="display: right"]')
+        self.assertEqual("yes", chosen.string)
+
+    def test_unsupported_pseudoclass(self):
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:no-such-pseudoclass")
+
+        self.assertRaises(
+            NotImplementedError, self.soup.select, "a:nth-of-type(a)")
+
+
+    def test_nth_of_type(self):
+        # Try to select first paragraph
+        els = self.soup.select('div#inner p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+        # Try to select third paragraph
+        els = self.soup.select('div#inner p:nth-of-type(3)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Another')
+
+        # Try to select (non-existent!) fourth paragraph
+        els = self.soup.select('div#inner p:nth-of-type(4)')
+        self.assertEqual(len(els), 0)
+
+        # Pass in an invalid value.
+        self.assertRaises(
+            ValueError, self.soup.select, 'div p:nth-of-type(0)')
+
+    def test_nth_of_type_direct_descendant(self):
+        els = self.soup.select('div#inner > p:nth-of-type(1)')
+        self.assertEqual(len(els), 1)
+        self.assertEqual(els[0].string, u'Some text')
+
+    def test_id_child_selector_nth_of_type(self):
+        self.assertSelects('#inner > p:nth-of-type(2)', ['p1'])
+
     def test_select_on_element(self):
         # Other tests operate on the tree; this operates on an element
         # within the tree.
@@ -1692,4 +1977,68 @@ class TestSoupSelector(TreeTest):
         selected = inner.select("div")
         # The <div> tag was selected. The