mylar/lib/bs4/dammit.py

# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
__license__ = "MIT"

import codecs
from htmlentitydefs import codepoint2name
import re
import logging
import string

# Import a library to autodetect character encodings.
#
# Exactly one definition of chardet_dammit(s) survives this block:
# the cchardet-backed one, else the chardet-backed one, else a stub
# that always answers None.
# NOTE(review): chardet_type is assigned here but is not referenced
# anywhere else in the visible part of this module.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        """Return the 'encoding' entry of cchardet's detection result for `s`."""
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            """Return the 'encoding' entry of chardet's detection result for `s`."""
            return chardet.detect(s)['encoding']
        # Debugging aid, intentionally left disabled:
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stub used when no detection library is installed; always None."""
            return None

# Available from http://cjkpython.i18n.org/.
try:
    import iconv_codec
except ImportError:
    pass

# Bytes patterns, matched case-insensitively.
#
# xml_encoding_re: an XML declaration anchored at the start of the
# data; group 1 is the quoted value of its encoding attribute.
xml_encoding_re = re.compile(
    r'''^<\?.*encoding=['"](.*?)['"].*\?>'''.encode(), re.I)
# html_meta_re: a <meta ...> tag containing charset=...; group 1 is the
# charset value, ending at the first space, slash, semicolon, quote or
# closing angle bracket.
html_meta_re = re.compile(
    r'''<\s*meta[^>]+charset\s*=\s*["']?([^>]*?)[ /;'">]'''.encode(), re.I)

class EntitySubstitution(object):

    """Replace characters with the matching XML or HTML entities."""

    def _populate_class_variables():
        """Build the HTML entity tables from ``codepoint2name``.

        Returns a 3-tuple: a character -> entity-name dict, an
        entity-name -> character dict, and a compiled regex matching
        any single character that is a key of the first dict.
        """
        char_to_name = {}
        name_to_char = {}
        matchable = []
        for codepoint, name in codepoint2name.items():
            character = unichr(codepoint)
            # Every entity name maps back to its character, including
            # &quot; -> the quotation mark.
            name_to_char[name] = character
            if codepoint == 34:
                # The quotation mark itself is never turned into
                # &quot; here; that only matters inside an attribute
                # value, which is handled elsewhere.
                continue
            matchable.append(character)
            char_to_name[character] = name
        pattern = re.compile("[%s]" % "".join(matchable))
        return char_to_name, name_to_char, pattern
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined XML entities.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # An angle bracket, or an ampersand that does not begin something
    # shaped like a numeric, hexadecimal or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile(
        r"([<>]|&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;))")

    # Any angle bracket or ampersand at all.
    AMPERSAND_OR_BRACKET = re.compile(r"([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: return the named HTML entity for the
        matched character."""
        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: return the XML entity for the matched XML
        special character."""
        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

    @classmethod
    def quoted_attribute_value(cls, value):
        """Wrap a value in quotes so it can serve as an XML attribute.

        Double quotes are used unless the value contains one:

          Bob's Bar -> "Bob's Bar"

        A value with double quotes but no single quotes is wrapped in
        single quotes:

          Welcome to "my bar" -> 'Welcome to "my bar"'

        A value with both kinds has its double quotes replaced by
        &quot; and is then wrapped in double quotes:

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
        """
        if '"' not in value:
            return '"' + value + '"'
        if "'" not in value:
            # Only double quotes are present, so single quotes can
            # delimit the value without any escaping.
            return "'" + value + "'"
        # Both kinds are present. The double quotes are the ones
        # escaped because their entity is "&quot;" in HTML and XML
        # alike; escaping single quotes would mean choosing between
        # &apos; and &squot;.
        return '"' + value.replace('"', "&quot;") + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Escape the XML special characters <, > and & in a string.

        :param value: The string to escape. Every ampersand becomes
          &amp;, even one that looks like the start of an entity; use
          substitute_xml_containing_entities() to leave those alone.

        :param make_quoted_attribute: If True, the escaped string is
          also quoted via quoted_attribute_value().
        """
        escaped = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Escape <, > and bare ampersands, keeping existing entities.

        :param value: The string to escape. Angle brackets become
          &lt; and &gt;; an ampersand becomes &amp; only when it is
          not part of an entity definition.

        :param make_quoted_attribute: If True, the escaped string is
          also quoted via quoted_attribute_value().
        """
        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)
        if not make_quoted_attribute:
            return escaped
        return cls.quoted_attribute_value(escaped)

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        Unlike data.encode(encoding, 'xmlcharrefreplace'), the aim is
        readability on ASCII displays rather than error recovery: a
        LATIN SMALL LETTER E WITH ACUTE is perfectly valid in a UTF-8
        string, but "&eacute;" is easier for some people to read.
        """
        pattern = cls.CHARACTER_TO_HTML_ENTITY_RE
        return pattern.sub(cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    3. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    4. UTF-8.

    5. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Store the configuration and strip any byte-order mark.

        :param markup: The document to examine.
        :param override_encodings: Encodings to propose before any
         other; None is treated as an empty list.
        :param is_html: If True, a <meta> charset declaration is also
         looked for when searching for a declared encoding.
        :param exclude_encodings: Encodings that must never be
         proposed (compared case-insensitively); None is treated as
         an empty list.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        # Stored lower-cased because _usable() lower-cases candidates
        # before comparing.
        self.exclude_encodings = set(x.lower() for x in exclude_encodings)
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Decide whether `encoding` should be proposed.

        Returns True only for a non-None encoding that is neither
        excluded nor already present in `tried`; in that case its
        lower-cased name is added to `tried` as a side effect.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup."""
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: The document, as a bytestring or Unicode string.
        :return: A 2-tuple (data without the mark, encoding name or
         None). Unicode input is returned unchanged with None.
        """
        encoding = None
        if isinstance(data, unicode):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 tests must reject a mark followed by two NUL
        # bytes: FF FE 00 00 is the UTF-32LE mark, handled further
        # down. The NULs are written as a bytes literal, like every
        # other literal here, so the comparison is bytes-to-bytes.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: The document to search.
        :param is_html: If True and no XML declaration is found, a
         <meta> charset declaration is searched for as well.
        :param search_entire_document: If True the whole document is
         searched; otherwise only a prefix of it.
        :return: The declared encoding, lower-cased, or None.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            # Only look at the first 1024 bytes for an XML
            # declaration, and at the first 2048 bytes or 5% of the
            # document (whichever is larger) for a <meta> tag.
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        declared_encoding = None
        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            # Bytes that are not ASCII are replaced rather than
            # raising.
            declared_encoding = declared_encoding_match.groups()[0].decode(
                'ascii', 'replace')
        if declared_encoding:
            return declared_encoding.lower()
        return None

class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents.

    After construction, `unicode_markup` holds the converted document
    (or None if no encoding worked) and `original_encoding` the codec
    that produced it (or None).
    """

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Codecs for which bytes \x80-\x9f are rewritten before decoding
    # when smart_quotes_to is set.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]

    def __init__(self, markup, override_encodings=[],
                 smart_quotes_to=None, is_html=False, exclude_encodings=[]):
        """Convert `markup` to Unicode, trying candidate encodings in turn.

        :param markup: The document, as a bytestring or Unicode string.
        :param override_encodings: Encodings to try before any other.
        :param smart_quotes_to: None to leave bytes \\x80-\\x9f alone,
         or 'ascii', 'xml' or any other value (HTML entities) to
         choose how they are replaced; see _sub_ms_char().
        :param is_html: Passed on to the EncodingDetector.
        :param exclude_encodings: Encodings that must never be tried.

        The two list defaults are only handed to EncodingDetector and
        are never mutated here.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, unicode) or markup == '':
            self.markup = markup
            self.unicode_markup = unicode(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Note that this is a truthiness test: an empty conversion
        # result is treated like a failed one.
        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        `match` is a regex match whose group 1 is the single byte to
        replace; the replacement is returned as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # (HTML entity name, hexadecimal code point)
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                # Bytes with no entity map to a plain replacement
                # string.
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Try to decode self.markup using the `proposed` encoding.

        Each (codec, errors) pair is attempted at most once per
        instance. On success self.markup is replaced by the decoded
        text, self.original_encoding is set to the codec name, and
        the decoded text is returned. Returns None if the codec is
        unusable, was already tried, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately broad: any failure just means this
            # candidate encoding did not work.
            return None
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        '''Given a string and its encoding, decodes the string into Unicode.
        %encoding is a string recognized by encodings.aliases'''
        return unicode(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """The detector's declared encoding, or None for non-HTML input."""
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Map a charset name to a lower-cased codec name.

        Tries, in order: the CHARSET_ALIASES entry (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores; the first one Python can look
        up wins. If none can be looked up, the lower-cased name is
        returned anyway. Returns None for an empty or None charset.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if codecs.lookup() accepts it, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point);
    # string values are used as-is for either output style.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178; an empty code point here would make the XML
                # output the malformed reference "&#x;".
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        # A plain string like every other value, so that
        # _sub_ms_char() can call .encode() on it.
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80 : b'\xe2\x82\xac', # €
        0x82 : b'\xe2\x80\x9a', # ‚
        0x83 : b'\xc6\x92',     # ƒ
        0x84 : b'\xe2\x80\x9e', # „
        0x85 : b'\xe2\x80\xa6', # …
        0x86 : b'\xe2\x80\xa0', # †
        0x87 : b'\xe2\x80\xa1', # ‡
        0x88 : b'\xcb\x86',     # ˆ
        0x89 : b'\xe2\x80\xb0', # ‰
        0x8a : b'\xc5\xa0',     # Š
        0x8b : b'\xe2\x80\xb9', # ‹
        0x8c : b'\xc5\x92',     # Œ
        0x8e : b'\xc5\xbd',     # Ž
        0x91 : b'\xe2\x80\x98', # ‘
        0x92 : b'\xe2\x80\x99', # ’
        0x93 : b'\xe2\x80\x9c', # “
        0x94 : b'\xe2\x80\x9d', # ”
        0x95 : b'\xe2\x80\xa2', # •
        0x96 : b'\xe2\x80\x93', # –
        0x97 : b'\xe2\x80\x94', # —
        0x98 : b'\xcb\x9c',     # ˜
        0x99 : b'\xe2\x84\xa2', # ™
        0x9a : b'\xc5\xa1',     # š
        0x9b : b'\xe2\x80\xba', # ›
        0x9c : b'\xc5\x93',     # œ
        0x9e : b'\xc5\xbe',     # ž
        0x9f : b'\xc5\xb8',     # Ÿ
        0xa0 : b'\xc2\xa0',     # (no-break space)
        0xa1 : b'\xc2\xa1',     # ¡
        0xa2 : b'\xc2\xa2',     # ¢
        0xa3 : b'\xc2\xa3',     # £
        0xa4 : b'\xc2\xa4',     # ¤
        0xa5 : b'\xc2\xa5',     # ¥
        0xa6 : b'\xc2\xa6',     # ¦
        0xa7 : b'\xc2\xa7',     # §
        0xa8 : b'\xc2\xa8',     # ¨
        0xa9 : b'\xc2\xa9',     # ©
        0xaa : b'\xc2\xaa',     # ª
        0xab : b'\xc2\xab',     # «
        0xac : b'\xc2\xac',     # ¬
        0xad : b'\xc2\xad',     # (soft hyphen)
        0xae : b'\xc2\xae',     # ®
        0xaf : b'\xc2\xaf',     # ¯
        0xb0 : b'\xc2\xb0',     # °
        0xb1 : b'\xc2\xb1',     # ±
        0xb2 : b'\xc2\xb2',     # ²
        0xb3 : b'\xc2\xb3',     # ³
        0xb4 : b'\xc2\xb4',     # ´
        0xb5 : b'\xc2\xb5',     # µ
        0xb6 : b'\xc2\xb6',     # ¶
        0xb7 : b'\xc2\xb7',     # ·
        0xb8 : b'\xc2\xb8',     # ¸
        0xb9 : b'\xc2\xb9',     # ¹
        0xba : b'\xc2\xba',     # º
        0xbb : b'\xc2\xbb',     # »
        0xbc : b'\xc2\xbc',     # ¼
        0xbd : b'\xc2\xbd',     # ½
        0xbe : b'\xc2\xbe',     # ¾
        0xbf : b'\xc2\xbf',     # ¿
        0xc0 : b'\xc3\x80',     # À
        0xc1 : b'\xc3\x81',     # Á
        0xc2 : b'\xc3\x82',     # Â
        0xc3 : b'\xc3\x83',     # Ã
        0xc4 : b'\xc3\x84',     # Ä
        0xc5 : b'\xc3\x85',     # Å
        0xc6 : b'\xc3\x86',     # Æ
        0xc7 : b'\xc3\x87',     # Ç
        0xc8 : b'\xc3\x88',     # È
        0xc9 : b'\xc3\x89',     # É
        0xca : b'\xc3\x8a',     # Ê
        0xcb : b'\xc3\x8b',     # Ë
        0xcc : b'\xc3\x8c',     # Ì
        0xcd : b'\xc3\x8d',     # Í
        0xce : b'\xc3\x8e',     # Î
        0xcf : b'\xc3\x8f',     # Ï
        0xd0 : b'\xc3\x90',     # Ð
        0xd1 : b'\xc3\x91',     # Ñ
        0xd2 : b'\xc3\x92',     # Ò
        0xd3 : b'\xc3\x93',     # Ó
        0xd4 : b'\xc3\x94',     # Ô
        0xd5 : b'\xc3\x95',     # Õ
        0xd6 : b'\xc3\x96',     # Ö
        0xd7 : b'\xc3\x97',     # ×
        0xd8 : b'\xc3\x98',     # Ø
        0xd9 : b'\xc3\x99',     # Ù
        0xda : b'\xc3\x9a',     # Ú
        0xdb : b'\xc3\x9b',     # Û
        0xdc : b'\xc3\x9c',     # Ü
        0xdd : b'\xc3\x9d',     # Ý
        0xde : b'\xc3\x9e',     # Þ
        0xdf : b'\xc3\x9f',     # ß
        0xe0 : b'\xc3\xa0',     # à
        0xe1 : b'\xc3\xa1',     # á
        0xe2 : b'\xc3\xa2',     # â
        0xe3 : b'\xc3\xa3',     # ã
        0xe4 : b'\xc3\xa4',     # ä
        0xe5 : b'\xc3\xa5',     # å
        0xe6 : b'\xc3\xa6',     # æ
        0xe7 : b'\xc3\xa7',     # ç
        0xe8 : b'\xc3\xa8',     # è
        0xe9 : b'\xc3\xa9',     # é
        0xea : b'\xc3\xaa',     # ê
        0xeb : b'\xc3\xab',     # ë
        0xec : b'\xc3\xac',     # ì
        0xed : b'\xc3\xad',     # í
        0xee : b'\xc3\xae',     # î
        0xef : b'\xc3\xaf',     # ï
        0xf0 : b'\xc3\xb0',     # ð
        0xf1 : b'\xc3\xb1',     # ñ
        0xf2 : b'\xc3\xb2',     # ò
        0xf3 : b'\xc3\xb3',     # ó
        0xf4 : b'\xc3\xb4',     # ô
        0xf5 : b'\xc3\xb5',     # õ
        0xf6 : b'\xc3\xb6',     # ö
        0xf7 : b'\xc3\xb7',     # ÷
        0xf8 : b'\xc3\xb8',     # ø
        0xf9 : b'\xc3\xb9',     # ù
        0xfa : b'\xc3\xba',     # ú
        0xfb : b'\xc3\xbb',     # û
        0xfc : b'\xc3\xbc',     # ü
        0xfd : b'\xc3\xbd',     # ý
        0xfe : b'\xc3\xbe',     # þ
        }

    # (first lead byte, last lead byte, total sequence length)
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        The input must be a bytestring. If you've already converted
        the document to Unicode, you're too late.

        The output is a bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents.

        :raises NotImplementedError: if `embedded_encoding` is not
         "windows-1252" (case-insensitive, "_" accepted for "-") or
         `main_encoding` is not "utf8"/"utf-8" (case-insensitive).
        """
        # Underscores are normalised to hyphens first, so a single
        # spelling needs to be checked.
        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								# -*- coding: utf-8 -*-
 								"""Beautiful Soup bonus library: Unicode, Dammit
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								This library converts a bytestream to Unicode through any means
 								necessary. It is heavily based on code from Mark Pilgrim's Universal
 								Feed Parser. It works best on XML and HTML, but it does not rewrite the
 								XML or HTML to reflect a new encoding; that's the tree builder's job.
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								"""
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								# Use of this source code is governed by a BSD-style license that can be
 								# found in the LICENSE file.
 								__license__ = "MIT"
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
 								import codecs
 								from htmlentitydefs import codepoint2name
 								import re
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								import logging
 								import string
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								# Import a library to autodetect character encodings.
 								chardet_type = None
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								try:
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								    # First try the fast C implementation.
 								    #  PyPI package: cchardet
 								    import cchardet
 								    def chardet_dammit(s):
 								        return cchardet.detect(s)['encoding']
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								except ImportError:
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								    try:
 								        # Fall back to the pure Python implementation
 								        #  Debian package: python-chardet
 								        #  PyPI package: chardet
 								        import chardet
 								        def chardet_dammit(s):
 								            return chardet.detect(s)['encoding']
 								        #import chardet.constants
 								        #chardet.constants._debug = 1
 								    except ImportError:
 								        # No chardet available.
 								        def chardet_dammit(s):
 								            return None
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
 								# Available from http://cjkpython.i18n.org/.
 								try:
 								    import iconv_codec
 								except ImportError:
 								    pass
 								# Captures the encoding named in an XML declaration at the very start
 								# of a bytestring. Raw string literals keep the regex escapes (\?, \s)
 								# from being read as (invalid) string escapes.
 								xml_encoding_re = re.compile(
 								    r'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
 								# Captures the charset value declared inside an HTML <meta> tag.
 								html_meta_re = re.compile(
 								    r'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
 								class EntitySubstitution(object):
 								    """Replace characters with the matching XML or HTML entities."""

 								    def _populate_class_variables():
 								        """Build the HTML entity tables from codepoint2name.

 								        Returns a 3-tuple: a dict mapping characters to entity names,
 								        a dict mapping entity names back to characters, and a compiled
 								        regex matching any single character in the first dict.
 								        """
 								        char_to_name = {}
 								        name_to_char = {}
 								        substitutable = []
 								        for codepoint, name in codepoint2name.items():
 								            char = unichr(codepoint)
 								            # Every entity, &quot; included, can be turned back into
 								            # its character.
 								            name_to_char[name] = char
 								            if codepoint == 34:
 								                # The quotation mark only needs escaping inside an
 								                # attribute value, and that case is handled elsewhere.
 								                continue
 								            substitutable.append(char)
 								            char_to_name[char] = name
 								        pattern = re.compile("[%s]" % "".join(substitutable))
 								        return char_to_name, name_to_char, pattern

 								    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
 								     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

 								    # The five characters that have predefined XML entities.
 								    CHARACTER_TO_XML_ENTITY = {
 								        "'": "apos",
 								        '"': "quot",
 								        "&": "amp",
 								        "<": "lt",
 								        ">": "gt",
 								        }

 								    # An angle bracket, or an ampersand that does not begin something
 								    # shaped like a numeric or named entity.
 								    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
 								                                           "&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
 								                                           ")")

 								    # Any angle bracket or ampersand at all.
 								    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

 								    @classmethod
 								    def _substitute_html_entity(cls, matchobj):
 								        """Regex callback: swap the matched character for its named
 								        HTML entity."""
 								        return "&%s;" % cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))

 								    @classmethod
 								    def _substitute_xml_entity(cls, matchobj):
 								        """Regex callback: swap the matched XML special character for
 								        its XML entity."""
 								        return "&%s;" % cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]

 								    @classmethod
 								    def quoted_attribute_value(self, value):
 								        """Wrap a value in quotes so it can serve as an XML attribute.

 								         A value without double quotes is wrapped in double quotes:

 								          Bob's Bar -> "Bob's Bar"

 								         A value with double quotes but no single quotes is wrapped in
 								         single quotes:

 								          Welcome to "my bar" -> 'Welcome to "my bar"'

 								         A value with both kinds of quote has its double quotes
 								         escaped and is wrapped in double quotes:

 								          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"
 								        """
 								        if '"' not in value:
 								            return '"' + value + '"'
 								        if "'" not in value:
 								            # Only double quotes are present, so single quotes can
 								            # delimit the value untouched.
 								            return "'" + value + "'"
 								        # Both kinds are present. The double quotes are the ones
 								        # escaped because "&quot;" is the entity name in HTML and XML
 								        # alike, whereas escaping single quotes would force a choice
 								        # between &apos; and &squot;.
 								        return '"' + value.replace('"', "&quot;") + '"'

 								    @classmethod
 								    def substitute_xml(cls, value, make_quoted_attribute=False):
 								        """Substitute XML entities for special XML characters.

 								        :param value: A string to be substituted. The less-than sign
 								          will become &lt;, the greater-than sign will become &gt;,
 								          and every ampersand will become &amp;. To leave alone
 								          ampersands that look like the start of an entity, use
 								          substitute_xml_containing_entities() instead.

 								        :param make_quoted_attribute: If True, then the string will be
 								         quoted, as befits an attribute value.
 								        """
 								        escaped = cls.AMPERSAND_OR_BRACKET.sub(
 								            cls._substitute_xml_entity, value)
 								        if not make_quoted_attribute:
 								            return escaped
 								        return cls.quoted_attribute_value(escaped)

 								    @classmethod
 								    def substitute_xml_containing_entities(
 								        cls, value, make_quoted_attribute=False):
 								        """Substitute XML entities for special XML characters, keeping
 								        entities that are already present.

 								        :param value: A string to be substituted. The less-than sign will
 								          become &lt;, the greater-than sign will become &gt;, and any
 								          ampersands that are not part of an entity definition will
 								          become &amp;.

 								        :param make_quoted_attribute: If True, then the string will be
 								         quoted, as befits an attribute value.
 								        """
 								        escaped = cls.BARE_AMPERSAND_OR_BRACKET.sub(
 								            cls._substitute_xml_entity, value)
 								        if not make_quoted_attribute:
 								            return escaped
 								        return cls.quoted_attribute_value(escaped)

 								    @classmethod
 								    def substitute_html(cls, s):
 								        """Replace certain Unicode characters with named HTML entities.

 								        Unlike data.encode(encoding, 'xmlcharrefreplace'), the aim is
 								        readability on ASCII displays rather than error recovery.
 								        A UTF-8 string containing a LATIN SMALL LETTER E WITH ACUTE is
 								        perfectly valid, but "&eacute;" is easier for some people to
 								        read.
 								        """
 								        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
 								            cls._substitute_html_entity, s)
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								class EncodingDetector:
 								    """Suggests a number of possible encodings for a bytestring.
 								    Order of precedence:
 . Encodings you specifically tell EncodingDetector to try first
 								    (the override_encodings argument to the constructor).
 . An encoding declared within the bytestring itself, either in an
 								    XML declaration (if the bytestring is to be interpreted as an XML
 								    document), or in a <meta> tag (if the bytestring is to be
 								    interpreted as an HTML document.)
 . An encoding detected through textual analysis by chardet,
 								    cchardet, or a similar external library.
 . UTF-8.
 . Windows-1252.
 								    """
 								    def __init__(self, markup, override_encodings=None, is_html=False,
 								                 exclude_encodings=None):
 								        self.override_encodings = override_encodings or []
 								        exclude_encodings = exclude_encodings or []
 								        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
 								        self.chardet_encoding = None
 								        self.is_html = is_html
 								        self.declared_encoding = None
 								        # First order of business: strip a byte-order mark.
 								        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
 								    def _usable(self, encoding, tried):
 								        if encoding is not None:
 								            encoding = encoding.lower()
 								            if encoding in self.exclude_encodings:
 								                return False
 								            if encoding not in tried:
 								                tried.add(encoding)
 								                return True
 								        return False
 								    @property
 								    def encodings(self):
 								        """Yield a number of encodings that might work for this markup."""
 								        tried = set()
 								        for e in self.override_encodings:
 								            if self._usable(e, tried):
 								                yield e
 								        # Did the document originally start with a byte-order mark
 								        # that indicated its encoding?
 								        if self._usable(self.sniffed_encoding, tried):
 								            yield self.sniffed_encoding
 								        # Look within the document for an XML or HTML encoding
 								        # declaration.
 								        if self.declared_encoding is None:
 								            self.declared_encoding = self.find_declared_encoding(
 								                self.markup, self.is_html)
 								        if self._usable(self.declared_encoding, tried):
 								            yield self.declared_encoding
 								        # Use third-party character set detection to guess at the
 								        # encoding.
 								        if self.chardet_encoding is None:
 								            self.chardet_encoding = chardet_dammit(self.markup)
 								        if self._usable(self.chardet_encoding, tried):
 								            yield self.chardet_encoding
 								        # As a last-ditch effort, try utf-8 and windows-1252.
 								        for e in ('utf-8', 'windows-1252'):
 								            if self._usable(e, tried):
 								                yield e
 								    @classmethod
 								    def strip_byte_order_mark(cls, data):
 								        """If a byte-order mark is present, strip it and return the encoding it implies."""
 								        encoding = None
 								        if isinstance(data, unicode):
 								            # Unicode data cannot have a byte-order mark.
 								            return data, encoding
 								        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
 								               and (data[2:4] != '\x00\x00'):
 								            encoding = 'utf-16be'
 								            data = data[2:]
 								        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
 								                 and (data[2:4] != '\x00\x00'):
 								            encoding = 'utf-16le'
 								            data = data[2:]
 								        elif data[:3] == b'\xef\xbb\xbf':
 								            encoding = 'utf-8'
 								            data = data[3:]
 								        elif data[:4] == b'\x00\x00\xfe\xff':
 								            encoding = 'utf-32be'
 								            data = data[4:]
 								        elif data[:4] == b'\xff\xfe\x00\x00':
 								            encoding = 'utf-32le'
 								            data = data[4:]
 								        return data, encoding
 								    @classmethod
 								    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
 								        """Given a document, tries to find its declared encoding.
 								        An XML encoding is declared at the beginning of the document.
 								        An HTML encoding is declared in a <meta> tag, hopefully near the
 								        beginning of the document.
 								        """
 								        if search_entire_document:
 								            xml_endpos = html_endpos = len(markup)
 								        else:
 								            xml_endpos = 1024
 								            html_endpos = max(2048, int(len(markup) * 0.05))
-												FIX: Fix for problems when using wwt (returning no search results would throw error, beautifulsoup could not be used if html5lib was up-to-date on the host system

											
										
										
											2018-05-01 14:42:00 +00:00
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								        declared_encoding = None
 								        declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
 								        if not declared_encoding_match and is_html:
 								            declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
 								        if declared_encoding_match is not None:
 								            declared_encoding = declared_encoding_match.groups()[0].decode(
 								                'ascii', 'replace')
 								        if declared_encoding:
 								            return declared_encoding.lower()
 								        return None
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								class UnicodeDammit:
 								    """A class for detecting the encoding of a *ML document and
 								    converting it to a Unicode string. If the source encoding is
 								    windows-1252, can replace MS smart quotes with their HTML or XML
 								    equivalents."""
 								    # This dictionary maps commonly seen values for "charset" in HTML
 								    # meta tags to the corresponding Python codec names. It only covers
 								    # values that aren't in Python's aliases and can't be determined
 								    # by the heuristics in find_codec.
 								    CHARSET_ALIASES = {"macintosh": "mac-roman",
 								                       "x-sjis": "shift-jis"}
 								    # Codec names for which _convert_from, when smart_quotes_to is set,
 								    # rewrites bytes in the 0x80-0x9f range before decoding.
 								    ENCODINGS_WITH_SMART_QUOTES = [
 								        "windows-1252",
 								        "iso-8859-1",
 								        "iso-8859-2",
 								        ]
 								    def __init__(self, markup, override_encodings=None,
 								                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
 								        """Detect the encoding of `markup` and decode it to Unicode.

 								        :param markup: The document, as a bytestring or Unicode string.
 								        :param override_encodings: Encodings to try before any other.
 								         None (the default) means no overrides.
 								        :param smart_quotes_to: How to rewrite MS smart-quote bytes for
 								         encodings that may contain them: 'ascii' for ASCII
 								         approximations, 'xml' for numeric character references, any
 								         other non-None value for named HTML entities. None leaves
 								         them alone.
 								        :param is_html: If True, the markup is treated as HTML when
 								         looking for a declared encoding.
 								        :param exclude_encodings: Encodings that must never be tried.
 								         None (the default) means nothing is excluded.

 								        On return, unicode_markup holds the decoded document (None if
 								        nothing worked), original_encoding the codec that produced it,
 								        and contains_replacement_characters whether decoding had to
 								        fall back to character replacement.
 								        """
 								        self.smart_quotes_to = smart_quotes_to
 								        self.tried_encodings = []
 								        self.contains_replacement_characters = False
 								        self.is_html = is_html
 								        self.log = logging.getLogger(__name__)
 								        # EncodingDetector treats None the same as an empty list for
 								        # both encoding arguments, so the None defaults pass straight
 								        # through.
 								        self.detector = EncodingDetector(
 								            markup, override_encodings, is_html, exclude_encodings)

 								        # Short-circuit if the data is in Unicode to begin with.
 								        if isinstance(markup, unicode) or markup == '':
 								            self.markup = markup
 								            self.unicode_markup = unicode(markup)
 								            self.original_encoding = None
 								            return

 								        # The encoding detector may have stripped a byte-order mark.
 								        # Use the stripped markup from this point on.
 								        self.markup = self.detector.markup

 								        u = None
 								        for encoding in self.detector.encodings:
 								            u = self._convert_from(encoding)
 								            if u is not None:
 								                break

 								        if not u:
 								            # None of the encodings worked. As an absolute last resort,
 								            # try them again with character replacement.
 								            for encoding in self.detector.encodings:
 								                if encoding != "ascii":
 								                    u = self._convert_from(encoding, "replace")
 								                if u is not None:
 								                    self.log.warning(
 								                            "Some characters could not be decoded, and were "
 								                            "replaced with REPLACEMENT CHARACTER."
 								                    )
 								                    self.contains_replacement_characters = True
 								                    break

 								        # If none of that worked, we could at this point force it to
 								        # ASCII, but that would destroy so much data that I think
 								        # giving up is better.
 								        self.unicode_markup = u
 								        if not u:
 								            self.original_encoding = None
 								    def _sub_ms_char(self, match):
 								        """Changes a MS smart quote character to an XML or HTML
 								        entity, or an ASCII character."""
 								        orig = match.group(1)
 								        if self.smart_quotes_to == 'ascii':
 								            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
 								        else:
 								            sub = self.MS_CHARS.get(orig)
 								            if type(sub) == tuple:
 								                if self.smart_quotes_to == 'xml':
 								                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
 								                else:
 								                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
 								            else:
 								                sub = sub.encode()
 								        return sub
 								    def _convert_from(self, proposed, errors="strict"):
 								        proposed = self.find_codec(proposed)
 								        if not proposed or (proposed, errors) in self.tried_encodings:
 								            return None
 								        self.tried_encodings.append((proposed, errors))
 								        markup = self.markup
 								        # Convert smart quotes to HTML if coming from an encoding
 								        # that might have them.
 								        if (self.smart_quotes_to is not None
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								            smart_quotes_re = b"([\x80-\x9f])"
 								            smart_quotes_compiled = re.compile(smart_quotes_re)
 								            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
 								        try:
 								            #print "Trying to convert document to %s (errors=%s)" % (
 								            #    proposed, errors)
 								            u = self._to_unicode(markup, proposed, errors)
 								            self.markup = u
 								            self.original_encoding = proposed
 								        except Exception as e:
 								            #print "That didn't work!"
 								            #print e
 								            return None
 								        #print "Correct encoding: %s" % proposed
 								        return self.markup
 								    def _to_unicode(self, data, encoding, errors="strict"):
 								        '''Given a string and its encoding, decodes the string into Unicode.
 								        %encoding is a string recognized by encodings.aliases'''
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								        return unicode(data, encoding, errors)
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								    @property
 								    def declared_html_encoding(self):
 								        if not self.is_html:
 								            return None
 								        return self.detector.declared_encoding
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
 								    def find_codec(self, charset):
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
 								               or (charset and self._codec(charset.replace("-", "")))
 								               or (charset and self._codec(charset.replace("-", "_")))
 								               or (charset and charset.lower())
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
+								               or charset
-												Put libs/ in sys.path
Update bs4 to latest version to fix issues
Get clean modules using `pip install --upgrade --target=lib`
Move cherrypy, mako, pystun, bs4 into lib directory

											
										
										
											2016-08-23 06:01:41 +00:00
+								                )
 								        if value:
 								            return value.lower()
 								        return None
-												attempted fix for : and & in Comic Titles when adding

											
										
										
											2012-09-13 15:27:34 +00:00
 								    def _codec(self, charset):
 								        if not charset:
 								            return charset
 								        codec = None
 								        try:
 								            codecs.lookup(charset)
 								            codec = charset
 								        except (LookupError, ValueError):
 								            pass
 								        return codec
 								    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
 								    # Each byte in the 0x80-0x9f range maps either to a tuple of
 								    # (HTML entity name, hexadecimal code point) or to a plain string
 								    # used as-is. The hexadecimal code point must never be empty:
 								    # _sub_ms_char builds "&#x<code>;" from it when converting to XML.
 								    MS_CHARS = {b'\x80': ('euro', '20AC'),
 								                b'\x81': ' ',
 								                b'\x82': ('sbquo', '201A'),
 								                b'\x83': ('fnof', '192'),
 								                b'\x84': ('bdquo', '201E'),
 								                b'\x85': ('hellip', '2026'),
 								                b'\x86': ('dagger', '2020'),
 								                b'\x87': ('Dagger', '2021'),
 								                b'\x88': ('circ', '2C6'),
 								                b'\x89': ('permil', '2030'),
 								                b'\x8A': ('Scaron', '160'),
 								                b'\x8B': ('lsaquo', '2039'),
 								                b'\x8C': ('OElig', '152'),
 								                b'\x8D': '?',
 								                b'\x8E': ('#x17D', '17D'),
 								                b'\x8F': '?',
 								                b'\x90': '?',
 								                b'\x91': ('lsquo', '2018'),
 								                b'\x92': ('rsquo', '2019'),
 								                b'\x93': ('ldquo', '201C'),
 								                b'\x94': ('rdquo', '201D'),
 								                b'\x95': ('bull', '2022'),
 								                b'\x96': ('ndash', '2013'),
 								                b'\x97': ('mdash', '2014'),
 								                b'\x98': ('tilde', '2DC'),
 								                b'\x99': ('trade', '2122'),
 								                b'\x9a': ('scaron', '161'),
 								                b'\x9b': ('rsaquo', '203A'),
 								                b'\x9c': ('oelig', '153'),
 								                b'\x9d': '?',
 								                b'\x9e': ('#x17E', '17E'),
 								                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS.
 								                b'\x9f': ('Yuml', '178'),}
 								    # Best-effort ASCII stand-ins for the single bytes 0x80-0xFF, keyed
 								    # by the one-byte bytestring. The table is deliberately lossy: it
 								    # drops diacritical marks (a-acute becomes plain "a") and flattens
 								    # typographic punctuation (curly quotes become straight quotes).
 								    MS_CHARS_TO_ASCII = {
 								        # 0x80-0x9f: the Windows-1252 "smart" punctuation block.
 								        b'\x80': 'EUR',
 								        b'\x81': ' ',
 								        b'\x82': ',',
 								        b'\x83': 'f',
 								        b'\x84': ',,',
 								        b'\x85': '...',
 								        b'\x86': '+',
 								        b'\x87': '++',
 								        b'\x88': '^',
 								        b'\x89': '%',
 								        b'\x8a': 'S',
 								        b'\x8b': '<',
 								        b'\x8c': 'OE',
 								        b'\x8d': '?',
 								        b'\x8e': 'Z',
 								        b'\x8f': '?',
 								        b'\x90': '?',
 								        b'\x91': "'",
 								        b'\x92': "'",
 								        b'\x93': '"',
 								        b'\x94': '"',
 								        b'\x95': '*',
 								        b'\x96': '-',
 								        b'\x97': '--',
 								        b'\x98': '~',
 								        b'\x99': '(TM)',
 								        b'\x9a': 's',
 								        b'\x9b': '>',
 								        b'\x9c': 'oe',
 								        b'\x9d': '?',
 								        b'\x9e': 'z',
 								        b'\x9f': 'Y',
 								        # 0xa0-0xbf: symbols and punctuation.
 								        b'\xa0': ' ',
 								        b'\xa1': '!',
 								        b'\xa2': 'c',
 								        b'\xa3': 'GBP',
 								        # 0xa4 is the generic currency sign; rendering it as a
 								        # dollar sign is an especially parochial approximation.
 								        b'\xa4': '$',
 								        b'\xa5': 'YEN',
 								        b'\xa6': '|',
 								        b'\xa7': 'S',
 								        b'\xa8': '..',
 								        b'\xa9': '',
 								        b'\xaa': '(th)',
 								        b'\xab': '<<',
 								        b'\xac': '!',
 								        b'\xad': ' ',
 								        b'\xae': '(R)',
 								        b'\xaf': '-',
 								        b'\xb0': 'o',
 								        b'\xb1': '+-',
 								        b'\xb2': '2',
 								        b'\xb3': '3',
 								        # NOTE(review): the only tuple value in this table --
 								        # confirm that consumers expect it.
 								        b'\xb4': ("'", 'acute'),
 								        b'\xb5': 'u',
 								        b'\xb6': 'P',
 								        b'\xb7': '*',
 								        b'\xb8': ',',
 								        b'\xb9': '1',
 								        b'\xba': '(th)',
 								        b'\xbb': '>>',
 								        b'\xbc': '1/4',
 								        b'\xbd': '1/2',
 								        b'\xbe': '3/4',
 								        b'\xbf': '?',
 								        # 0xc0-0xdf: upper-case accented letters.
 								        b'\xc0': 'A',
 								        b'\xc1': 'A',
 								        b'\xc2': 'A',
 								        b'\xc3': 'A',
 								        b'\xc4': 'A',
 								        b'\xc5': 'A',
 								        b'\xc6': 'AE',
 								        b'\xc7': 'C',
 								        b'\xc8': 'E',
 								        b'\xc9': 'E',
 								        b'\xca': 'E',
 								        b'\xcb': 'E',
 								        b'\xcc': 'I',
 								        b'\xcd': 'I',
 								        b'\xce': 'I',
 								        b'\xcf': 'I',
 								        b'\xd0': 'D',
 								        b'\xd1': 'N',
 								        b'\xd2': 'O',
 								        b'\xd3': 'O',
 								        b'\xd4': 'O',
 								        b'\xd5': 'O',
 								        b'\xd6': 'O',
 								        b'\xd7': '*',
 								        b'\xd8': 'O',
 								        b'\xd9': 'U',
 								        b'\xda': 'U',
 								        b'\xdb': 'U',
 								        b'\xdc': 'U',
 								        b'\xdd': 'Y',
 								        b'\xde': 'b',
 								        b'\xdf': 'B',
 								        # 0xe0-0xff: lower-case accented letters.
 								        b'\xe0': 'a',
 								        b'\xe1': 'a',
 								        b'\xe2': 'a',
 								        b'\xe3': 'a',
 								        b'\xe4': 'a',
 								        b'\xe5': 'a',
 								        b'\xe6': 'ae',
 								        b'\xe7': 'c',
 								        b'\xe8': 'e',
 								        b'\xe9': 'e',
 								        b'\xea': 'e',
 								        b'\xeb': 'e',
 								        b'\xec': 'i',
 								        b'\xed': 'i',
 								        b'\xee': 'i',
 								        b'\xef': 'i',
 								        b'\xf0': 'o',
 								        b'\xf1': 'n',
 								        b'\xf2': 'o',
 								        b'\xf3': 'o',
 								        b'\xf4': 'o',
 								        b'\xf5': 'o',
 								        b'\xf6': 'o',
 								        b'\xf7': '/',
 								        b'\xf8': 'o',
 								        b'\xf9': 'u',
 								        b'\xfa': 'u',
 								        b'\xfb': 'u',
 								        b'\xfc': 'u',
 								        b'\xfd': 'y',
 								        b'\xfe': 'b',
 								        b'\xff': 'y',
 								    }
 								    # Maps a Windows-1252/ISO-8859-1 byte value to the UTF-8 encoding
 								    # of the same character. Used when removing rogue Windows-1252
 								    # characters from otherwise UTF-8 documents.
 								    #
 								    # Keys are integers (not one-byte strings) because detwingle()
 								    # looks bytes up by their ordinal value.
 								    #
 								    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
 								    # Windows-1252 and are therefore deliberately absent.
 								    WINDOWS_1252_TO_UTF8 = {
 								        0x80 : b'\xe2\x82\xac', # €
 								        0x82 : b'\xe2\x80\x9a', # ‚
 								        0x83 : b'\xc6\x92',     # ƒ
 								        0x84 : b'\xe2\x80\x9e', # „
 								        0x85 : b'\xe2\x80\xa6', # …
 								        0x86 : b'\xe2\x80\xa0', # †
 								        0x87 : b'\xe2\x80\xa1', # ‡
 								        0x88 : b'\xcb\x86',     # ˆ
 								        0x89 : b'\xe2\x80\xb0', # ‰
 								        0x8a : b'\xc5\xa0',     # Š
 								        0x8b : b'\xe2\x80\xb9', # ‹
 								        0x8c : b'\xc5\x92',     # Œ
 								        0x8e : b'\xc5\xbd',     # Ž
 								        0x91 : b'\xe2\x80\x98', # ‘
 								        0x92 : b'\xe2\x80\x99', # ’
 								        0x93 : b'\xe2\x80\x9c', # “
 								        0x94 : b'\xe2\x80\x9d', # ”
 								        0x95 : b'\xe2\x80\xa2', # •
 								        0x96 : b'\xe2\x80\x93', # –
 								        0x97 : b'\xe2\x80\x94', # —
 								        0x98 : b'\xcb\x9c',     # ˜
 								        0x99 : b'\xe2\x84\xa2', # ™
 								        0x9a : b'\xc5\xa1',     # š
 								        0x9b : b'\xe2\x80\xba', # ›
 								        0x9c : b'\xc5\x93',     # œ
 								        0x9e : b'\xc5\xbe',     # ž
 								        0x9f : b'\xc5\xb8',     # Ÿ
 								        0xa0 : b'\xc2\xa0',     # (non-breaking space)
 								        0xa1 : b'\xc2\xa1',     # ¡
 								        0xa2 : b'\xc2\xa2',     # ¢
 								        0xa3 : b'\xc2\xa3',     # £
 								        0xa4 : b'\xc2\xa4',     # ¤
 								        0xa5 : b'\xc2\xa5',     # ¥
 								        0xa6 : b'\xc2\xa6',     # ¦
 								        0xa7 : b'\xc2\xa7',     # §
 								        0xa8 : b'\xc2\xa8',     # ¨
 								        0xa9 : b'\xc2\xa9',     # ©
 								        0xaa : b'\xc2\xaa',     # ª
 								        0xab : b'\xc2\xab',     # «
 								        0xac : b'\xc2\xac',     # ¬
 								        0xad : b'\xc2\xad',     # (soft hyphen)
 								        0xae : b'\xc2\xae',     # ®
 								        0xaf : b'\xc2\xaf',     # ¯
 								        0xb0 : b'\xc2\xb0',     # °
 								        0xb1 : b'\xc2\xb1',     # ±
 								        0xb2 : b'\xc2\xb2',     # ²
 								        0xb3 : b'\xc2\xb3',     # ³
 								        0xb4 : b'\xc2\xb4',     # ´
 								        0xb5 : b'\xc2\xb5',     # µ
 								        0xb6 : b'\xc2\xb6',     # ¶
 								        0xb7 : b'\xc2\xb7',     # ·
 								        0xb8 : b'\xc2\xb8',     # ¸
 								        0xb9 : b'\xc2\xb9',     # ¹
 								        0xba : b'\xc2\xba',     # º
 								        0xbb : b'\xc2\xbb',     # »
 								        0xbc : b'\xc2\xbc',     # ¼
 								        0xbd : b'\xc2\xbd',     # ½
 								        0xbe : b'\xc2\xbe',     # ¾
 								        0xbf : b'\xc2\xbf',     # ¿
 								        0xc0 : b'\xc3\x80',     # À
 								        0xc1 : b'\xc3\x81',     # Á
 								        0xc2 : b'\xc3\x82',     # Â
 								        0xc3 : b'\xc3\x83',     # Ã
 								        0xc4 : b'\xc3\x84',     # Ä
 								        0xc5 : b'\xc3\x85',     # Å
 								        0xc6 : b'\xc3\x86',     # Æ
 								        0xc7 : b'\xc3\x87',     # Ç
 								        0xc8 : b'\xc3\x88',     # È
 								        0xc9 : b'\xc3\x89',     # É
 								        0xca : b'\xc3\x8a',     # Ê
 								        0xcb : b'\xc3\x8b',     # Ë
 								        0xcc : b'\xc3\x8c',     # Ì
 								        0xcd : b'\xc3\x8d',     # Í
 								        0xce : b'\xc3\x8e',     # Î
 								        0xcf : b'\xc3\x8f',     # Ï
 								        0xd0 : b'\xc3\x90',     # Ð
 								        0xd1 : b'\xc3\x91',     # Ñ
 								        0xd2 : b'\xc3\x92',     # Ò
 								        0xd3 : b'\xc3\x93',     # Ó
 								        0xd4 : b'\xc3\x94',     # Ô
 								        0xd5 : b'\xc3\x95',     # Õ
 								        0xd6 : b'\xc3\x96',     # Ö
 								        0xd7 : b'\xc3\x97',     # ×
 								        0xd8 : b'\xc3\x98',     # Ø
 								        0xd9 : b'\xc3\x99',     # Ù
 								        0xda : b'\xc3\x9a',     # Ú
 								        0xdb : b'\xc3\x9b',     # Û
 								        0xdc : b'\xc3\x9c',     # Ü
 								        0xdd : b'\xc3\x9d',     # Ý
 								        0xde : b'\xc3\x9e',     # Þ
 								        0xdf : b'\xc3\x9f',     # ß
 								        0xe0 : b'\xc3\xa0',     # à
 								        0xe1 : b'\xc3\xa1',     # á
 								        0xe2 : b'\xc3\xa2',     # â
 								        0xe3 : b'\xc3\xa3',     # ã
 								        0xe4 : b'\xc3\xa4',     # ä
 								        0xe5 : b'\xc3\xa5',     # å
 								        0xe6 : b'\xc3\xa6',     # æ
 								        0xe7 : b'\xc3\xa7',     # ç
 								        0xe8 : b'\xc3\xa8',     # è
 								        0xe9 : b'\xc3\xa9',     # é
 								        0xea : b'\xc3\xaa',     # ê
 								        0xeb : b'\xc3\xab',     # ë
 								        0xec : b'\xc3\xac',     # ì
 								        0xed : b'\xc3\xad',     # í
 								        0xee : b'\xc3\xae',     # î
 								        0xef : b'\xc3\xaf',     # ï
 								        0xf0 : b'\xc3\xb0',     # ð
 								        0xf1 : b'\xc3\xb1',     # ñ
 								        0xf2 : b'\xc3\xb2',     # ò
 								        0xf3 : b'\xc3\xb3',     # ó
 								        0xf4 : b'\xc3\xb4',     # ô
 								        0xf5 : b'\xc3\xb5',     # õ
 								        0xf6 : b'\xc3\xb6',     # ö
 								        0xf7 : b'\xc3\xb7',     # ÷
 								        0xf8 : b'\xc3\xb8',     # ø
 								        0xf9 : b'\xc3\xb9',     # ù
 								        0xfa : b'\xc3\xba',     # ú
 								        0xfb : b'\xc3\xbb',     # û
 								        0xfc : b'\xc3\xbc',     # ü
 								        0xfd : b'\xc3\xbd',     # ý
 								        0xfe : b'\xc3\xbe',     # þ
 								        0xff : b'\xc3\xbf',     # ÿ
 								        }
 								    # Each entry is (lowest lead byte, highest lead byte, sequence length
 								    # in bytes) for one class of UTF-8 multibyte character.
 								    MULTIBYTE_MARKERS_AND_SIZES = [
 								        # Lead bytes C2-DF open a 2-byte character.
 								        (0xc2, 0xdf, 2),
 								        # Lead bytes E0-EF open a 3-byte character.
 								        (0xe0, 0xef, 3),
 								        # Lead bytes F0-F4 open a 4-byte character.
 								        (0xf0, 0xf4, 4),
 								    ]
 								    # Lowest and highest byte values that can open a multibyte
 								    # character, taken from the first and last rows of the table.
 								    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
 								    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
 								    @classmethod
 								    def detwingle(cls, in_bytes, main_encoding="utf8",
 								                  embedded_encoding="windows-1252"):
 								        """Fix characters from one encoding embedded in some other encoding.
 								        Currently the only situation supported is Windows-1252 (or its
 								        subset ISO-8859-1), embedded in UTF-8.
 								        A byte in the UTF-8 lead-byte range is only skipped as a
 								        multibyte character when every following byte of that
 								        sequence that is present is a UTF-8 continuation byte
 								        (0x80-0xBF). Otherwise it is treated as a stray Windows-1252
 								        character and converted, so a lone b'\\xe9' in the middle of
 								        ASCII text becomes b'\\xc3\\xa9'. A sequence cut short by the
 								        end of the input is left untouched.
 								        :param in_bytes: The input, which must be a bytestring. If
 								            you've already converted the document to Unicode, you're
 								            too late.
 								        :param main_encoding: Encoding of the document as a whole;
 								            only 'utf8' and 'utf-8' (any case) are accepted.
 								        :param embedded_encoding: Encoding of the stray characters;
 								            only 'windows-1252' (any case, '_' or '-') is accepted.
 								        :return: A bytestring in which `embedded_encoding` characters
 								            have been converted to their `main_encoding` equivalents.
 								            The input object itself is returned when nothing changed.
 								        :raise NotImplementedError: For any other pair of encodings.
 								        """
 								        # After normalising '_' to '-' there is exactly one spelling.
 								        if embedded_encoding.replace('_', '-').lower() != 'windows-1252':
 								            raise NotImplementedError(
 								                "Windows-1252 and ISO-8859-1 are the only currently supported "
 								                "embedded encodings.")
 								        if main_encoding.lower() not in ('utf8', 'utf-8'):
 								            raise NotImplementedError(
 								                "UTF-8 is the only currently supported main encoding.")
 								        byte_chunks = []
 								        chunk_start = 0
 								        pos = 0
 								        while pos < len(in_bytes):
 								            byte = in_bytes[pos]
 								            if not isinstance(byte, int):
 								                # Python 2.x: indexing a bytestring yields a 1-char str.
 								                byte = ord(byte)
 								            # Length of the UTF-8 character starting here, or 0 if this
 								            # byte does not start a plausible multibyte character.
 								            size = 0
 								            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
 								                for start, end, length in cls.MULTIBYTE_MARKERS_AND_SIZES:
 								                    if start <= byte <= end:
 								                        # bytearray() yields ints on Python 2 and 3.
 								                        trailing = bytearray(in_bytes[pos + 1:pos + length])
 								                        # Only trust the lead byte if nothing that
 								                        # follows contradicts it.
 								                        if all(0x80 <= b <= 0xbf for b in trailing):
 								                            size = length
 								                        break
 								            if size:
 								                # A UTF-8 multibyte character. Skip to the end so its
 								                # continuation bytes are not mistaken for Windows-1252.
 								                pos += size
 								            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
 								                # We found a Windows-1252 character!
 								                # Save the string up to this point as a chunk.
 								                byte_chunks.append(in_bytes[chunk_start:pos])
 								                # Now translate the Windows-1252 character into UTF-8
 								                # and add it as another chunk.
 								                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
 								                pos += 1
 								                chunk_start = pos
 								            else:
 								                # ASCII, or a high byte with no known translation.
 								                pos += 1
 								        if not byte_chunks:
 								            # Nothing was replaced; the string is unchanged.
 								            return in_bytes
 								        # Store the final chunk.
 								        byte_chunks.append(in_bytes[chunk_start:])
 								        return b''.join(byte_chunks)