bazarr/libs/markdown/serializers.py

283 lines
9.6 KiB
Python

# markdown/searializers.py
#
# Add x/html serialization to Elementree
# Taken from ElementTree 1.3 preview with slight modifications
#
# Copyright (c) 1999-2007 by Fredrik Lundh. All rights reserved.
#
# fredrik@pythonware.com
# http://www.pythonware.com
#
# --------------------------------------------------------------------
# The ElementTree toolkit is
#
# Copyright (c) 1999-2007 by Fredrik Lundh
#
# By obtaining, using, and/or copying this software and/or its
# associated documentation, you agree that you have read, understood,
# and will comply with the following terms and conditions:
#
# Permission to use, copy, modify, and distribute this software and
# its associated documentation for any purpose and without fee is
# hereby granted, provided that the above copyright notice appears in
# all copies, and that both that copyright notice and this permission
# notice appear in supporting documentation, and that the name of
# Secret Labs AB or the author not be used in advertising or publicity
# pertaining to distribution of the software without specific, written
# prior permission.
#
# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
# --------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import unicode_literals
from . import util
ElementTree = util.etree.ElementTree
QName = util.etree.QName
if hasattr(util.etree, 'test_comment'): # pragma: no cover
Comment = util.etree.test_comment
else: # pragma: no cover
Comment = util.etree.Comment
PI = util.etree.PI
ProcessingInstruction = util.etree.ProcessingInstruction
__all__ = ['to_html_string', 'to_xhtml_string']
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
"img", "input", "isindex", "link", "meta" "param")
try:
HTML_EMPTY = set(HTML_EMPTY)
except NameError: # pragma: no cover
pass
_namespace_map = {
# "well-known" namespace prefixes
"http://www.w3.org/XML/1998/namespace": "xml",
"http://www.w3.org/1999/xhtml": "html",
"http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
"http://schemas.xmlsoap.org/wsdl/": "wsdl",
# xml schema
"http://www.w3.org/2001/XMLSchema": "xs",
"http://www.w3.org/2001/XMLSchema-instance": "xsi",
# dublic core
"http://purl.org/dc/elements/1.1/": "dc",
}
def _raise_serialization_error(text): # pragma: no cover
raise TypeError(
"cannot serialize %r (type %s)" % (text, type(text).__name__)
)
def _encode(text, encoding):
try:
return text.encode(encoding, "xmlcharrefreplace")
except (TypeError, AttributeError): # pragma: no cover
_raise_serialization_error(text)
def _escape_cdata(text):
# escape character data
try:
# it's worth avoiding do-nothing calls for strings that are
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if "&" in text:
text = text.replace("&", "&")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
return text
except (TypeError, AttributeError): # pragma: no cover
_raise_serialization_error(text)
def _escape_attrib(text):
# escape attribute value
try:
if "&" in text:
text = text.replace("&", "&amp;")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
if "\"" in text:
text = text.replace("\"", "&quot;")
if "\n" in text:
text = text.replace("\n", "&#10;")
return text
except (TypeError, AttributeError): # pragma: no cover
_raise_serialization_error(text)
def _escape_attrib_html(text):
# escape attribute value
try:
if "&" in text:
text = text.replace("&", "&amp;")
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
text = text.replace(">", "&gt;")
if "\"" in text:
text = text.replace("\"", "&quot;")
return text
except (TypeError, AttributeError): # pragma: no cover
_raise_serialization_error(text)
def _serialize_html(write, elem, qnames, namespaces, format):
tag = elem.tag
text = elem.text
if tag is Comment:
write("<!--%s-->" % _escape_cdata(text))
elif tag is ProcessingInstruction:
write("<?%s?>" % _escape_cdata(text))
else:
tag = qnames[tag]
if tag is None:
if text:
write(_escape_cdata(text))
for e in elem:
_serialize_html(write, e, qnames, None, format)
else:
write("<" + tag)
items = elem.items()
if items or namespaces:
items = sorted(items) # lexical order
for k, v in items:
if isinstance(k, QName):
k = k.text
if isinstance(v, QName):
v = qnames[v.text]
else:
v = _escape_attrib_html(v)
if qnames[k] == v and format == 'html':
# handle boolean attributes
write(" %s" % v)
else:
write(" %s=\"%s\"" % (qnames[k], v))
if namespaces:
items = namespaces.items()
items.sort(key=lambda x: x[1]) # sort on prefix
for v, k in items:
if k:
k = ":" + k
write(" xmlns%s=\"%s\"" % (k, _escape_attrib(v)))
if format == "xhtml" and tag.lower() in HTML_EMPTY:
write(" />")
else:
write(">")
if text:
if tag.lower() in ["script", "style"]:
write(text)
else:
write(_escape_cdata(text))
for e in elem:
_serialize_html(write, e, qnames, None, format)
if tag.lower() not in HTML_EMPTY:
write("</" + tag + ">")
if elem.tail:
write(_escape_cdata(elem.tail))
def _write_html(root,
encoding=None,
default_namespace=None,
format="html"):
assert root is not None
data = []
write = data.append
qnames, namespaces = _namespaces(root, default_namespace)
_serialize_html(write, root, qnames, namespaces, format)
if encoding is None:
return "".join(data)
else:
return _encode("".join(data))
# --------------------------------------------------------------------
# serialization support
def _namespaces(elem, default_namespace=None):
# identify namespaces used in this tree
# maps qnames to *encoded* prefix:local names
qnames = {None: None}
# maps uri:s to prefixes
namespaces = {}
if default_namespace:
namespaces[default_namespace] = ""
def add_qname(qname):
# calculate serialized qname representation
try:
if qname[:1] == "{":
uri, tag = qname[1:].split("}", 1)
prefix = namespaces.get(uri)
if prefix is None:
prefix = _namespace_map.get(uri)
if prefix is None:
prefix = "ns%d" % len(namespaces)
if prefix != "xml":
namespaces[uri] = prefix
if prefix:
qnames[qname] = "%s:%s" % (prefix, tag)
else:
qnames[qname] = tag # default element
else:
if default_namespace:
raise ValueError(
"cannot use non-qualified names with "
"default_namespace option"
)
qnames[qname] = qname
except TypeError: # pragma: no cover
_raise_serialization_error(qname)
# populate qname and namespaces table
try:
iterate = elem.iter
except AttributeError:
iterate = elem.getiterator # cET compatibility
for elem in iterate():
tag = elem.tag
if isinstance(tag, QName) and tag.text not in qnames:
add_qname(tag.text)
elif isinstance(tag, util.string_type):
if tag not in qnames:
add_qname(tag)
elif tag is not None and tag is not Comment and tag is not PI:
_raise_serialization_error(tag)
for key, value in elem.items():
if isinstance(key, QName):
key = key.text
if key not in qnames:
add_qname(key)
if isinstance(value, QName) and value.text not in qnames:
add_qname(value.text)
text = elem.text
if isinstance(text, QName) and text.text not in qnames:
add_qname(text.text)
return qnames, namespaces
def to_html_string(element):
return _write_html(ElementTree(element).getroot(), format="html")
def to_xhtml_string(element):
return _write_html(ElementTree(element).getroot(), format="xhtml")