mirror of
https://github.com/morpheus65535/bazarr
synced 2025-01-01 04:26:13 +00:00
241 lines
9.7 KiB
Python
241 lines
9.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
from ...exceptions import ReadError
|
|
from .readers import *
|
|
from pkg_resources import resource_stream # @UnresolvedImport
|
|
from xml.dom import minidom
|
|
import logging
|
|
|
|
|
|
# Public names exported by ``from <this module> import *``
__all__ = [
    # EBML type constants
    'INTEGER', 'UINTEGER', 'FLOAT', 'STRING', 'UNICODE', 'DATE', 'MASTER', 'BINARY',
    # lookup tables
    'SPEC_TYPES', 'READERS',
    # element classes
    'Element', 'MasterElement',
    # parsing entry points
    'parse', 'parse_element', 'get_matroska_specs',
]

# Module-level logger, named after this module
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# EBML types: eight distinct integer constants, numbered 0 to 7 in this order
(INTEGER, UINTEGER, FLOAT, STRING,
 UNICODE, DATE, MASTER, BINARY) = range(8)

# Spec types to EBML types mapping: type name as used in the specs -> EBML type constant
SPEC_TYPES = {
    'integer': INTEGER,
    'uinteger': UINTEGER,
    'float': FLOAT,
    'string': STRING,
    'utf-8': UNICODE,
    'date': DATE,
    'master': MASTER,
    'binary': BINARY,
}
|
|
|
|
# Readers to use per EBML type: EBML type constant -> function called as
# ``reader(stream, size)`` to decode an element's data.
# MASTER has no entry here: parse() handles master elements by parsing their children.
READERS = {
    INTEGER: read_element_integer,
    UINTEGER: read_element_uinteger,
    FLOAT: read_element_float,
    STRING: read_element_string,
    UNICODE: read_element_unicode,
    DATE: read_element_date,
    BINARY: read_element_binary,
}
|
|
|
|
class BaseElement(object):
    """Minimal EBML element holding only an id and the location, size and content of its data

    :param int id: id of the element
    :param int position: position of element's data
    :param int size: size of element's data
    :param data: data of the element, None when not set

    """
    def __init__(self, id=None, position=None, size=None, data=None):
        self.id, self.position = id, position
        self.size, self.data = size, data
|
|
|
|
class Element(BaseElement):
    """Base object of EBML

    Extends :class:`BaseElement` with the type, name and level of the element.

    :param int id: id of the element, best represented as hexadecimal (0x18538067 for Matroska Segment element)
    :param type: type of the element
    :type type: :data:`INTEGER`, :data:`UINTEGER`, :data:`FLOAT`, :data:`STRING`, :data:`UNICODE`, :data:`DATE`, :data:`MASTER` or :data:`BINARY`
    :param string name: name of the element
    :param int level: level of the element
    :param int position: position of element's data
    :param int size: size of element's data
    :param data: data as read by the corresponding :data:`READERS`

    """
    def __init__(self, id=None, type=None, name=None, level=None, position=None, size=None, data=None):
        # The base class stores id, position, size and data
        super(Element, self).__init__(id=id, position=position, size=size, data=data)
        self.type, self.name, self.level = type, name, level

    def __repr__(self):
        # Rendered as: <ClassName [name, repr(data)]>
        class_name = self.__class__.__name__
        return '<%s [%s, %r]>' % (class_name, self.name, self.data)
|
|
|
|
|
|
class MasterElement(Element):
    """Element of type :data:`MASTER` that has a list of :class:`Element` as its data

    :param int id: id of the element, best represented as hexadecimal (0x18538067 for Matroska Segment element)
    :param string name: name of the element
    :param int level: level of the element
    :param int position: position of element's data
    :param int size: size of element's data
    :param data: child elements, or None while the children have not been loaded
    :type data: list of :class:`Element` or None

    :class:`MasterElement` implements some magic methods to ease manipulation. Thus, a MasterElement supports
    the `in` keyword to test for the presence of a child element by its name and gives access to it
    with a container getter::

        >>> ebml_element = parse(open('test1.mkv', 'rb'), get_matroska_specs())[0]
        >>> 'EBMLVersion' in ebml_element
        False
        >>> 'DocType' in ebml_element
        True
        >>> ebml_element['DocType']
        <Element [DocType, 'matroska']>

    A :class:`MasterElement` whose `data` is None (children not loaded, for example because
    :func:`parse` skipped them after reaching `max_level`) behaves like one without children
    for `in`, iteration, item access and :meth:`get`.

    """
    def __init__(self, id=None, name=None, level=None, position=None, size=None, data=None):
        # The type is always MASTER for this class
        super(MasterElement, self).__init__(id, MASTER, name, level, position, size, data)

    def load(self, stream, specs, ignore_element_types=None, ignore_element_names=None, max_level=None,
             include_element_names=None):
        """Load children :class:`Elements <Element>` with level lower or equal to the `max_level`
        from the `stream` according to the `specs`

        The children are parsed with :func:`parse` over `self.size` bytes, starting at the current
        position of the `stream`, and replace `self.data`.

        :param stream: file-like object from which to read
        :param dict specs: see :ref:`specs`
        :param list ignore_element_types: list of element types to ignore
        :param list ignore_element_names: list of element names to ignore
        :param int max_level: maximum level for children elements
        :param list include_element_names: list of element names to include exclusively, so ignoring all other element names

        """
        self.data = parse(stream, specs, self.size, ignore_element_types, ignore_element_names, max_level,
                          include_element_names)

    def get(self, name, default=None):
        """Convenience method for ``master_element[name].data if name in master_element else default``

        :param string name: the name of the child to get
        :param default: default value if `name` is not in the :class:`MasterElement`
        :return: the data of the child :class:`Element` or `default`
        :raise ValueError: if the child named `name` is itself of type :data:`MASTER`
        :raise KeyError: if more than one child is named `name`

        """
        if name not in self:
            return default
        element = self[name]
        if element.type == MASTER:
            raise ValueError('%s is a MasterElement' % name)
        return element.data

    def _children(self):
        """Return the child elements, treating not-yet-loaded data (None) as no children"""
        return self.data if self.data is not None else []

    def __getitem__(self, key):
        children = self._children()
        # An integer key is a position in the list of children
        if isinstance(key, int):
            return children[key]
        # Any other key is a child name, which must match exactly one child
        matches = [e for e in children if e.name == key]
        if not matches:
            raise KeyError(key)
        if len(matches) > 1:
            raise KeyError('More than 1 child with key %s (%d)' % (key, len(matches)))
        return matches[0]

    def __contains__(self, item):
        # True as soon as one child carries that name
        return any(e.name == item for e in self._children())

    def __iter__(self):
        return iter(self._children())
|
|
|
|
def parse(stream, specs, size=None, ignore_element_types=None, ignore_element_names=None, max_level=None, include_element_names=None):
    """Parse a stream for `size` bytes according to the `specs`

    Elements are read one after the other with :func:`parse_element`. Elements that are skipped
    are stepped over by seeking `element.size` bytes forward and are not part of the result:

    * elements whose id is not in the `specs` (silently)
    * elements whose type is None (an error is logged)
    * elements whose type is in `ignore_element_types` or whose name is in `ignore_element_names`
    * elements whose name is not in a non-empty `include_element_names`

    Elements of type :data:`MASTER` are parsed recursively, unless their level is greater or equal
    to `max_level`: in that case their children are skipped and the element is returned with the
    data it already had.

    :param stream: file-like object from which to read
    :param size: maximum number of bytes to read, None to read all the stream
    :type size: int or None
    :param dict specs: see :ref:`specs`
    :param list ignore_element_types: list of element types to ignore
    :param list ignore_element_names: list of element names to ignore
    :param int max_level: maximum level of elements
    :param list include_element_names: list of element names to include exclusively, so ignoring all other element names
    :return: parsed data as a tree of :class:`~enzyme.parsers.ebml.core.Element`
    :rtype: list
    :raise ReadError: if reading fails while a `size` is given. When `size` is None, a
        :class:`ReadError` ends the parsing and the elements read so far are returned.

    .. note::
        If `size` is reached in a middle of an element, reading will continue
        until the element is fully parsed.

    """
    ignore_element_types = ignore_element_types if ignore_element_types is not None else []
    ignore_element_names = ignore_element_names if ignore_element_names is not None else []
    include_element_names = include_element_names if include_element_names is not None else []
    start = stream.tell()
    elements = []
    while size is None or stream.tell() - start < size:
        try:
            element = parse_element(stream, specs)

            # parse_element returns a BaseElement (no `type` attribute) for ids missing from the specs
            if not element or not hasattr(element, "type"):
                stream.seek(element.size, 1)
                continue

            if element.type is None:
                logger.error('Element with id 0x%x is not in the specs', element.id)
                stream.seek(element.size, 1)
                continue

            if element.type in ignore_element_types or element.name in ignore_element_names:
                logger.info('%s %s %s ignored', element.__class__.__name__, element.name, element.type)
                stream.seek(element.size, 1)
                continue

            # An empty include list means "no restriction"
            if include_element_names and element.name not in include_element_names:
                stream.seek(element.size, 1)
                continue

            if element.type == MASTER:
                if max_level is not None and element.level >= max_level:
                    # Keep the master element itself but step over its children
                    logger.info('Maximum level %d reached for children of %s %s', max_level, element.__class__.__name__, element.name)
                    stream.seek(element.size, 1)
                else:
                    logger.debug('Loading child elements for %s %s with size %d', element.__class__.__name__, element.name, element.size)
                    element.data = parse(stream, specs, element.size, ignore_element_types, ignore_element_names, max_level, include_element_names)
            else:
                element.data = READERS[element.type](stream, element.size)
            elements.append(element)
        except ReadError:
            # With a known size a read failure is an error; without one it marks the end of the stream
            if size is not None:
                raise
            break
    return elements
|
|
|
|
|
|
def parse_element(stream, specs):
    """Extract a single :class:`Element` from the `stream` according to the `specs`

    Only the id and the size of the element are read: the returned element has no data and its
    position is the position of the `stream` right after the size was read.

    :param stream: file-like object from which to read
    :param dict specs: see :ref:`specs`
    :return: the parsed element: a :class:`MasterElement` when the specs give it the type
        :data:`MASTER`, an :class:`Element` for any other type, or a :class:`BaseElement`
        when the id is not in the `specs`
    :rtype: :class:`Element`
    :raise ReadError: if the element id or the element size cannot be read

    """
    element_id = read_element_id(stream)
    if element_id is None:
        raise ReadError('Cannot read element id')

    element_size = read_element_size(stream)
    if element_size is None:
        raise ReadError('Cannot read element size')

    # The stream now sits on the first byte of the element's data
    data_position = stream.tell()

    # Unknown id: only the generic information is available
    if element_id not in specs:
        return BaseElement(element_id, data_position, element_size)

    element_type, element_name, element_level = specs[element_id]
    if element_type == MASTER:
        return MasterElement(element_id, element_name, element_level, data_position, element_size)
    return Element(element_id, element_type, element_name, element_level, data_position, element_size)
|
|
|
|
def get_matroska_specs(webm_only=False):
    """Get the Matroska specs

    The specs are built from the ``element`` tags of the packaged ``specs/matroska.xml``
    resource. Each entry maps the element id (read as hexadecimal) to a
    ``(type, name, level)`` tuple, the type being translated through :data:`SPEC_TYPES`.

    :param bool webm_only: load *only* WebM specs, i.e. elements whose ``webm`` attribute is ``1``
    :return: the specs in the appropriate format. See :ref:`specs`
    :rtype: dict

    """
    specs = {}
    with resource_stream(__name__, 'specs/matroska.xml') as resource:
        xmldoc = minidom.parse(resource)
        for node in xmldoc.getElementsByTagName('element'):
            # In WebM-only mode, drop every element not flagged webm="1"
            if webm_only and not (node.hasAttribute('webm') and node.getAttribute('webm') == '1'):
                continue
            spec = (
                SPEC_TYPES[node.getAttribute('type')],
                node.getAttribute('name'),
                int(node.getAttribute('level')),
            )
            element_id = int(node.getAttribute('id'), 16)
            specs[element_id] = spec
    return specs
|