from __future__ import absolute_import, unicode_literals

import re
from commonmark import common
from commonmark.common import unescape_string
from commonmark.inlines import InlineParser
from commonmark.node import Node


CODE_INDENT = 4
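# Opening/closing patterns for the seven HTML block kinds handled below
# (indices 1-7; index 0 is a dummy so html_block_type can index these lists
# directly). Kinds 6 and 7 have no closing pattern: HtmlBlock.continue_()
# ends them at the next blank line instead.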
reHtmlBlockOpen = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'^<(?:script|pre|style)(?:\s|>|$)', re.IGNORECASE),
    re.compile(r'^<!--'),
    re.compile(r'^<[?]'),
    re.compile(r'^<![A-Z]'),
    re.compile(r'^<!\[CDATA\['),
    re.compile(
        r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|'
        r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|'
        r'fieldset|figcaption|figure|footer|form|frame|frameset|h1|head|'
        r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|'
        r'nav|noframes|ol|optgroup|option|p|param|section|source|title|'
        r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'
        r'(?:\s|[/]?[>]|$)',
        re.IGNORECASE),
    re.compile(
        '^(?:' + common.OPENTAG + '|' + common.CLOSETAG + ')\\s*$',
        re.IGNORECASE),
]
reHtmlBlockClose = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'<\/(?:script|pre|style)>', re.IGNORECASE),
    re.compile(r'-->'),
    re.compile(r'\?>'),
    re.compile(r'>'),
    re.compile(r'\]\]>'),
]
reThematicBreak = re.compile(
    r'^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$')
reMaybeSpecial = re.compile(r'^[#`~*+_=<>0-9-]')
reNonSpace = re.compile(r'[^ \t\f\v\r\n]')
reBulletListMarker = re.compile(r'^[*+-]')
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])')
reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)')
reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}')
reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)')
reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$')
reLineEnding = re.compile(r'\r\n|\n|\r')


def is_blank(s):
    """Returns True if string contains only space characters."""
    return re.search(reNonSpace, s) is None


def is_space_or_tab(s):
    return s in (' ', '\t')


def peek(ln, pos):
    if pos < len(ln):
        return ln[pos]
    else:
        return None


def ends_with_blank_line(block):
    """ Returns true if block ends with a blank line,
    descending if needed into lists and sublists."""
    while block:
        if block.last_line_blank:
            return True
        if not block.last_line_checked and \
                block.t in ('list', 'item'):
            block.last_line_checked = True
            block = block.last_child
        else:
            block.last_line_checked = True
            break

    return False


def parse_list_marker(parser, container):
    """ Parse a list marker and return data on the marker (type,
    start, delimiter, bullet character, padding) or None."""
    rest = parser.current_line[parser.next_nonspace:]
    data = {
        'type': None,
        'tight': True,  # lists are tight by default
        'bullet_char': None,
        'start': None,
        'delimiter': None,
        'padding': None,
        'marker_offset': parser.indent,
    }
    if parser.indent >= 4:
        return None
    m = re.search(reBulletListMarker, rest)
    m2 = re.search(reOrderedListMarker, rest)
    if m:
        data['type'] = 'bullet'
        data['bullet_char'] = m.group()[0]
    elif m2 and (container.t != 'paragraph' or m2.group(1) == '1'):
        m = m2
        data['type'] = 'ordered'
        data['start'] = int(m.group(1))
        data['delimiter'] = m.group(2)
    else:
        return None

    # make sure we have spaces after
    nextc = peek(parser.current_line, parser.next_nonspace + len(m.group()))
    if not (nextc is None or nextc == '\t' or nextc == ' '):
        return None

    # if it interrupts paragraph, make sure first line isn't blank
    if container.t == 'paragraph' and \
            not re.search(
                reNonSpace,
                parser.current_line[parser.next_nonspace + len(m.group()):]):
        return None

    # we've got a match! advance offset and calculate padding
    parser.advance_next_nonspace()  # to start of marker
    parser.advance_offset(len(m.group()), True)  # to end of marker
    spaces_start_col = parser.column
    spaces_start_offset = parser.offset
    while True:
        parser.advance_offset(1, True)
        nextc = peek(parser.current_line, parser.offset)
        if parser.column - spaces_start_col < 5 and \
                is_space_or_tab(nextc):
            pass
        else:
            break
    blank_item = peek(parser.current_line, parser.offset) is None
    spaces_after_marker = parser.column - spaces_start_col
    if spaces_after_marker >= 5 or \
            spaces_after_marker < 1 or \
            blank_item:
        data['padding'] = len(m.group()) + 1
        parser.column = spaces_start_col
        parser.offset = spaces_start_offset
        if is_space_or_tab(peek(parser.current_line, parser.offset)):
            parser.advance_offset(1, True)
    else:
        data['padding'] = len(m.group()) + spaces_after_marker

    return data


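# Illustration of parse_list_marker() (not executed; values follow the logic
# above): for a line reading "1. foo" at column 0 it returns roughly
#     {'type': 'ordered', 'start': 1, 'delimiter': '.', 'bullet_char': None,
#      'padding': 3, 'marker_offset': 0, 'tight': True}
# so continuation lines of the item must be indented at least
# marker_offset + padding = 3 columns (see Item.continue_).
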
def lists_match(list_data, item_data):
    """
    Returns True if the two list items are of the same type,
    with the same delimiter and bullet character. This is used
    in agglomerating list items into lists.
    """
    return list_data.get('type') == item_data.get('type') and \
        list_data.get('delimiter') == item_data.get('delimiter') and \
        list_data.get('bullet_char') == item_data.get('bullet_char')


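# Block handlers, one per node type, looked up via Parser.blocks (registered
# at the bottom of this module). continue_() decides whether an open block
# can absorb the current line: 0 = matched, keep going; 1 = not matched;
# 2 = line fully consumed (closing code fence), stop. finalize() runs when
# the block is closed; can_contain() restricts allowed child block types.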
class Block(object):
    accepts_lines = None

    @staticmethod
    def continue_(parser=None, container=None):
        return

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return


class Document(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return t != 'item'


class List(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        item = block.first_child
        while item:
            # check for non-final list item ending with blank line:
            if ends_with_blank_line(item) and item.nxt:
                block.list_data['tight'] = False
                break
            # recurse into children of list item, to see if there are
            # spaces between any of them:
            subitem = item.first_child
            while subitem:
                if ends_with_blank_line(subitem) and \
                        (item.nxt or subitem.nxt):
                    block.list_data['tight'] = False
                    break
                subitem = subitem.nxt
            item = item.nxt

    @staticmethod
    def can_contain(t):
        return t == 'item'


class BlockQuote(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        ln = parser.current_line
        if not parser.indented and peek(ln, parser.next_nonspace) == '>':
            parser.advance_next_nonspace()
            parser.advance_offset(1, False)
            if is_space_or_tab(peek(ln, parser.offset)):
                parser.advance_offset(1, True)
        else:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return t != 'item'


class Item(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        if parser.blank:
            if container.first_child is None:
                # Blank line after empty list item
                return 1
            else:
                parser.advance_next_nonspace()
        elif parser.indent >= (container.list_data['marker_offset'] +
                               container.list_data['padding']):
            parser.advance_offset(
                container.list_data['marker_offset'] +
                container.list_data['padding'], True)
        else:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return t != 'item'


class Heading(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # A heading can never contain > 1 line, so fail to match:
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return False


class ThematicBreak(Block):
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        # A thematic break can never contain > 1 line, so fail to match:
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        return

    @staticmethod
    def can_contain(t):
        return False


class CodeBlock(Block):
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        ln = parser.current_line
        indent = parser.indent
        if container.is_fenced:
            match = indent <= 3 and \
                len(ln) >= parser.next_nonspace + 1 and \
                ln[parser.next_nonspace] == container.fence_char and \
                re.search(reClosingCodeFence, ln[parser.next_nonspace:])
            if match and len(match.group()) >= container.fence_length:
                # closing fence - we're at end of line, so we can return
                parser.finalize(container, parser.line_number)
                return 2
            else:
                # skip optional spaces of fence offset
                i = container.fence_offset
                while i > 0 and is_space_or_tab(peek(ln, parser.offset)):
                    parser.advance_offset(1, True)
                    i -= 1
        else:
            # indented
            if indent >= CODE_INDENT:
                parser.advance_offset(CODE_INDENT, True)
            elif parser.blank:
                parser.advance_next_nonspace()
            else:
                return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        if block.is_fenced:
            # first line becomes info string
            content = block.string_content
            newline_pos = content.index('\n')
            first_line = content[0:newline_pos]
            rest = content[newline_pos + 1:]
            block.info = unescape_string(first_line.strip())
            block.literal = rest
        else:
            # indented
            block.literal = re.sub(r'(\n *)+$', '\n', block.string_content)

        block.string_content = None

    @staticmethod
    def can_contain(t):
        return False


class HtmlBlock(Block):
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        if parser.blank and (container.html_block_type == 6 or
                             container.html_block_type == 7):
            return 1
        else:
            return 0

    @staticmethod
    def finalize(parser=None, block=None):
        block.literal = re.sub(r'(\n *)+$', '', block.string_content)
        # allow GC
        block.string_content = None

    @staticmethod
    def can_contain(t):
        return False


class Paragraph(Block):
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        return 1 if parser.blank else 0

    @staticmethod
    def finalize(parser=None, block=None):
        has_reference_defs = False

        # try parsing the beginning as link reference definitions:
        while peek(block.string_content, 0) == '[':
            pos = parser.inline_parser.parseReference(
                block.string_content, parser.refmap)
            if not pos:
                break
            block.string_content = block.string_content[pos:]
            has_reference_defs = True
        if has_reference_defs and is_blank(block.string_content):
            block.unlink()

    @staticmethod
    def can_contain(t):
        return False


class BlockStarts(object):
    """Block start functions.

    Return values:
    0 = no match
    1 = matched container, keep going
    2 = matched leaf, no more block starts
    """
    METHODS = [
        'block_quote',
        'atx_heading',
        'fenced_code_block',
        'html_block',
        'setext_heading',
        'thematic_break',
        'list_item',
        'indented_code_block',
    ]

    @staticmethod
    def block_quote(parser, container=None):
        if not parser.indented and \
                peek(parser.current_line, parser.next_nonspace) == '>':
            parser.advance_next_nonspace()
            parser.advance_offset(1, False)
            # optional following space
            if is_space_or_tab(peek(parser.current_line, parser.offset)):
                parser.advance_offset(1, True)
            parser.close_unmatched_blocks()
            parser.add_child('block_quote', parser.next_nonspace)
            return 1

        return 0

    @staticmethod
    def atx_heading(parser, container=None):
        if not parser.indented:
            m = re.search(reATXHeadingMarker,
                          parser.current_line[parser.next_nonspace:])
            if m:
                parser.advance_next_nonspace()
                parser.advance_offset(len(m.group()), False)
                parser.close_unmatched_blocks()
                container = parser.add_child('heading', parser.next_nonspace)
                # number of #s
                container.level = len(m.group().strip())
                # remove trailing ###s:
                container.string_content = re.sub(
                    r'[ \t]+#+[ \t]*$', '', re.sub(
                        r'^[ \t]*#+[ \t]*$',
                        '',
                        parser.current_line[parser.offset:]))
                parser.advance_offset(
                    len(parser.current_line) - parser.offset, False)
                return 2

        return 0

    @staticmethod
    def fenced_code_block(parser, container=None):
        if not parser.indented:
            m = re.search(
                reCodeFence,
                parser.current_line[parser.next_nonspace:])
            if m:
                fence_length = len(m.group())
                parser.close_unmatched_blocks()
                container = parser.add_child(
                    'code_block', parser.next_nonspace)
                container.is_fenced = True
                container.fence_length = fence_length
                container.fence_char = m.group()[0]
                container.fence_offset = parser.indent
                parser.advance_next_nonspace()
                parser.advance_offset(fence_length, False)
                return 2

        return 0

    @staticmethod
    def html_block(parser, container=None):
        if not parser.indented and \
                peek(parser.current_line, parser.next_nonspace) == '<':
            s = parser.current_line[parser.next_nonspace:]

            for block_type in range(1, 8):
                if re.search(reHtmlBlockOpen[block_type], s) and \
                        (block_type < 7 or container.t != 'paragraph'):
                    parser.close_unmatched_blocks()
                    # We don't adjust parser.offset;
                    # spaces are part of the HTML block:
                    b = parser.add_child('html_block', parser.offset)
                    b.html_block_type = block_type
                    return 2
        return 0

    @staticmethod
    def setext_heading(parser, container=None):
        if not parser.indented and container.t == 'paragraph':
            m = re.search(
                reSetextHeadingLine,
                parser.current_line[parser.next_nonspace:])
            if m:
                parser.close_unmatched_blocks()
                # resolve reference link definitions
                while peek(container.string_content, 0) == '[':
                    pos = parser.inline_parser.parseReference(
                        container.string_content, parser.refmap)
                    if not pos:
                        break
                    container.string_content = container.string_content[pos:]
                if container.string_content:
                    heading = Node('heading', container.sourcepos)
                    heading.level = 1 if m.group()[0] == '=' else 2
                    heading.string_content = container.string_content
                    container.insert_after(heading)
                    container.unlink()
                    parser.tip = heading
                    parser.advance_offset(
                        len(parser.current_line) - parser.offset, False)
                    return 2
                else:
                    return 0

        return 0

    @staticmethod
    def thematic_break(parser, container=None):
        if not parser.indented and re.search(
                reThematicBreak, parser.current_line[parser.next_nonspace:]):
            parser.close_unmatched_blocks()
            parser.add_child('thematic_break', parser.next_nonspace)
            parser.advance_offset(
                len(parser.current_line) - parser.offset, False)
            return 2
        return 0

    @staticmethod
    def list_item(parser, container=None):
        if (not parser.indented or container.t == 'list'):
            data = parse_list_marker(parser, container)
            if data:
                parser.close_unmatched_blocks()

                # add the list if needed
                if parser.tip.t != 'list' or \
                        not lists_match(container.list_data, data):
                    container = parser.add_child('list', parser.next_nonspace)
                    container.list_data = data

                # add the list item
                container = parser.add_child('item', parser.next_nonspace)
                container.list_data = data
                return 1

        return 0

    @staticmethod
    def indented_code_block(parser, container=None):
        if parser.indented and \
                parser.tip.t != 'paragraph' and \
                not parser.blank:
            # indented code
            parser.advance_offset(CODE_INDENT, True)
            parser.close_unmatched_blocks()
            parser.add_child('code_block', parser.offset)
            return 2

        return 0


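# Parser drives block-level parsing: parse() feeds each input line to
# incorporate_line(), closes whatever blocks are still open, then hands
# paragraphs and headings to the inline parser (process_inlines).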
class Parser(object):
    def __init__(self, options={}):
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    def add_line(self):
        """ Add a line to the block at the tip. We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip. If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children."""
        while not self.blocks[self.tip.t].can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)

        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
            self.all_closed = True

    def find_next_nonspace(self):
        current_line = self.current_line
        i = self.offset
        cols = self.column

        try:
            c = current_line[i]
        except IndexError:
            c = ''
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                cols += (4 - (cols % 4))
            else:
                break

            try:
                c = current_line[i]
            except IndexError:
                c = ''

        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

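    # advance_offset() treats a tab as advancing to the next 4-column tab
    # stop; when columns=True and a tab is only partially consumed,
    # partially_consumed_tab is set so add_line() can re-expand the
    # unconsumed remainder of the tab as spaces.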
    def advance_offset(self, count, columns):
        current_line = self.current_line
        try:
            c = current_line[self.offset]
        except IndexError:
            c = None
        while count > 0 and c is not None:
            if c == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1
            try:
                c = current_line[self.offset]
            except IndexError:
                c = None

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True

        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1

        # replace NUL characters for security
        if re.search(r'\u0000', ln) is not None:
            ln = re.sub(r'\0', '\uFFFD', ln)

        self.current_line = ln

        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching block.
        # Set all_matched to false if not all containers match.
        while True:
            last_child = container.last_child
            if not (last_child and last_child.is_open):
                break
            container = last_child

            self.find_next_nonspace()

            rv = self.blocks[container.t].continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError(
                    'continue_ returned illegal value, must be 0, 1, or 2')

            if not all_matched:
                # back up to last matching block
                container = container.parent
                break

        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container

        matched_leaf = container.t != 'paragraph' and \
            self.blocks[container.t].accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()

            # this is a little performance optimization:
            if not self.indented and \
                    not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break

            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1

            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break

        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
                self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True

            t = container.t

            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists. We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))

            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent

            if self.blocks[t].accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                        container.html_block_type >= 1 and \
                        container.html_block_type <= 5 and \
                        re.search(
                            reHtmlBlockClose[container.html_block_type],
                            self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()

        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block. Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions. Reset the tip to the
        parent of the closed block."""
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]

        self.blocks[block.t].finalize(self, block)

        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function. Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for i in range(length):
            self.incorporate_line(lines[i])
        while (self.tip):
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc


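# Register the Block subclasses above by node type: CamelCase class names are
# converted to snake_case keys, e.g. BlockQuote -> 'block_quote' and
# CodeBlock -> 'code_block'.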
CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))")
Parser.blocks = dict(
    (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls)
    for cls in Block.__subclasses__())
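

# Minimal usage sketch (illustrative only; not part of the upstream module).
# Parser.parse() returns the document AST; the walker API used below is the
# same one process_inlines() relies on above.
if __name__ == '__main__':
    parser = Parser()
    doc = parser.parse('# Heading\n\nSome *emphasis* and a `code span`.\n')
    walker = doc.walker()
    event = walker.nxt()
    while event is not None:
        if event['entering']:
            # Prints each node type as it is entered, e.g. document,
            # heading, text, paragraph, emph, code.
            print(event['node'].t)
        event = walker.nxt()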