2018-09-17 00:27:00 +00:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
"""
|
|
|
|
Chain patterns and handle repeating capture groups
|
|
|
|
"""
|
|
|
|
# pylint: disable=super-init-not-called
|
|
|
|
import itertools
|
|
|
|
|
2020-05-20 15:29:39 +00:00
|
|
|
from .builder import Builder
|
|
|
|
from .loose import call
|
2018-09-17 00:27:00 +00:00
|
|
|
from .match import Match, Matches
|
2020-05-20 15:29:39 +00:00
|
|
|
from .pattern import Pattern, filter_match_kwargs, BasePattern
|
2018-09-17 00:27:00 +00:00
|
|
|
from .remodule import re
|
|
|
|
|
|
|
|
|
|
|
|
class _InvalidChainException(Exception):
    """
    Internal signal that the chain being assembled is not valid.

    It is raised by a chain part whose repeater lower bound is not reached and
    is caught by the chain matching loop; it is not meant to leave this module.
    """
|
|
|
|
|
|
|
|
|
2020-05-20 15:29:39 +00:00
|
|
|
class Chain(Pattern, Builder):
    """
    Pattern built from an ordered list of chain parts that are matched one after the other.
    """

    def __init__(self, parent, chain_breaker=None, **kwargs):
        """
        Create an empty chain.

        :param parent: owner of this chain; close() returns the first parent that is not a Chain
        :param chain_breaker: optional callable receiving the candidate Matches. When it returns a
            truthy value the candidate matches are left out of the chain. Non-callable values are ignored.
        :param kwargs: forwarded to Pattern.__init__ and filtered to build the Match keyword arguments
        """
        Builder.__init__(self)
        call(Pattern.__init__, self, **kwargs)
        self._kwargs = kwargs
        self._match_kwargs = filter_match_kwargs(kwargs)
        self.chain_breaker = chain_breaker if callable(chain_breaker) else None
        self.parent = parent
        self.parts = []

    def pattern(self, *pattern):
        """
        Wrap a single pattern into a new chain part and append it to this chain.

        :param pattern: exactly one pattern object
        :return: the ChainPart that was created
        :raises ValueError: when zero or more than one pattern is given
        """
        given = len(pattern)
        if given == 0:
            raise ValueError("One pattern should be given to the chain")
        if given > 1:
            raise ValueError("Only one pattern can be given to the chain")
        new_part = ChainPart(self, pattern[0])
        self.parts.append(new_part)
        return new_part

    def close(self):
        """
        Deeply close the chain

        Climbs through every enclosing Chain and hands back the first ancestor that is not one.

        :return: Rebulk instance
        """
        ancestor = self.parent
        while isinstance(ancestor, Chain):
            ancestor = ancestor.parent
        return ancestor

    def _match(self, pattern, input_string, context=None):
        """
        Scan the input and build one chain match for every valid run of chain parts.

        :param pattern: not used by this implementation
        :param input_string: string to search
        :param context: handed over to every chain part
        :return: list of chain matches
        """
        # pylint: disable=too-many-locals,too-many-nested-blocks
        found_chains = []
        remaining = input_string
        offset = 0
        while offset < len(input_string):
            chain_found = False
            collected = []
            is_valid = True
            for part in self.parts:
                try:
                    part_matches, raw_part_matches = part.matches(remaining, context, with_raw_matches=True)
                    chain_found, remaining, offset = self._to_next_chain_part(
                        part, part_matches, raw_part_matches, chain_found,
                        input_string, remaining, offset, collected)
                except _InvalidChainException:
                    is_valid = False
                    if collected:
                        # Restart the scan right after the first match of the rejected chain.
                        # NOTE(review): the remaining string is not re-sliced to this offset - confirm intended.
                        offset = collected[0].raw_end
                    break
            if not chain_found:
                # Nothing was matched during this pass, so there is nothing left to scan.
                break
            if collected and is_valid:
                found_chains.append(self._build_chain_match(collected, input_string))

        return found_chains

    def _to_next_chain_part(self, chain_part, chain_part_matches, raw_chain_part_matches, chain_found,
                            input_string, chain_input_string, offset, current_chain_matches):
        """
        Consume the matches of one chain part and move the scan position past them.

        Accepted matches are appended in place to current_chain_matches.

        :param chain_part: part that produced the matches
        :param chain_part_matches: matches of the part
        :param raw_chain_part_matches: raw matches of the part, used to advance the offset
        :param chain_found: current "something was matched" flag
        :param input_string: full input string
        :param chain_input_string: portion of the input still to be scanned
        :param offset: position of chain_input_string inside input_string
        :param current_chain_matches: matches collected so far for the chain being built
        :return: updated (chain_found, chain_input_string, offset)
        """
        Chain._fix_matches_offset(chain_part_matches, input_string, offset)
        Chain._fix_matches_offset(raw_chain_part_matches, input_string, offset)

        if not raw_chain_part_matches:
            return chain_found, chain_input_string, offset

        matches_by_index = self._group_by_match_index(chain_part_matches)
        raw_matches_by_index = self._group_by_match_index(raw_chain_part_matches)

        for match_index, raw_group in raw_matches_by_index.items():
            chain_found = True
            offset = raw_group[-1].raw_end
            chain_input_string = input_string[offset:]

            if chain_part.is_hidden:
                # Hidden parts move the scan forward but contribute no match.
                continue
            group = matches_by_index.get(match_index, [])
            if self._chain_breaker_eval(current_chain_matches + group):
                current_chain_matches.extend(group)

        return chain_found, chain_input_string, offset

    def _process_match(self, match, match_index, child=False):
        """
        Handle a match

        When the parent implementation refuses the match, the groups of children produced by the
        last pattern are removed one match index at a time, last group first, and the shortened
        match is submitted again. If no attempt succeeds, children and end are restored.

        :param match:
        :type match:
        :param match_index:
        :type match_index:
        :param child:
        :type child:
        :return: True as soon as the parent implementation accepts the match, False otherwise
        :rtype: bool
        """
        # pylint: disable=too-many-locals
        if super(Chain, self)._process_match(match, match_index, child=child):
            return True

        if not match.children:
            return False

        tail_pattern = match.children[-1].pattern
        tail_groups = self._group_by_match_index(
            [item for item in match.children if item.pattern == tail_pattern]
        )
        if not tail_groups:
            return False

        saved_children = Matches(match.children)
        saved_end = match.end

        for index in reversed(list(tail_groups)):
            for stale in tail_groups[index]:
                match.children.remove(stale)
            match.end = match.children[-1].end if match.children else match.start
            if super(Chain, self)._process_match(match, match_index, child=child):
                return True

        match.children = saved_children
        match.end = saved_end
        return False

    def _build_chain_match(self, current_chain_matches, input_string):
        """
        Build the match that spans all collected matches and adopt them as its children.

        :param current_chain_matches: matches collected for one chain
        :param input_string: full input string
        :return: the enclosing match, from the lowest start to the highest end of the collected matches
        """
        start = end = None
        for item in current_chain_matches:
            if start is None or start > item.start:
                start = item.start
            if end is None or end < item.end:
                end = item.end

        chain_match = call(Match, start, end, pattern=self, input_string=input_string, **self._match_kwargs)
        for item in current_chain_matches:
            if item.children:
                for child in item.children:
                    chain_match.children.append(child)
            if item not in chain_match.children:
                chain_match.children.append(item)
                item.parent = chain_match
        return chain_match

    def _chain_breaker_eval(self, matches):
        """
        Tell whether the candidate matches may be added to the chain.

        :param matches: matches collected so far plus the candidate ones
        :return: True when no chain breaker is set or when it returns a falsy value
        """
        breaker = self.chain_breaker
        if not breaker:
            return True
        return not breaker(Matches(matches))

    @staticmethod
    def _fix_matches_offset(chain_part_matches, input_string, offset):
        """
        Rebase matches found in a slice of the input onto the full input string.

        Matches already bound to input_string are left untouched. Others are shifted by offset
        and their children are processed recursively.

        :param chain_part_matches: matches to fix in place
        :param input_string: full input string
        :param offset: amount added to start and end of each rebased match
        """
        for rebased in chain_part_matches:
            if rebased.input_string != input_string:
                rebased.input_string = input_string
                rebased.end += offset
                rebased.start += offset
                if rebased.children:
                    Chain._fix_matches_offset(rebased.children, input_string, offset)

    @staticmethod
    def _group_by_match_index(matches):
        """
        Group matches by their match_index.

        Only consecutive matches sharing an index are grouped; if an index shows up again later,
        the later run replaces the earlier one.

        :param matches: iterable of matches
        :return: dict mapping match_index to the list of its matches
        """
        return {
            match_index: list(group)
            for match_index, group in itertools.groupby(matches, lambda m: m.match_index)
        }

    @property
    def match_options(self):
        """
        Match options of the chain itself: always an empty dict.
        """
        return {}

    @property
    def patterns(self):
        """
        Patterns exposed by the chain: a list holding only the chain itself.
        """
        return [self]

    def __repr__(self):
        defined = "@%s" % (self.defined_at,) if self.defined_at else ""
        return "<%s%s:%s>" % (self.__class__.__name__, defined, self.parts)
|
|
|
|
|
|
|
|
|
2020-05-20 15:29:39 +00:00
|
|
|
class ChainPart(BasePattern):
    """
    Part of a pattern chain.

    Wraps one pattern together with its repeater bounds and hidden flag. Builder-like
    methods (chain, regex, functional, string, close) delegate to the owning chain.
    """

    def __init__(self, chain, pattern):
        """
        :param chain: chain owning this part
        :param pattern: pattern wrapped by this part
        """
        self._chain = chain
        self.pattern = pattern
        # By default the part must match exactly once.
        self.repeater_start = 1
        self.repeater_end = 1
        self._hidden = False

    @property
    def _is_chain_start(self):
        """
        True when this part is the first part of its chain.
        """
        return self._chain.parts[0] == self

    def matches(self, input_string, context=None, with_raw_matches=False):
        """
        Compute the matches of the wrapped pattern that satisfy this part's repeater.

        :param input_string: string to search
        :param context: passed through to the wrapped pattern
        :param with_raw_matches: when True, return a (matches, raw_matches) tuple
        :return: truncated matches, optionally with the truncated raw matches
        :raises _InvalidChainException: when fewer repetitions than repeater_start are found
        """
        matches, raw_matches = self.pattern.matches(input_string, context=context, with_raw_matches=True)

        matches = self._truncate_repeater(matches, input_string)
        raw_matches = self._truncate_repeater(raw_matches, input_string)

        self._validate_repeater(raw_matches)

        if with_raw_matches:
            return matches, raw_matches

        return matches

    def _truncate_repeater(self, matches, input_string):
        """
        Keep only the leading run of adjacent matches allowed by the repeater upper bound.

        :param matches: matches of the wrapped pattern
        :param input_string: string the matches were found in
        :return: the kept matches; an empty list when this part is not the first of the chain
            and its first match does not begin at the very start of input_string
        """
        if not matches:
            return matches

        if not self._is_chain_start:
            # A following part has to start exactly where the previous one stopped.
            separator = input_string[0:matches[0].initiator.raw_start]
            if separator:
                return []

        # Count how many matches follow each other with nothing in between.
        j = 1
        for i in range(0, len(matches) - 1):
            separator = input_string[matches[i].initiator.raw_end:
                                     matches[i + 1].initiator.raw_start]
            if separator:
                break
            j += 1
        truncated = matches[:j]
        if self.repeater_end is not None:
            truncated = [m for m in truncated if m.match_index < self.repeater_end]
        return truncated

    def _validate_repeater(self, matches):
        """
        Check that enough repetitions were found.

        :param matches: truncated raw matches
        :raises _InvalidChainException: when the highest match_index + 1 is lower than repeater_start
        """
        max_match_index = -1
        if matches:
            max_match_index = max(m.match_index for m in matches)
        if max_match_index + 1 < self.repeater_start:
            raise _InvalidChainException

    def chain(self):
        """
        Add patterns chain, using configuration from this chain

        :return: result of chain() on the owning chain
        :rtype:
        """
        return self._chain.chain()

    def hidden(self, hidden=True):
        """
        Hide chain part results from global chain result

        :param hidden: new value of the hidden flag
        :type hidden: bool
        :return: this chain part, to allow call chaining
        :rtype: ChainPart
        """
        self._hidden = hidden
        return self

    @property
    def is_hidden(self):
        """
        Check if the chain part is hidden

        :return: current value of the hidden flag
        :rtype: bool
        """
        return self._hidden

    def regex(self, *pattern, **kwargs):
        """
        Add re pattern

        :param pattern:
        :type pattern:
        :param kwargs:
        :type kwargs:
        :return: result of regex() on the owning chain
        :rtype:
        """
        return self._chain.regex(*pattern, **kwargs)

    def functional(self, *pattern, **kwargs):
        """
        Add functional pattern

        :param pattern:
        :type pattern:
        :param kwargs:
        :type kwargs:
        :return: result of functional() on the owning chain
        :rtype:
        """
        return self._chain.functional(*pattern, **kwargs)

    def string(self, *pattern, **kwargs):
        """
        Add string pattern

        :param pattern:
        :type pattern:
        :param kwargs:
        :type kwargs:
        :return: result of string() on the owning chain
        :rtype:
        """
        return self._chain.string(*pattern, **kwargs)

    def close(self):
        """
        Close the chain builder to continue registering other patterns

        :return: result of close() on the owning chain
        :rtype:
        """
        return self._chain.close()

    def repeater(self, value):
        """
        Define the repeater of the current chain part.

        Accepted values:

        - an integer, or anything int() accepts: exactly that many repetitions
        - '+': one or more
        - '*': zero or more
        - '?': zero or one
        - '{min,max}': either bound may be omitted; a missing min means 0 and a
          missing max means unbounded. '{n}' without a comma is read as '{n,}'.

        Any other string leaves the current bounds unchanged.

        :param value: repeater definition
        :type value: int or str
        :return: this chain part, to allow call chaining
        :rtype: ChainPart
        """
        try:
            value = int(value)
        except ValueError:
            # Not a plain number: fall through to the symbolic forms below.
            pass
        else:
            self.repeater_start = value
            self.repeater_end = value
            return self

        if value == '+':
            self.repeater_start = 1
            self.repeater_end = None
        elif value == '*':
            self.repeater_start = 0
            self.repeater_end = None
        elif value == '?':
            self.repeater_start = 0
            self.repeater_end = 1
        else:
            match = re.match(r'\{\s*(\d*)\s*,?\s*(\d*)\s*\}', value)
            if match:
                start = match.group(1)
                end = match.group(2)
                # '{}' and '{,}' carry no bound at all and are ignored.
                if start or end:
                    self.repeater_start = int(start) if start else 0
                    self.repeater_end = int(end) if end else None
        return self

    def __repr__(self):
        return "%s({%s,%s})" % (self.pattern, self.repeater_start, self.repeater_end)
|