# mylar/lib/js2py/translators/jsregexps.py
# (219 lines, 6.7 KiB, Python)

from pyjsparserdata import *
# Lookup tables used while tokenising a JavaScript regular expression.
# Each table is a set of one-character strings.

# Single characters with a special meaning in a regexp.
REGEXP_SPECIAL_SINGLE = set('\\^$*+?.')

# Characters that isPatternCharacter() rejects as ordinary pattern characters.
NOT_PATTERN_CHARS = set('^$\\.*+?()[]|')  # what about '{', '}', ???

# Letters that form a character-class escape when preceded by a backslash.
CHAR_CLASS_ESCAPE = set('dDsSwW')

# Letters that form a control escape when preceded by a backslash.
CONTROL_ESCAPE_CHARS = set('fnrtv')

# The ASCII letters a-z and A-Z.
CONTROL_LETTERS = set('abcdefghijklmnopqrstuvwxyz'
                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
def SpecialChar(char):
    """Build a ``SpecialChar`` node wrapping ``char``.

    Returns a dict with ``'type'`` set to ``'SpecialChar'`` and ``'content'``
    set to the value passed in.
    """
    node = dict(type='SpecialChar', content=char)
    return node
def isPatternCharacter(char):
    """Return True when ``char`` is not listed in ``NOT_PATTERN_CHARS``."""
    if char in NOT_PATTERN_CHARS:
        return False
    return True
class JsRegExpParser:
    """Recursive-descent parser for the source text of a JavaScript regexp.

    The parser walks ``source`` from left to right (``self.index`` is the
    cursor) and builds a tree of plain dicts, each carrying a ``'type'`` key:

        Pattern -> Disjunction -> Alternative -> Term / Assertion -> Atom

    An ``Atom`` carries its optional ``Quantifier`` directly.

    The class relies on names defined at module level: ``SpecialChar``,
    ``isPatternCharacter``, ``CHAR_CLASS_ESCAPE``, ``CONTROL_ESCAPE_CHARS``,
    ``CONTROL_LETTERS`` and ``isDecimalDigit`` (presumably supplied by the
    star import from ``pyjsparserdata`` -- confirm).
    """

    # Characters accepted as hexadecimal digits in \xHH and \uHHHH escapes.
    _HEX_DIGITS = '0123456789abcdefABCDEF'

    def __init__(self, source, flags):
        """Store the pattern ``source`` and ``flags`` and reset the cursor.

        ``flags`` is only stored; no method of this class reads it.
        """
        self.source = source
        self.flags = flags
        self.index = 0
        self.length = len(source)
        self.lineNumber = 0
        self.lineStart = 0

    def throwUnexpected(self, expected=None):
        """Raise ``SyntaxError`` describing the character at the cursor.

        ``expected`` is the character the caller was looking for, if any; it
        is only used to make the message more helpful.
        """
        if self.isEOF():
            found = 'end of pattern'
        else:
            found = repr(self.source[self.index])
        message = 'Invalid regular expression: unexpected %s at index %d' % (
            found, self.index)
        if expected is not None:
            message += ' (expected %r)' % (expected,)
        raise SyntaxError(message)

    def parsePattern(self):
        """Parse the whole source and return a ``Pattern`` node.

        Raises ``SyntaxError`` (via ``throwUnexpected``) when the source is
        malformed, e.g. when a ')' has no matching '('.
        """
        pattern = {'type': 'Pattern',
                   'contents': self.parseDisjunction()}
        if not self.isEOF():
            # The top-level disjunction only stops early on an unbalanced ')'.
            self.throwUnexpected()
        return pattern

    def parseDisjunction(self):
        """Parse ``alt1|alt2|...`` and return a ``Disjunction`` node.

        Parsing stops at end of input or at a ')' (left unconsumed), so the
        caller that opened the group is the one that closes it.
        """
        alternatives = [self.parseAlternative()]
        while self.follows('|'):
            self.index += 1
            alternatives.append(self.parseAlternative())
        return {'type': 'Disjunction',
                'contents': alternatives}

    def isEOF(self):
        """Return True when the cursor is at or past the end of the source."""
        return self.index >= self.length

    def expect_character(self, character):
        """Consume ``character`` at the cursor or raise via ``throwUnexpected``."""
        # Check for end of input first so an unterminated group reports a
        # syntax error instead of an IndexError.
        if self.isEOF() or self.source[self.index] != character:
            self.throwUnexpected(character)
        self.index += 1

    def parseAlternative(self):
        """Parse consecutive terms and return an ``Alternative`` node.

        Stops (without consuming) at '|', at ')' or at end of input.
        """
        contents = []
        # ')' must end the alternative too; otherwise the terms of a group
        # would never stop being collected.
        while not self.isEOF() and self.source[self.index] not in '|)':
            contents.append(self.parseTerm())
        return {'type': 'Alternative',
                'contents': contents}

    def follows(self, chars):
        """Return True when the source continues with ``chars`` at the cursor."""
        return self.source.startswith(chars, self.index)

    def parseTerm(self):
        """Parse one term: an ``Assertion`` node, or a ``Term`` wrapping an atom.

        Raises via ``throwUnexpected`` when the character at the cursor cannot
        start a term (e.g. a dangling quantifier such as a leading '*').
        """
        assertion = self.parseAssertion()
        if assertion:
            return assertion
        atom = self.parseAtom()
        if atom is None:
            # Nothing was consumed; failing here keeps callers from looping
            # forever on the same character.
            self.throwUnexpected()
        return {'type': 'Term',
                'contents': atom}  # the quantifier lives inside the atom

    def parseAssertion(self):
        """Parse ``^``, ``$``, ``\\b``, ``\\B`` or a lookahead group.

        Returns an ``Assertion`` node, or None (cursor untouched) when no
        assertion starts at the cursor.
        """
        for token in ('$', '^', '\\b', '\\B'):
            if self.follows(token):
                self.index += len(token)
                return {'type': 'Assertion',
                        'content': SpecialChar(token)}
        if self.follows('(?=') or self.follows('(?!'):
            negated = self.source[self.index + 2] == '!'
            self.index += 3
            dis = self.parseDisjunction()
            self.expect_character(')')
            return {'type': 'Assertion',
                    'content': {'type': 'Lookahead',
                                'contents': dis,
                                'negated': negated}}
        return None

    def parseAtom(self):
        """Parse one atom together with its optional quantifier.

        Returns an ``Atom`` node whose ``'content'`` is a plain character, a
        ``SpecialChar`` node, a ``CharacterClass`` node or a ``Group`` node.
        Returns None (cursor untouched) when no atom starts at the cursor.
        """
        if self.isEOF():
            return None
        if self.follows('.'):
            content = SpecialChar('.')
            self.index += 1
        elif self.follows('\\'):
            self.index += 1
            content = self.parseAtomEscape()
        elif self.follows('['):
            content = self.parseCharacterClass()
        elif self.follows('(?:'):
            content = self._parseGroup(3, False)
        elif self.follows('('):
            content = self._parseGroup(1, True)
        elif isPatternCharacter(self.source[self.index]):
            content = self.source[self.index]
            self.index += 1
        else:
            return None
        return {'type': 'Atom',
                'content': content,
                'quantifier': self.parseQuantifier()}

    def _parseGroup(self, prefix_length, capturing):
        """Parse a group body after skipping its ``prefix_length``-char opener."""
        self.index += prefix_length
        dis = self.parseDisjunction()
        self.expect_character(')')
        return {'type': 'Group',
                'contents': dis,
                'capturing': capturing}

    def parseQuantifier(self):
        """Parse an optional quantifier and its lazy ``?`` suffix.

        Returns a ``Quantifier`` node, or None when no quantifier follows.
        """
        prefix = self.parseQuantifierPrefix()
        if prefix is None:
            return None
        greedy = True
        if self.follows('?'):
            self.index += 1
            greedy = False
        return {'type': 'Quantifier',
                'contents': prefix,
                'greedy': greedy}

    def parseQuantifierPrefix(self):
        """Parse ``+``, ``?``, ``*`` or a ``{min[,[max]]}`` quantifier.

        Returns the symbol itself for the single-character forms, or a
        ``(min, max)`` tuple for the brace form: ``{n}`` gives ``(n, n)``,
        ``{n,}`` gives ``(n, None)`` and ``{n,m}`` gives ``(n, m)``.
        Returns None, with the cursor restored, when no valid quantifier
        starts at the cursor.
        """
        for symbol in ('+', '?', '*'):
            if self.follows(symbol):
                self.index += 1
                return symbol
        if not self.follows('{'):
            return None
        start = self.index  # remembered so a malformed '{...' can be undone
        self.index += 1
        low = self.scanDecimalDigs()
        if not low:
            # no minimum given: not a quantifier, '{' is a literal
            self.index = start
            return None
        if self.follows(','):
            self.index += 1
            high = self.scanDecimalDigs()
            upper = int(high) if high else None  # '{n,}' has no upper bound
        else:
            upper = int(low)  # '{n}' means exactly n
        if not self.follows('}'):
            self.index = start
            return None
        self.index += 1
        return int(low), upper

    def parseAtomEscape(self):
        """Parse the escape that follows an already consumed backslash.

        Returns a ``SpecialChar`` node whose content is the escape text
        including the backslash (``'\\1'`` for a back-reference, ``'\\d'`` for
        a class escape, ...). Raises via ``throwUnexpected`` when the pattern
        ends right after the backslash.
        """
        if self.isEOF():
            self.throwUnexpected()
        ch = self.source[self.index]
        if isDecimalDigit(ch) and ch != '0':
            # back-reference such as \1 or \12
            return SpecialChar('\\' + self.scanDecimalDigs())
        if ch in CHAR_CLASS_ESCAPE:
            self.index += 1
            return SpecialChar('\\' + ch)
        return self.parseCharacterEscape()

    def parseCharacterEscape(self):
        """Parse a character escape; the backslash is already consumed.

        Handles control escapes (``\\n``, ``\\t``, ...), ``\\cX`` control
        letters, ``\\xHH`` and ``\\uHHHH``; anything else is kept as a
        two-character escape. Always consumes at least one character and
        returns a ``SpecialChar`` node holding the escape text.
        """
        if self.isEOF():
            self.throwUnexpected()
        ch = self.source[self.index]
        if ch in CONTROL_ESCAPE_CHARS:
            self.index += 1
            return SpecialChar('\\' + ch)
        if ch == 'c':
            letter = self.source[self.index + 1:self.index + 2]
            if letter and letter in CONTROL_LETTERS:
                self.index += 2
                return SpecialChar('\\c' + letter)
        for marker, width in (('x', 2), ('u', 4)):
            if ch == marker:
                digits = self.source[self.index + 1:self.index + 1 + width]
                if len(digits) == width and all(
                        d in self._HEX_DIGITS for d in digits):
                    self.index += 1 + width
                    return SpecialChar('\\' + ch + digits)
        # \0 and identity escapes such as \. or \/ (also malformed \c, \x, \u)
        self.index += 1
        return SpecialChar('\\' + ch)

    def parseCharacterClass(self):
        """Parse a bracketed class such as ``[a-z0-9_]`` or ``[^\\d]``.

        Returns a ``CharacterClass`` node. ``'contents'`` lists the members:
        plain characters, ``SpecialChar`` nodes for escapes, and
        ``ClassRange`` nodes (``'start'`` / ``'end'``) for ``a-z`` ranges.
        Range endpoints are not validated. Raises via ``throwUnexpected``
        when the closing ']' is missing.
        """
        self.expect_character('[')
        negated = self.follows('^')
        if negated:
            self.index += 1
        members = []
        while not self.follows(']'):
            if self.isEOF():
                self.throwUnexpected(']')
            first = self._parseClassAtom()
            # A '-' forms a range only when something other than the closing
            # ']' comes after it; otherwise it is a literal member.
            if (self.follows('-') and self.index + 1 < self.length
                    and self.source[self.index + 1] != ']'):
                self.index += 1
                last = self._parseClassAtom()
                members.append({'type': 'ClassRange',
                                'start': first,
                                'end': last})
            else:
                members.append(first)
        self.index += 1  # closing ']'
        return {'type': 'CharacterClass',
                'contents': members,
                'negated': negated}

    def _parseClassAtom(self):
        """Parse one member of a character class (plain char or escape)."""
        if self.follows('\\'):
            self.index += 1
            if not self.isEOF() and self.source[self.index] in CHAR_CLASS_ESCAPE:
                ch = self.source[self.index]
                self.index += 1
                return SpecialChar('\\' + ch)
            return self.parseCharacterEscape()
        ch = self.source[self.index]
        self.index += 1
        return ch

    def scanDecimalDigs(self):
        """Consume a run of decimal digits and return it (possibly empty)."""
        start = self.index
        while not self.isEOF() and isDecimalDigit(self.source[self.index]):
            self.index += 1
        return self.source[start:self.index]
if __name__ == '__main__':
    # Manual smoke test: parse a sample pattern and print the resulting tree.
    # Guarded so that importing this module neither parses nor prints.
    demo = JsRegExpParser('a(?=x)', '')
    print(demo.parsePattern())