bazarr/libs/js2py/translators/jsregexps.py

from pyjsparser.pyjsparserdata import *

# Lookup tables for the regexp parser. Each one is a plain set of
# single-character strings, built from a string of its members.

REGEXP_SPECIAL_SINGLE = set('\\^$*+?.')

# Characters that isPatternCharacter() refuses to treat as literals.
# Open question carried over from the original author: should '{' and '}'
# be listed here as well?
NOT_PATTERN_CHARS = set('^$\\.*+?()[]|')

# Letters that form a character-class escape when preceded by a backslash.
CHAR_CLASS_ESCAPE = set('dDsSwW')

# Letters that form a control escape when preceded by a backslash.
CONTROL_ESCAPE_CHARS = set('fnrtv')

# Every ASCII letter, lower case then upper case.
CONTROL_LETTERS = set(
    'abcdefghijklmnopqrstuvwxyz'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
)


def SpecialChar(char):
    '''Build a 'SpecialChar' node whose 'content' is *char*.'''
    node = dict(type='SpecialChar', content=char)
    return node


def isPatternCharacter(char):
    '''Return True unless *char* is a member of NOT_PATTERN_CHARS.'''
    if char in NOT_PATTERN_CHARS:
        return False
    return True


class JsRegExpParser:
    '''Recursive-descent parser for the source text of a JavaScript regexp.

    parsePattern() returns a tree of plain dicts. Every node has a 'type'
    key; child nodes live under 'contents' (Pattern, Disjunction,
    Alternative, Term, Group, Lookached, Quantifier) or 'content'
    (Assertion, Atom, SpecialChar, CharacterClass, BackReference).

    Malformed input raises SyntaxError. Every parse step either consumes
    input or raises, so parsing always terminates.
    '''

    _DECIMAL_DIGITS = frozenset('0123456789')
    _HEX_DIGITS = frozenset('0123456789abcdefABCDEF')

    def __init__(self, source, flags):
        '''Store the pattern text and flags and reset the read position.'''
        self.source = source
        self.flags = flags
        self.index = 0
        self.length = len(source)
        self.lineNumber = 0
        self.lineStart = 0

    def throwUnexpected(self, expected=None):
        '''Raise SyntaxError describing the input at the current index.

        *expected*, when given, names the character the caller wanted.
        '''
        if self.isEOF():
            found = 'end of pattern'
        else:
            found = repr(self.source[self.index])
        message = 'Invalid regular expression: unexpected %s at index %d' % (
            found, self.index)
        if expected is not None:
            message += ' (expected %r)' % (expected,)
        raise SyntaxError(message)

    def parsePattern(self):
        '''Parse the whole source and return the root 'Pattern' node.

        Raises SyntaxError if anything is left over after the top-level
        disjunction (which can only be an unmatched ')').
        '''
        disjunction = self.parseDisjunction()
        if not self.isEOF():
            self.throwUnexpected()
        return {'type': 'Pattern', 'contents': disjunction}

    def parseDisjunction(self):
        '''Parse one or more alternatives separated by '|'.

        Stops without consuming at end of input or at a ')' so that the
        caller that opened the group can match the closing parenthesis.
        '''
        alternatives = [self.parseAlternative()]
        while self.follows('|'):
            self.index += 1
            alternatives.append(self.parseAlternative())
        return {'type': 'Disjunction', 'contents': alternatives}

    def isEOF(self):
        '''Return True once the read position is at or past the end.'''
        return self.index >= self.length

    def expect_character(self, character):
        '''Consume *character* or raise SyntaxError if it is not next.'''
        if self.isEOF() or self.source[self.index] != character:
            self.throwUnexpected(character)
        self.index += 1

    def parseAlternative(self):
        '''Parse a (possibly empty) run of terms.

        The run ends at '|', at ')' or at end of input; none of them is
        consumed here.
        '''
        contents = []
        while not self.isEOF() and self.source[self.index] not in ('|', ')'):
            contents.append(self.parseTerm())
        return {'type': 'Alternative', 'contents': contents}

    def follows(self, chars):
        '''Return True if the input at the current index starts with *chars*.'''
        for i, c in enumerate(chars):
            position = self.index + i
            if position >= self.length or self.source[position] != c:
                return False
        return True

    def parseTerm(self):
        '''Parse an assertion, or failing that an atom wrapped in a 'Term'.'''
        assertion = self.parseAssertion()
        if assertion is not None:
            return assertion
        # The quantifier, if any, is stored inside the atom node.
        return {'type': 'Term', 'contents': self.parseAtom()}

    def parseAssertion(self):
        '''Parse '$', '^', '\\b', '\\B' or a lookahead group.

        Returns an 'Assertion' node, or None (consuming nothing) when the
        input does not start with an assertion.
        '''
        for token in ('$', '^', '\\b', '\\B'):
            if self.follows(token):
                self.index += len(token)
                return {'type': 'Assertion', 'content': SpecialChar(token)}
        for token, negated in (('(?=', False), ('(?!', True)):
            if self.follows(token):
                self.index += len(token)
                dis = self.parseDisjunction()
                self.expect_character(')')
                # 'Lookached' is the node name this parser has always
                # emitted; the spelling is kept so consumers keep working.
                content = {
                    'type': 'Lookached',
                    'contents': dis,
                    'negated': negated
                }
                return {'type': 'Assertion', 'content': content}
        return None

    def _parseGroup(self, capturing):
        '''Parse a group body; the opening token is already consumed.'''
        dis = self.parseDisjunction()
        self.expect_character(')')
        return {'type': 'Group', 'contents': dis, 'capturing': capturing}

    def parseAtom(self):
        '''Parse a single atom followed by its optional quantifier.

        Returns an 'Atom' node whose 'content' is a SpecialChar node, a
        'Group', 'CharacterClass' or 'BackReference' node, or a plain
        one-character string for a literal. Raises SyntaxError when the
        next character cannot start an atom (for example a stray '*').
        '''
        if self.follows('.'):
            content = SpecialChar('.')
            self.index += 1
        elif self.follows('\\'):
            self.index += 1
            content = self.parseAtomEscape()
        elif self.follows('['):
            content = self.parseCharacterClass()
        elif self.follows('(?:'):
            self.index += 3
            content = self._parseGroup(capturing=False)
        elif self.follows('('):
            self.index += 1
            content = self._parseGroup(capturing=True)
        elif not self.isEOF() and isPatternCharacter(self.source[self.index]):
            content = self.source[self.index]
            self.index += 1
        else:
            # Returning without consuming would make the caller spin forever.
            self.throwUnexpected()
        quantifier = self.parseQuantifier()
        return {'type': 'Atom', 'content': content, 'quantifier': quantifier}

    def parseQuantifier(self):
        '''Parse an optional quantifier and its lazy '?' suffix.

        Returns a 'Quantifier' node, or None when no quantifier follows.
        '''
        prefix = self.parseQuantifierPrefix()
        if prefix is None:
            return None
        greedy = True
        if self.follows('?'):
            self.index += 1
            greedy = False
        return {'type': 'Quantifier', 'contents': prefix, 'greedy': greedy}

    def parseQuantifierPrefix(self):
        '''Parse '+', '?', '*' or a '{min}', '{min,}', '{min,max}' range.

        Returns the symbol itself, or a (min, max) tuple where max is
        None for the open-ended '{min,}' form and equals min for '{min}'.
        Returns None, with the position restored, when the input is not a
        well-formed quantifier.
        '''
        if self.isEOF():
            return None
        ch = self.source[self.index]
        if ch in ('+', '?', '*'):
            self.index += 1
            return ch
        if ch != '{':
            return None
        start = self.index
        self.index += 1
        low = self.scanDecimalDigs()
        has_comma = False
        high = ''
        if low and self.follows(','):
            has_comma = True
            self.index += 1
            high = self.scanDecimalDigs()
        if not low or not self.follows('}'):
            # Not a quantifier after all: rewind so '{' is read as a literal.
            self.index = start
            return None
        self.index += 1
        minimum = int(low)
        if not has_comma:
            return minimum, minimum
        return minimum, (int(high) if high else None)

    def parseAtomEscape(self):
        '''Parse what follows a backslash inside an atom.

        A non-zero decimal number yields a 'BackReference' node, one of
        CHAR_CLASS_ESCAPE yields a SpecialChar, anything else is handled
        by parseCharacterEscape(). Raises SyntaxError on a trailing
        backslash.
        '''
        if self.isEOF():
            self.throwUnexpected()
        ch = self.source[self.index]
        if ch in self._DECIMAL_DIGITS and ch != '0':
            return {
                'type': 'BackReference',
                'content': int(self.scanDecimalDigs())
            }
        if ch in CHAR_CLASS_ESCAPE:
            self.index += 1
            return SpecialChar('\\' + ch)
        return self.parseCharacterEscape()

    def parseCharacterEscape(self):
        '''Parse a character escape; the backslash is already consumed.

        Control escapes, '\\0', '\\cX', '\\xHH' and '\\uHHHH' come back as
        SpecialChar nodes holding the escape text. Any other character is
        an identity escape and comes back as that character itself.
        '''
        if self.isEOF():
            self.throwUnexpected()
        ch = self.source[self.index]
        self.index += 1
        if ch in CONTROL_ESCAPE_CHARS or ch == '0':
            return SpecialChar('\\' + ch)
        if (ch == 'c' and not self.isEOF()
                and self.source[self.index] in CONTROL_LETTERS):
            letter = self.source[self.index]
            self.index += 1
            return SpecialChar('\\c' + letter)
        if ch in ('x', 'u'):
            width = 2 if ch == 'x' else 4
            digits = self.source[self.index:self.index + width]
            if len(digits) == width and all(
                    d in self._HEX_DIGITS for d in digits):
                self.index += width
                return SpecialChar('\\' + ch + digits)
        return ch

    def parseCharacterClass(self):
        '''Parse a '[...]' character class.

        Returns a 'CharacterClass' node holding the raw, unparsed text
        between the brackets (without a leading '^') and a 'negated'
        flag. Raises SyntaxError when the closing ']' is missing.
        '''
        self.expect_character('[')
        negated = self.follows('^')
        if negated:
            self.index += 1
        body_start = self.index
        while not self.isEOF() and self.source[self.index] != ']':
            # A backslash protects the next character, including ']'.
            self.index += 2 if self.source[self.index] == '\\' else 1
        if self.isEOF():
            self.index = self.length
            self.throwUnexpected(']')
        body = self.source[body_start:self.index]
        self.index += 1
        return {'type': 'CharacterClass', 'content': body, 'negated': negated}

    def scanDecimalDigs(self):
        '''Consume a run of ASCII digits and return it ('' if none).'''
        s = self.index
        # ASCII digits only, so int() on the result can never fail.
        while not self.isEOF() and self.source[
                self.index] in self._DECIMAL_DIGITS:
            self.index += 1
        return self.source[s:self.index]


# Manual smoke test. Guarded so that merely importing this module neither
# runs the parser nor prints anything.
if __name__ == '__main__':
    a = JsRegExpParser('a(?=x)', '')
    print(a.parsePattern())
core: update to subliminal_patch:head; replace cfscrape; add dependencies 2019-04-11 00:02:14 +00:00			`from pyjsparser.pyjsparserdata import *`

			`REGEXP_SPECIAL_SINGLE = {'\\', '^', '$', '*', '+', '?', '.'}`

			`NOT_PATTERN_CHARS = {`
			`'^', '$', '\\', '.', '*', '+', '?', '(', ')', '[', ']', '\|'`
			`} # what about '{', '}', ???`

			`CHAR_CLASS_ESCAPE = {'d', 'D', 's', 'S', 'w', 'W'}`
			`CONTROL_ESCAPE_CHARS = {'f', 'n', 'r', 't', 'v'}`
			`CONTROL_LETTERS = {`
			`'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',`
			`'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D',`
			`'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',`
			`'T', 'U', 'V', 'W', 'X', 'Y', 'Z'`
			`}`


			`def SpecialChar(char):`
			`return {'type': 'SpecialChar', 'content': char}`


			`def isPatternCharacter(char):`
			`return char not in NOT_PATTERN_CHARS`


			`class JsRegExpParser:`
			`def __init__(self, source, flags):`
			`self.source = source`
			`self.flags = flags`
			`self.index = 0`
			`self.length = len(source)`
			`self.lineNumber = 0`
			`self.lineStart = 0`

			`def parsePattern(self):`
			`'''Perform sctring escape - for regexp literals'''`
			`return {'type': 'Pattern', 'contents': self.parseDisjunction()}`

			`def parseDisjunction(self):`
			`alternatives = []`
			`while True:`
			`alternatives.append(self.parseAlternative())`
			`if not self.isEOF():`
			`self.expect_character('\|')`
			`else:`
			`break`
			`return {'type': 'Disjunction', 'contents': alternatives}`

			`def isEOF(self):`
			`if self.index >= self.length:`
			`return True`
			`return False`

			`def expect_character(self, character):`
			`if self.source[self.index] != character:`
			`self.throwUnexpected(character)`
			`self.index += 1`

			`def parseAlternative(self):`
			`contents = []`
			`while not self.isEOF() and self.source[self.index] != '\|':`
			`contents.append(self.parseTerm())`
			`return {'type': 'Alternative', 'contents': contents}`

			`def follows(self, chars):`
			`for i, c in enumerate(chars):`
			`if self.index + i >= self.length or self.source[self.index +`
			`i] != c:`
			`return False`
			`return True`

			`def parseTerm(self):`
			`assertion = self.parseAssertion()`
			`if assertion:`
			`return assertion`
			`else:`
			`return {`
			`'type': 'Term',`
			`'contents': self.parseAtom()`
			`} # quantifier will go inside atom!`

			`def parseAssertion(self):`
			`if self.follows('$'):`
			`content = SpecialChar('$')`
			`self.index += 1`
			`elif self.follows('^'):`
			`content = SpecialChar('^')`
			`self.index += 1`
			`elif self.follows('\\b'):`
			`content = SpecialChar('\\b')`
			`self.index += 2`
			`elif self.follows('\\B'):`
			`content = SpecialChar('\\B')`
			`self.index += 2`
			`elif self.follows('(?='):`
			`self.index += 3`
			`dis = self.parseDisjunction()`
			`self.expect_character(')')`
			`content = {'type': 'Lookached', 'contents': dis, 'negated': False}`
			`elif self.follows('(?!'):`
			`self.index += 3`
			`dis = self.parseDisjunction()`
			`self.expect_character(')')`
			`content = {'type': 'Lookached', 'contents': dis, 'negated': True}`
			`else:`
			`return None`
			`return {'type': 'Assertion', 'content': content}`

			`def parseAtom(self):`
			`if self.follows('.'):`
			`content = SpecialChar('.')`
			`self.index += 1`
			`elif self.follows('\\'):`
			`self.index += 1`
			`content = self.parseAtomEscape()`
			`elif self.follows('['):`
			`content = self.parseCharacterClass()`
			`elif self.follows('(?:'):`
			`self.index += 3`
			`dis = self.parseDisjunction()`
			`self.expect_character(')')`
			`content = 'idk'`
			`elif self.follows('('):`
			`self.index += 1`
			`dis = self.parseDisjunction()`
			`self.expect_character(')')`
			`content = 'idk'`
			`elif isPatternCharacter(self.source[self.index]):`
			`content = self.source[self.index]`
			`self.index += 1`
			`else:`
			`return None`
			`quantifier = self.parseQuantifier()`
			`return {'type': 'Atom', 'content': content, 'quantifier': quantifier}`

			`def parseQuantifier(self):`
			`prefix = self.parseQuantifierPrefix()`
			`if not prefix:`
			`return None`
			`greedy = True`
			`if self.follows('?'):`
			`self.index += 1`
			`greedy = False`
			`return {'type': 'Quantifier', 'contents': prefix, 'greedy': greedy}`

			`def parseQuantifierPrefix(self):`
			`if self.isEOF():`
			`return None`
			`if self.follows('+'):`
			`content = '+'`
			`self.index += 1`
			`elif self.follows('?'):`
			`content = '?'`
			`self.index += 1`
			`elif self.follows('*'):`
			`content = '*'`
			`self.index += 1`
			`elif self.follows(`
			`'{'`
			`): # try matching otherwise return None and restore the state`
			`i = self.index`
			`self.index += 1`
			`digs1 = self.scanDecimalDigs()`
			`# if no minimal number of digs provided then return no quantifier`
			`if not digs1:`
			`self.index = i`
			`return None`
			`# scan char limit if provided`
			`if self.follows(','):`
			`self.index += 1`
			`digs2 = self.scanDecimalDigs()`
			`else:`
			`digs2 = ''`
			`# must be valid!`
			`if not self.follows('}'):`
			`self.index = i`
			`return None`
			`else:`
			`self.expect_character('}')`
			`content = int(digs1), int(digs2) if digs2 else None`
			`else:`
			`return None`
			`return content`

			`def parseAtomEscape(self):`
			`ch = self.source[self.index]`
			`if isDecimalDigit(ch) and ch != 0:`
			`digs = self.scanDecimalDigs()`
			`elif ch in CHAR_CLASS_ESCAPE:`
			`self.index += 1`
			`return SpecialChar('\\' + ch)`
			`else:`
			`return self.parseCharacterEscape()`

			`def parseCharacterEscape(self):`
			`ch = self.source[self.index]`
			`if ch in CONTROL_ESCAPE_CHARS:`
			`return SpecialChar('\\' + ch)`
			`if ch == 'c':`
			`'ok, fuck this shit.'`

			`def scanDecimalDigs(self):`
			`s = self.index`
			`while not self.isEOF() and isDecimalDigit(self.source[self.index]):`
			`self.index += 1`
			`return self.source[s:self.index]`


			`a = JsRegExpParser('a(?=x)', '')`
			`print(a.parsePattern())`