mylar/lib/js2py/legecy_translators/constants.py

from string import ascii_lowercase, digits
##################################
StringName = u'PyJsConstantString%d_'
NumberName = u'PyJsConstantNumber%d_'
RegExpName = u'PyJsConstantRegExp%d_'
##################################
ALPHAS = set(ascii_lowercase+ ascii_lowercase.upper())
NUMS = set(digits)
IDENTIFIER_START = ALPHAS.union(NUMS)
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}
HEX = set('0123456789abcdefABCDEF')
from utils import *
IDENTIFIER_PART  = IDENTIFIER_PART.union({'.'})


def _is_cancelled(source, n):
    cancelled = False
    k = 0
    while True:
        k+=1
        if source[n-k]!='\\':
            break
        cancelled = not cancelled
    return cancelled

def _ensure_regexp(source, n): #<- this function has to be improved
    '''returns True if regexp starts at n else returns False
      checks whether it is not a division '''
    markers = '(+~"\'=[%:?!*^|&-,;/\\'
    k = 0
    while True:
        k+=1
        if n-k<0:
            return True
        char = source[n-k]
        if char in markers:
            return True
        if char!=' ' and char!='\n':
            break
    return False

def parse_num(source, start, charset):
    """Returns a first index>=start of chat not in charset"""
    while start<len(source) and source[start] in charset:
        start+=1
    return start

def parse_exponent(source, start):
    """returns end of exponential, raises SyntaxError if failed"""
    if not source[start] in {'e', 'E'}:
        if source[start] in IDENTIFIER_PART:
            raise SyntaxError('Invalid number literal!')
        return start
    start += 1
    if source[start] in {'-', '+'}:
        start += 1
    FOUND = False
    # we need at least one dig after exponent
    while source[start] in NUMS:
        FOUND = True
        start+=1
    if not FOUND or source[start] in IDENTIFIER_PART:
        raise SyntaxError('Invalid number literal!')
    return start

def remove_constants(source):
    '''Replaces Strings and Regexp literals in the source code with
       identifiers and *removes comments*. Identifier is of the format:

       PyJsStringConst(String const number)_ - for Strings
       PyJsRegExpConst(RegExp const number)_ - for RegExps

       Returns dict which relates identifier and replaced constant.

       Removes single line and multiline comments from JavaScript source code
       Pseudo comments (inside strings) will not be removed.

       For example this line:
       var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"
       will be unaltered'''
    source=' '+source+'\n'
    comments = []
    inside_comment, single_comment = False, False
    inside_single, inside_double = False, False
    inside_regexp = False
    regexp_class_count = 0
    n = 0
    while n < len(source):
        char = source[n]
        if char=='"' and not (inside_comment or inside_single or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_double:
                    inside_double[1] = n+1
                    comments.append(inside_double)
                    inside_double = False
                else:
                    inside_double = [n, None, 0]
        elif char=="'"  and not (inside_comment  or inside_double or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_single:
                    inside_single[1] = n+1
                    comments.append(inside_single)
                    inside_single = False
                else:
                    inside_single = [n, None, 0]
        elif  (inside_single or inside_double):
            if char in LINE_TERMINATOR:
                if _is_cancelled(source, n):
                    if char==CR and source[n+1]==LF:
                        n+=1
                    n+=1
                    continue
                else:
                    raise SyntaxError('Invalid string literal. Line terminators must be escaped!')
        else:
            if inside_comment:
                if single_comment:
                    if char in LINE_TERMINATOR:
                        inside_comment[1] = n
                        comments.append(inside_comment)
                        inside_comment = False
                        single_comment = False
                else: # Multiline
                    if char=='/' and source[n-1]=='*':
                        inside_comment[1] = n+1
                        comments.append(inside_comment)
                        inside_comment = False
            elif inside_regexp:
                if not quiting_regexp:
                    if char in LINE_TERMINATOR:
                        raise SyntaxError('Invalid regexp literal. Line terminators cant appear!')
                    if _is_cancelled(source, n):
                        n+=1
                        continue
                    if char=='[':
                        regexp_class_count += 1
                    elif char==']':
                        regexp_class_count = max(regexp_class_count-1, 0)
                    elif  char=='/' and not regexp_class_count:
                        quiting_regexp = True
                else:
                    if char not in IDENTIFIER_START:
                        inside_regexp[1] = n
                        comments.append(inside_regexp)
                        inside_regexp = False
            elif char=='/' and source[n-1]=='/':
                single_comment = True
                inside_comment = [n-1, None, 1]
            elif char=='*' and source[n-1]=='/':
                inside_comment = [n-1, None, 1]
            elif char=='/' and source[n+1] not in ('/', '*'):
                if not _ensure_regexp(source, n): #<- improve this one
                    n+=1
                    continue #Probably just a division
                quiting_regexp = False
                inside_regexp = [n, None, 2]
            elif not (inside_comment or inside_regexp):
                if (char in NUMS  and source[n-1] not in IDENTIFIER_PART) or char=='.':
                    if char=='.':
                        k = parse_num(source,n+1, NUMS)
                        if k==n+1: # just a stupid dot...
                            n+=1
                            continue
                        k = parse_exponent(source, k)
                    elif char=='0' and source[n+1] in {'x', 'X'}: #Hex number probably
                        k = parse_num(source, n+2, HEX)
                        if k==n+2 or source[k] in IDENTIFIER_PART:
                            raise SyntaxError('Invalid hex literal!')
                    else: #int or exp or flot or exp flot
                        k = parse_num(source, n+1, NUMS)
                        if source[k]=='.':
                            k = parse_num(source, k+1, NUMS)
                        k = parse_exponent(source, k)
                    comments.append((n, k, 3))
                    n = k
                    continue
        n+=1
    res = ''
    start = 0
    count = 0
    constants = {}
    for end, next_start, typ in comments:
        res += source[start:end]
        start = next_start
        if typ==0: # String
            name = StringName
        elif typ==1: # comment
            continue
        elif typ==2: # regexp
            name = RegExpName
        elif typ==3: # number
            name = NumberName
        else:
            raise RuntimeError()
        res += ' '+name % count+' '
        constants[name % count] = source[end: next_start]
        count += 1
    res+=source[start:]
    # remove this stupid white space
    for e in WHITE:
        res = res.replace(e, ' ')
    res = res.replace(CR+LF, '\n')
    for e in LINE_TERMINATOR:
        res = res.replace(e, '\n')
    return res.strip(), constants


def recover_constants(py_source, replacements): #now has n^2 complexity. improve to n
    '''Converts identifiers representing Js constants to the PyJs constants
    PyJsNumberConst_1_ which has the true value of 5 will be converted to PyJsNumber(5)'''
    for identifier, value in replacements.iteritems():
        if identifier.startswith('PyJsConstantRegExp'):
            py_source = py_source.replace(identifier, 'JsRegExp(%s)'%repr(value))
        elif identifier.startswith('PyJsConstantString'):
            py_source = py_source.replace(identifier, 'Js(u%s)' % unify_string_literals(value))
        else:
            py_source = py_source.replace(identifier, 'Js(%s)'%value)
    return py_source


def unify_string_literals(js_string):
    """this function parses the string just like javascript
       for example literal '\d' in JavaScript would be interpreted
       as 'd' - backslash would be ignored and in Pyhon this
       would be interpreted as '\\d' This function fixes this problem."""
    n = 0
    res = ''
    limit = len(js_string)
    while n < limit:
        char = js_string[n]
        if char=='\\':
            new, n = do_escape(js_string, n)
            res += new
        else:
            res += char
            n += 1
    return res

def unify_regexp_literals(js):
    pass


def do_escape(source, n):
    """Its actually quite complicated to cover every case :)
       http://www.javascriptkit.com/jsref/escapesequence.shtml"""
    if not n+1 < len(source):
        return '' # not possible here but can be possible in general case.
    if source[n+1] in LINE_TERMINATOR:
        if source[n+1]==CR and n+2<len(source) and source[n+2]==LF:
            return source[n:n+3], n+3
        return source[n:n+2], n+2
    if source[n+1] in ESCAPE_CHARS:
        return source[n:n+2], n+2
    if source[n+1]in {'x', 'u'}:
        char, length = ('u', 4) if source[n+1]=='u' else ('x', 2)
        n+=2
        end = parse_num(source, n, HEX)
        if end-n < length:
            raise SyntaxError('Invalid escape sequence!')
        #if length==4:
        #    return unichr(int(source[n:n+4], 16)), n+4 # <- this was a very bad way of solving this problem :)
        return source[n-2:n+length], n+length
    if source[n+1] in OCTAL:
        n += 1
        end = parse_num(source, n, OCTAL)
        end = min(end, n+3) # cant be longer than 3
        # now the max allowed is 377 ( in octal) and 255 in decimal
        max_num = 255
        num = 0
        len_parsed = 0
        for e in source[n:end]:
            cand = 8*num + int(e)
            if cand > max_num:
                break
            num = cand
            len_parsed += 1
        # we have to return in a different form because python may want to parse more...
        # for example '\777' will be parsed by python as a whole while js will use only \77
        return '\\' + hex(num)[1:], n + len_parsed
    return source[n+1], n+2


#####TEST######

if __name__=='__main__':
    test = ('''
    ''')

    t, d = remove_constants(test)
    print t, d
FIX:(#1358) nzbname error when retrieving nzb and assigning filename from WWT tracker, FIX:(#1372)(#1369) Fix for creating series folder when adding a series when series title contains either double quotation marks, or an asterisk, FIX:(#1373) Filechecker would ignore filenames that had the extension captialized, FIX:(#1366) When Comic Publisher is not provided on CV, would error during add, FIX: Attempted fix for unicode characters when importing (series title, filenames), FIX: Removed str references that would cause an error on weekly pull in some instances, FIX: When checking for watched series, if series title being checked against had only one word, would cause a traceback error, FIX: When attempting to retrieve results/torrents from TPSE and was behind cloudflare, would error out, IMP: file-size check now works for 32p feeds, FIX: When pullist issue was marked as Wanted and issue was populated on series detail page, occassionaly would not have the same status of Wanted, FIX: Fixed incorrect placement of Comic Location title in GUI, IMP: Added short description for Search Delay option within GUI, FIX:(#1370) multiple selection from Manage Comics tab (Refresh/Delete/Pause) would only select one item 2016-09-06 15:06:07 +00:00			`from string import ascii_lowercase, digits`
			`##################################`
			`StringName = u'PyJsConstantString%d_'`
			`NumberName = u'PyJsConstantNumber%d_'`
			`RegExpName = u'PyJsConstantRegExp%d_'`
			`##################################`
			`ALPHAS = set(ascii_lowercase+ ascii_lowercase.upper())`
			`NUMS = set(digits)`
			`IDENTIFIER_START = ALPHAS.union(NUMS)`
			`ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}`
			`OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}`
			`HEX = set('0123456789abcdefABCDEF')`
			`from utils import *`
			`IDENTIFIER_PART = IDENTIFIER_PART.union({'.'})`


			`def _is_cancelled(source, n):`
			`cancelled = False`
			`k = 0`
			`while True:`
			`k+=1`
			`if source[n-k]!='\\':`
			`break`
			`cancelled = not cancelled`
			`return cancelled`

			`def _ensure_regexp(source, n): #<- this function has to be improved`
			`'''returns True if regexp starts at n else returns False`
			`checks whether it is not a division '''`
			`markers = '(+~"\'=[%:?!*^\|&-,;/\\'`
			`k = 0`
			`while True:`
			`k+=1`
			`if n-k<0:`
			`return True`
			`char = source[n-k]`
			`if char in markers:`
			`return True`
			`if char!=' ' and char!='\n':`
			`break`
			`return False`

			`def parse_num(source, start, charset):`
			`"""Returns a first index>=start of chat not in charset"""`
			`while start<len(source) and source[start] in charset:`
			`start+=1`
			`return start`

			`def parse_exponent(source, start):`
			`"""returns end of exponential, raises SyntaxError if failed"""`
			`if not source[start] in {'e', 'E'}:`
			`if source[start] in IDENTIFIER_PART:`
			`raise SyntaxError('Invalid number literal!')`
			`return start`
			`start += 1`
			`if source[start] in {'-', '+'}:`
			`start += 1`
			`FOUND = False`
			`# we need at least one dig after exponent`
			`while source[start] in NUMS:`
			`FOUND = True`
			`start+=1`
			`if not FOUND or source[start] in IDENTIFIER_PART:`
			`raise SyntaxError('Invalid number literal!')`
			`return start`

			`def remove_constants(source):`
			`'''Replaces Strings and Regexp literals in the source code with`
			`identifiers and removes comments. Identifier is of the format:`

			`PyJsStringConst(String const number)_ - for Strings`
			`PyJsRegExpConst(RegExp const number)_ - for RegExps`

			`Returns dict which relates identifier and replaced constant.`

			`Removes single line and multiline comments from JavaScript source code`
			`Pseudo comments (inside strings) will not be removed.`

			`For example this line:`
			`var x = "/PSEUDO COMMENT/ TEXT //ANOTHER PSEUDO COMMENT"`
			`will be unaltered'''`
			`source=' '+source+'\n'`
			`comments = []`
			`inside_comment, single_comment = False, False`
			`inside_single, inside_double = False, False`
			`inside_regexp = False`
			`regexp_class_count = 0`
			`n = 0`
			`while n < len(source):`
			`char = source[n]`
			`if char=='"' and not (inside_comment or inside_single or inside_regexp):`
			`if not _is_cancelled(source, n):`
			`if inside_double:`
			`inside_double[1] = n+1`
			`comments.append(inside_double)`
			`inside_double = False`
			`else:`
			`inside_double = [n, None, 0]`
			`elif char=="'" and not (inside_comment or inside_double or inside_regexp):`
			`if not _is_cancelled(source, n):`
			`if inside_single:`
			`inside_single[1] = n+1`
			`comments.append(inside_single)`
			`inside_single = False`
			`else:`
			`inside_single = [n, None, 0]`
			`elif (inside_single or inside_double):`
			`if char in LINE_TERMINATOR:`
			`if _is_cancelled(source, n):`
			`if char==CR and source[n+1]==LF:`
			`n+=1`
			`n+=1`
			`continue`
			`else:`
			`raise SyntaxError('Invalid string literal. Line terminators must be escaped!')`
			`else:`
			`if inside_comment:`
			`if single_comment:`
			`if char in LINE_TERMINATOR:`
			`inside_comment[1] = n`
			`comments.append(inside_comment)`
			`inside_comment = False`
			`single_comment = False`
			`else: # Multiline`
			`if char=='/' and source[n-1]=='*':`
			`inside_comment[1] = n+1`
			`comments.append(inside_comment)`
			`inside_comment = False`
			`elif inside_regexp:`
			`if not quiting_regexp:`
			`if char in LINE_TERMINATOR:`
			`raise SyntaxError('Invalid regexp literal. Line terminators cant appear!')`
			`if _is_cancelled(source, n):`
			`n+=1`
			`continue`
			`if char=='[':`
			`regexp_class_count += 1`
			`elif char==']':`
			`regexp_class_count = max(regexp_class_count-1, 0)`
			`elif char=='/' and not regexp_class_count:`
			`quiting_regexp = True`
			`else:`
			`if char not in IDENTIFIER_START:`
			`inside_regexp[1] = n`
			`comments.append(inside_regexp)`
			`inside_regexp = False`
			`elif char=='/' and source[n-1]=='/':`
			`single_comment = True`
			`inside_comment = [n-1, None, 1]`
			`elif char=='*' and source[n-1]=='/':`
			`inside_comment = [n-1, None, 1]`
			`elif char=='/' and source[n+1] not in ('/', '*'):`
			`if not _ensure_regexp(source, n): #<- improve this one`
			`n+=1`
			`continue #Probably just a division`
			`quiting_regexp = False`
			`inside_regexp = [n, None, 2]`
			`elif not (inside_comment or inside_regexp):`
			`if (char in NUMS and source[n-1] not in IDENTIFIER_PART) or char=='.':`
			`if char=='.':`
			`k = parse_num(source,n+1, NUMS)`
			`if k==n+1: # just a stupid dot...`
			`n+=1`
			`continue`
			`k = parse_exponent(source, k)`
			`elif char=='0' and source[n+1] in {'x', 'X'}: #Hex number probably`
			`k = parse_num(source, n+2, HEX)`
			`if k==n+2 or source[k] in IDENTIFIER_PART:`
			`raise SyntaxError('Invalid hex literal!')`
			`else: #int or exp or flot or exp flot`
			`k = parse_num(source, n+1, NUMS)`
			`if source[k]=='.':`
			`k = parse_num(source, k+1, NUMS)`
			`k = parse_exponent(source, k)`
			`comments.append((n, k, 3))`
			`n = k`
			`continue`
			`n+=1`
			`res = ''`
			`start = 0`
			`count = 0`
			`constants = {}`
			`for end, next_start, typ in comments:`
			`res += source[start:end]`
			`start = next_start`
			`if typ==0: # String`
			`name = StringName`
			`elif typ==1: # comment`
			`continue`
			`elif typ==2: # regexp`
			`name = RegExpName`
			`elif typ==3: # number`
			`name = NumberName`
			`else:`
			`raise RuntimeError()`
			`res += ' '+name % count+' '`
			`constants[name % count] = source[end: next_start]`
			`count += 1`
			`res+=source[start:]`
			`# remove this stupid white space`
			`for e in WHITE:`
			`res = res.replace(e, ' ')`
			`res = res.replace(CR+LF, '\n')`
			`for e in LINE_TERMINATOR:`
			`res = res.replace(e, '\n')`
			`return res.strip(), constants`


			`def recover_constants(py_source, replacements): #now has n^2 complexity. improve to n`
			`'''Converts identifiers representing Js constants to the PyJs constants`
			`PyJsNumberConst_1_ which has the true value of 5 will be converted to PyJsNumber(5)'''`
			`for identifier, value in replacements.iteritems():`
			`if identifier.startswith('PyJsConstantRegExp'):`
			`py_source = py_source.replace(identifier, 'JsRegExp(%s)'%repr(value))`
			`elif identifier.startswith('PyJsConstantString'):`
			`py_source = py_source.replace(identifier, 'Js(u%s)' % unify_string_literals(value))`
			`else:`
			`py_source = py_source.replace(identifier, 'Js(%s)'%value)`
			`return py_source`


			`def unify_string_literals(js_string):`
			`"""this function parses the string just like javascript`
			`for example literal '\d' in JavaScript would be interpreted`
			`as 'd' - backslash would be ignored and in Pyhon this`
			`would be interpreted as '\\d' This function fixes this problem."""`
			`n = 0`
			`res = ''`
			`limit = len(js_string)`
			`while n < limit:`
			`char = js_string[n]`
			`if char=='\\':`
			`new, n = do_escape(js_string, n)`
			`res += new`
			`else:`
			`res += char`
			`n += 1`
			`return res`

			`def unify_regexp_literals(js):`
			`pass`


			`def do_escape(source, n):`
			`"""Its actually quite complicated to cover every case :)`
			`http://www.javascriptkit.com/jsref/escapesequence.shtml"""`
			`if not n+1 < len(source):`
			`return '' # not possible here but can be possible in general case.`
			`if source[n+1] in LINE_TERMINATOR:`
			`if source[n+1]==CR and n+2<len(source) and source[n+2]==LF:`
			`return source[n:n+3], n+3`
			`return source[n:n+2], n+2`
			`if source[n+1] in ESCAPE_CHARS:`
			`return source[n:n+2], n+2`
			`if source[n+1]in {'x', 'u'}:`
			`char, length = ('u', 4) if source[n+1]=='u' else ('x', 2)`
			`n+=2`
			`end = parse_num(source, n, HEX)`
			`if end-n < length:`
			`raise SyntaxError('Invalid escape sequence!')`
			`#if length==4:`
			`# return unichr(int(source[n:n+4], 16)), n+4 # <- this was a very bad way of solving this problem :)`
			`return source[n-2:n+length], n+length`
			`if source[n+1] in OCTAL:`
			`n += 1`
			`end = parse_num(source, n, OCTAL)`
			`end = min(end, n+3) # cant be longer than 3`
			`# now the max allowed is 377 ( in octal) and 255 in decimal`
			`max_num = 255`
			`num = 0`
			`len_parsed = 0`
			`for e in source[n:end]:`
			`cand = 8*num + int(e)`
			`if cand > max_num:`
			`break`
			`num = cand`
			`len_parsed += 1`
			`# we have to return in a different form because python may want to parse more...`
			`# for example '\777' will be parsed by python as a whole while js will use only \77`
			`return '\\' + hex(num)[1:], n + len_parsed`
			`return source[n+1], n+2`





			`#####TEST######`

			`if __name__=='__main__':`
			`test = ('''`
			`''')`

			`t, d = remove_constants(test)`
			`print t, d`