# mylar/lib/js2py/legecy_translators/constants.py

from string import ascii_lowercase, digits
##################################
# Templates for the placeholder identifiers that stand in for literals.
# The %d slot is filled with a running index (see remove_constants); the
# trailing underscore keeps e.g. index 1 from being a prefix of index 10.
StringName = u'PyJsConstantString%d_'
NumberName = u'PyJsConstantNumber%d_'
RegExpName = u'PyJsConstantRegExp%d_'
##################################
# ASCII letters (both cases) and decimal digits.
ALPHAS = set(ascii_lowercase+ ascii_lowercase.upper())
NUMS = set(digits)
# Letters plus digits. In this file it is only consulted by remove_constants
# to consume the flag characters that follow a closing regexp '/'.
# NOTE(review): `from utils import *` below may rebind this name if utils
# also exports IDENTIFIER_START -- confirm against utils.
IDENTIFIER_START = ALPHAS.union(NUMS)
# Characters that do_escape copies through unchanged as backslash + char.
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
# Digit sets used when scanning octal escapes and hex literals/escapes.
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}
HEX = set('0123456789abcdefABCDEF')
# The code below uses IDENTIFIER_PART, LINE_TERMINATOR, CR, LF and WHITE,
# none of which are defined in this file; they are expected to come from
# this star import.
from utils import *
# Treat '.' as an identifier character so that the number-literal checks
# below reject/skip digits that are glued to a dot-separated name.
IDENTIFIER_PART  = IDENTIFIER_PART.union({'.'})


def _is_cancelled(source, n):
    cancelled = False
    k = 0
    while True:
        k+=1
        if source[n-k]!='\\':
            break
        cancelled = not cancelled
    return cancelled

def _ensure_regexp(source, n): #<- this function has to be improved
    '''returns True if regexp starts at n else returns False
      checks whether it is not a division '''
    markers = '(+~"\'=[%:?!*^|&-,;/\\'
    k = 0
    while True:
        k+=1
        if n-k<0:
            return True
        char = source[n-k]
        if char in markers:
            return True
        if char!=' ' and char!='\n':
            break
    return False

def parse_num(source, start, charset):
    """Return the first index >= start whose character is not in charset.

    If every character from start onwards belongs to charset, the length of
    source is returned; if start is already past the end, start is returned
    unchanged."""
    limit = len(source)
    for idx in range(start, limit):
        if source[idx] not in charset:
            return idx
    return max(start, limit)

def parse_exponent(source, start):
    """Return the index just past an optional exponent starting at start.

    When source[start] is not 'e'/'E' there is no exponent and start is
    returned, unless that character is in IDENTIFIER_PART, in which case the
    number literal is malformed.

    Raises SyntaxError if the literal runs straight into an identifier
    character, or if the exponent marker is not followed by at least one
    digit (after an optional sign)."""
    pos = start
    if source[pos] not in ('e', 'E'):
        if source[pos] in IDENTIFIER_PART:
            raise SyntaxError('Invalid number literal!')
        return pos
    pos += 1  # step over the exponent marker
    if source[pos] in ('-', '+'):
        pos += 1
    digits_begin = pos
    while source[pos] in NUMS:
        pos += 1
    # At least one digit is required, and the literal must end cleanly.
    if pos == digits_begin or source[pos] in IDENTIFIER_PART:
        raise SyntaxError('Invalid number literal!')
    return pos

def remove_constants(source):
    '''Replaces string, regexp and number literals in JavaScript source code
       with placeholder identifiers and *removes comments*.

       Each literal is replaced by an identifier surrounded by single spaces:

       PyJsConstantString(index)_ - for Strings
       PyJsConstantNumber(index)_ - for Numbers
       PyJsConstantRegExp(index)_ - for RegExps

       The index is one counter shared by all three kinds.

       Returns a tuple (stripped_source, constants) where constants is a dict
       mapping each placeholder identifier to the exact literal text it
       replaced (strings keep their quotes, regexps their slashes and flags).

       Removes single line and multiline comments from JavaScript source code
       Pseudo comments (inside strings) will not be removed.

       For example in this line:
       var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"
       the string content is not treated as a comment; the whole string
       literal becomes one String placeholder.

       Raises SyntaxError for an unescaped line terminator inside a string,
       a line terminator inside a regexp, an invalid hex literal, or an
       invalid number literal (from parse_exponent).'''
    # Pad both ends: the scanner looks at source[n-1] and source[n+1], and
    # the trailing newline also terminates a final single-line comment.
    source=' '+source+'\n'
    # Despite the name this collects every span to cut out, as
    # [start, end, type] with type 0=string, 1=comment, 2=regexp, 3=number.
    comments = []
    # Each inside_* flag is False, or the (still open) span list while the
    # scanner is inside that construct.
    inside_comment, single_comment = False, False
    inside_single, inside_double = False, False
    inside_regexp = False
    # Nesting depth of [...] character classes inside the current regexp.
    regexp_class_count = 0
    n = 0
    while n < len(source):
        char = source[n]
        if char=='"' and not (inside_comment or inside_single or inside_regexp):
            # Unescaped double quote: either closes or opens a "..." string.
            if not _is_cancelled(source, n):
                if inside_double:
                    inside_double[1] = n+1
                    comments.append(inside_double)
                    inside_double = False
                else:
                    inside_double = [n, None, 0]
        elif char=="'"  and not (inside_comment  or inside_double or inside_regexp):
            # Same as above for '...' strings.
            if not _is_cancelled(source, n):
                if inside_single:
                    inside_single[1] = n+1
                    comments.append(inside_single)
                    inside_single = False
                else:
                    inside_single = [n, None, 0]
        elif  (inside_single or inside_double):
            # Inside a string only line terminators need attention.
            if char in LINE_TERMINATOR:
                if _is_cancelled(source, n):
                    # Backslash-newline continuation; CR LF is skipped as
                    # a single terminator.
                    if char==CR and source[n+1]==LF:
                        n+=1
                    n+=1
                    continue
                else:
                    raise SyntaxError('Invalid string literal. Line terminators must be escaped!')
        else:
            if inside_comment:
                if single_comment:
                    # A // comment ends at (and does not include) the
                    # line terminator.
                    if char in LINE_TERMINATOR:
                        inside_comment[1] = n
                        comments.append(inside_comment)
                        inside_comment = False
                        single_comment = False
                else: # Multiline comment ends just after the closing */
                    if char=='/' and source[n-1]=='*':
                        inside_comment[1] = n+1
                        comments.append(inside_comment)
                        inside_comment = False
            elif inside_regexp:
                if not quiting_regexp:
                    # Still in the regexp body.
                    if char in LINE_TERMINATOR:
                        raise SyntaxError('Invalid regexp literal. Line terminators cant appear!')
                    if _is_cancelled(source, n):
                        # Escaped character: it has no structural meaning.
                        n+=1
                        continue
                    if char=='[':
                        regexp_class_count += 1
                    elif char==']':
                        regexp_class_count = max(regexp_class_count-1, 0)
                    elif  char=='/' and not regexp_class_count:
                        # Closing slash (a '/' inside [...] does not count).
                        quiting_regexp = True
                else:
                    # After the closing slash: consume flag characters; the
                    # span ends at the first char not in IDENTIFIER_START.
                    if char not in IDENTIFIER_START:
                        inside_regexp[1] = n
                        comments.append(inside_regexp)
                        inside_regexp = False
            elif char=='/' and source[n-1]=='/':
                # '//' opens a single-line comment starting at the first slash.
                single_comment = True
                inside_comment = [n-1, None, 1]
            elif char=='*' and source[n-1]=='/':
                # '/*' opens a multi-line comment.
                inside_comment = [n-1, None, 1]
            elif char=='/' and source[n+1] not in ('/', '*'):
                # A lone slash is either a division or the start of a regexp;
                # _ensure_regexp makes a heuristic call (could be improved).
                if not _ensure_regexp(source, n):
                    n+=1
                    continue # Treated as a division operator
                quiting_regexp = False
                inside_regexp = [n, None, 2]
            elif not (inside_comment or inside_regexp):
                # Number literal: a digit that is not glued to a preceding
                # identifier character, or a '.' that may start e.g. ".5".
                if (char in NUMS  and source[n-1] not in IDENTIFIER_PART) or char=='.':
                    if char=='.':
                        k = parse_num(source,n+1, NUMS)
                        if k==n+1: # no digits follow: an ordinary dot
                            n+=1
                            continue
                        k = parse_exponent(source, k)
                    elif char=='0' and source[n+1] in {'x', 'X'}: # hex literal
                        k = parse_num(source, n+2, HEX)
                        if k==n+2 or source[k] in IDENTIFIER_PART:
                            raise SyntaxError('Invalid hex literal!')
                    else: # decimal integer, optional fraction, optional exponent
                        k = parse_num(source, n+1, NUMS)
                        if source[k]=='.':
                            k = parse_num(source, k+1, NUMS)
                        k = parse_exponent(source, k)
                    comments.append((n, k, 3))
                    # Resume scanning right after the literal.
                    n = k
                    continue
        n+=1
    # Rebuild the source: copy the text between recorded spans, drop comment
    # spans and substitute a placeholder for every literal span.
    res = ''
    start = 0
    count = 0
    constants = {}
    for end, next_start, typ in comments:
        res += source[start:end]
        start = next_start
        if typ==0: # String
            name = StringName
        elif typ==1: # comment: dropped, and it does not consume an index
            continue
        elif typ==2: # regexp
            name = RegExpName
        elif typ==3: # number
            name = NumberName
        else:
            raise RuntimeError()
        res += ' '+name % count+' '
        constants[name % count] = source[end: next_start]
        count += 1
    res+=source[start:]
    # Normalise whitespace: every WHITE character becomes a plain space and
    # every line terminator (CR LF counted once) becomes '\n'.
    for e in WHITE:
        res = res.replace(e, ' ')
    res = res.replace(CR+LF, '\n')
    for e in LINE_TERMINATOR:
        res = res.replace(e, '\n')
    return res.strip(), constants


def recover_constants(py_source, replacements):
    '''Substitutes placeholder identifiers in py_source with PyJs expressions.

    replacements maps each placeholder identifier to the original JavaScript
    literal text. Depending on the identifier prefix the placeholder becomes:

      PyJsConstantRegExp...  ->  JsRegExp(<repr of the literal>)
      PyJsConstantString...  ->  Js(u<string literal fixed by unify_string_literals>)
      anything else          ->  Js(<literal text>)   (numbers)

    All placeholders are substituted in a single pass over py_source, so the
    cost no longer grows with one full scan per identifier, and text that was
    just inserted (e.g. a string literal that happens to contain something
    that looks like a placeholder) is never rewritten a second time.

    Returns the rewritten source; py_source is returned unchanged when
    replacements is empty.'''
    import re  # local import: the module header does not import re

    if not replacements:
        return py_source

    def render(identifier):
        # Build the Python expression that replaces one placeholder.
        value = replacements[identifier]
        if identifier.startswith('PyJsConstantRegExp'):
            return 'JsRegExp(%s)' % repr(value)
        if identifier.startswith('PyJsConstantString'):
            return 'Js(u%s)' % unify_string_literals(value)
        return 'Js(%s)' % value

    # Longest names first so that, should one key be a prefix of another,
    # the alternation prefers the longer match.
    names = sorted(replacements, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(name) for name in names))
    # A function replacement avoids backslash processing of the new text.
    return pattern.sub(lambda match: render(match.group(0)), py_source)


def unify_string_literals(js_string):
    r"""Rewrite a JavaScript string literal so Python reads it the same way.

    JavaScript and Python disagree on some escape sequences: for instance
    JavaScript reads '\d' as plain 'd' (the backslash is dropped) whereas
    Python would keep the backslash. Every backslash sequence is therefore
    handed to do_escape, and all other characters are copied unchanged."""
    pieces = []
    pos, end = 0, len(js_string)
    while pos < end:
        current = js_string[pos]
        if current != '\\':
            pieces.append(current)
            pos += 1
            continue
        # do_escape returns the translated text and where to resume.
        translated, pos = do_escape(js_string, pos)
        pieces.append(translated)
    return ''.join(pieces)

def unify_regexp_literals(js):
    """Placeholder for regexp literal translation; not implemented.

    Ignores its argument and returns None."""
    return None


def do_escape(source, n):
    r"""Translate the JavaScript escape sequence that starts at source[n].

    source[n] is the backslash. The result is text meant to be placed inside
    a Python u'...' literal such that Python decodes it to the same
    character(s) JavaScript would.
    See http://www.javascriptkit.com/jsref/escapesequence.shtml

    Returns a (text, next_index) tuple, where next_index is the position in
    source at which scanning should resume.

    Raises SyntaxError when an x or u escape is followed by fewer than 2
    (respectively 4) hex digits."""
    if not n+1 < len(source):
        # Lone backslash at the very end of the input (cannot happen for a
        # quoted literal). Emit nothing and step past it; callers always
        # unpack a 2-tuple, so a tuple must be returned here as well.
        return '', n+1
    if source[n+1] in LINE_TERMINATOR:
        # Line continuation: keep backslash + terminator (CR LF as a unit).
        if source[n+1]==CR and n+2<len(source) and source[n+2]==LF:
            return source[n:n+3], n+3
        return source[n:n+2], n+2
    if source[n+1] in ESCAPE_CHARS:
        # Escapes copied through as backslash + character.
        return source[n:n+2], n+2
    if source[n+1] in {'x', 'u'}:
        length = 4 if source[n+1]=='u' else 2
        n+=2
        end = parse_num(source, n, HEX)
        if end-n < length:
            raise SyntaxError('Invalid escape sequence!')
        # The escape is kept in textual form rather than decoded here.
        return source[n-2:n+length], n+length
    if source[n+1] in OCTAL:
        n += 1
        end = parse_num(source, n, OCTAL)
        end = min(end, n+3) # cant be longer than 3
        # now the max allowed is 377 ( in octal) and 255 in decimal
        max_num = 255
        num = 0
        len_parsed = 0
        for e in source[n:end]:
            cand = 8*num + int(e)
            if cand > max_num:
                break
            num = cand
            len_parsed += 1
        # The value is re-emitted as a hex escape because Python may want to
        # consume more digits than JavaScript does: '\777' is read by Python
        # as a whole while js uses only \77. A Python \x escape needs exactly
        # two hex digits, hence the zero padding.
        return '\\x%02x' % num, n + len_parsed
    # Any other escaped character stands for itself; the backslash is dropped.
    return source[n+1], n+2


#####TEST######

if __name__=='__main__':
    # Ad-hoc smoke test: paste JavaScript between the triple quotes, then run
    # this module directly to print the stripped source and the constants
    # dict returned by remove_constants. (Python 2 print statement.)
    test = ('''
    ''')

    t, d = remove_constants(test)
    print t, d