"""Extract JavaScript literals (strings, regexps, numbers) from source code.

``remove_constants`` replaces every literal with a placeholder identifier and
strips comments; ``recover_constants`` substitutes the placeholders back as
Python expressions.
"""
from __future__ import print_function

from string import ascii_lowercase, digits

##################################
# Placeholder name templates; ``%d`` receives the running constant number.
StringName = u'PyJsConstantString%d_'
NumberName = u'PyJsConstantNumber%d_'
RegExpName = u'PyJsConstantRegExp%d_'
##################################

ALPHAS = set(ascii_lowercase + ascii_lowercase.upper())
NUMS = set(digits)
# Characters accepted as regexp flags after the closing slash.
IDENTIFIER_START = ALPHAS.union(NUMS)
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}
HEX = set('0123456789abcdefABCDEF')
# NOTE: the star import is kept at this position on purpose - moving it above
# the constants could change which definition wins if utils exports the same
# names.
from utils import *
IDENTIFIER_PART = IDENTIFIER_PART.union({'.'})

# Characters after which a '/' starts a regexp literal rather than a division.
_REGEXP_MARKERS = frozenset('(+~"\'=[%:?!*^|&-,;/\\' + '{<>')
# Keywords after which a '/' starts a regexp literal rather than a division.
_REGEXP_KEYWORDS = frozenset([
    'return', 'typeof', 'instanceof', 'in', 'new', 'delete', 'void', 'throw',
    'case', 'do', 'else'
])


def _is_cancelled(source, n):
    """Return True if source[n] is escaped by an odd run of backslashes.

    The scan stops at the beginning of ``source`` instead of wrapping around
    to negative indices.
    """
    cancelled = False
    k = n - 1
    while k >= 0 and source[k] == '\\':
        cancelled = not cancelled
        k -= 1
    return cancelled


def _ensure_regexp(source, n):
    """Return True if the '/' at index n starts a regexp literal.

    This is a heuristic that distinguishes a regexp from a division by
    looking at the closest non-whitespace text before index n:

    * nothing before it, or an operator / opening bracket -> regexp,
    * a keyword such as ``return`` or ``typeof`` -> regexp,
    * anything else (identifier, number, ')' ...) -> division.

    Known limitation: a regexp directly after ')' or '}' is still reported
    as a division because those cases are ambiguous without a full parser.
    """
    k = n - 1
    while k >= 0 and source[k].isspace():
        k -= 1
    if k < 0:
        return True
    if source[k] in _REGEXP_MARKERS:
        return True
    # Collect the word that ends at k and check whether it is a keyword.
    word_end = k + 1
    while k >= 0 and (source[k].isalnum() or source[k] in '_$'):
        k -= 1
    word = source[k + 1:word_end]
    if word in _REGEXP_KEYWORDS:
        # ``obj.return / 2`` is a property access, hence a division.
        return k < 0 or source[k] != '.'
    return False


def parse_num(source, start, charset):
    """Return the first index >= start whose char is not in charset.

    Returns ``len(source)`` when every remaining char belongs to charset.
    """
    limit = len(source)
    while start < limit and source[start] in charset:
        start += 1
    return start


def parse_exponent(source, start):
    """Return the index just past an optional exponent starting at start.

    If there is no exponent at ``start`` the index is returned unchanged.
    Raises SyntaxError when the number is directly followed by an identifier
    character or when the exponent has no digits.
    """
    limit = len(source)
    if start >= limit:
        return start
    if source[start] not in ('e', 'E'):
        if source[start] in IDENTIFIER_PART:
            raise SyntaxError('Invalid number literal!')
        return start
    start += 1
    if start < limit and source[start] in ('-', '+'):
        start += 1
    digits_begin = start
    start = parse_num(source, start, NUMS)
    # At least one digit is required after the exponent marker.
    if start == digits_begin or (start < limit
                                 and source[start] in IDENTIFIER_PART):
        raise SyntaxError('Invalid number literal!')
    return start


def remove_constants(source):
    """Replace literals in JavaScript source with identifiers, drop comments.

    String, regexp and number literals are replaced by placeholder
    identifiers built from ``StringName``, ``RegExpName`` and ``NumberName``
    (for example ``PyJsConstantString0_``).  All constants share a single
    running counter.  Single line and multi line comments are removed.
    Pseudo comments inside strings are left alone, so the line

        var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"

    only has its string literal replaced as a whole.

    Returns a tuple ``(new_source, constants)`` where ``constants`` maps each
    placeholder identifier to the literal text it replaced (strings keep
    their quotes, regexps keep their slashes and flags).

    Raises SyntaxError for an unescaped line terminator inside a string or
    regexp literal and for malformed number literals.
    """
    # Padding guarantees that source[n - 1] and source[n + 1] always exist.
    source = ' ' + source + '\n'
    # Each entry is (start, end, type): 0 string, 1 comment, 2 regexp, 3 number
    comments = []
    inside_comment, single_comment = False, False
    inside_single, inside_double = False, False
    inside_regexp = False
    quiting_regexp = False
    regexp_class_count = 0
    n = 0
    while n < len(source):
        char = source[n]
        if char == '"' and not (inside_comment or inside_single
                                or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_double:
                    inside_double[1] = n + 1
                    comments.append(inside_double)
                    inside_double = False
                else:
                    inside_double = [n, None, 0]
        elif char == "'" and not (inside_comment or inside_double
                                  or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_single:
                    inside_single[1] = n + 1
                    comments.append(inside_single)
                    inside_single = False
                else:
                    inside_single = [n, None, 0]
        elif (inside_single or inside_double):
            if char in LINE_TERMINATOR:
                if _is_cancelled(source, n):
                    # Escaped line break (line continuation); CR LF counts
                    # as a single terminator.
                    if char == CR and source[n + 1] == LF:
                        n += 1
                    n += 1
                    continue
                else:
                    raise SyntaxError(
                        'Invalid string literal. Line terminators must be escaped!'
                    )
        else:
            if inside_comment:
                if single_comment:
                    if char in LINE_TERMINATOR:
                        inside_comment[1] = n
                        comments.append(inside_comment)
                        inside_comment = False
                        single_comment = False
                else:  # Multiline
                    # The closing '*' must not be the '*' of the opening
                    # '/*', otherwise '/*/' would count as a full comment.
                    if (char == '/' and source[n - 1] == '*'
                            and n - inside_comment[0] > 2):
                        inside_comment[1] = n + 1
                        comments.append(inside_comment)
                        inside_comment = False
            elif inside_regexp:
                if not quiting_regexp:
                    if char in LINE_TERMINATOR:
                        raise SyntaxError(
                            'Invalid regexp literal. Line terminators cant appear!'
                        )
                    if _is_cancelled(source, n):
                        n += 1
                        continue
                    if char == '[':
                        # Character classes do not nest: '[' inside a class
                        # is a literal, so this is a flag, not a depth.
                        regexp_class_count = 1
                    elif char == ']':
                        regexp_class_count = 0
                    elif char == '/' and not regexp_class_count:
                        quiting_regexp = True
                else:
                    # After the closing slash consume the flags; the first
                    # char that cannot be a flag ends the literal.
                    if char not in IDENTIFIER_START:
                        inside_regexp[1] = n
                        comments.append(inside_regexp)
                        inside_regexp = False
            elif char == '/' and source[n - 1] == '/':
                single_comment = True
                inside_comment = [n - 1, None, 1]
            elif char == '*' and source[n - 1] == '/':
                inside_comment = [n - 1, None, 1]
            elif char == '/' and source[n + 1] not in ('/', '*'):
                if not _ensure_regexp(source, n):
                    n += 1
                    continue  # Probably just a division
                quiting_regexp = False
                regexp_class_count = 0
                inside_regexp = [n, None, 2]
            elif not (inside_comment or inside_regexp):
                if (char in NUMS and
                        source[n - 1] not in IDENTIFIER_PART) or char == '.':
                    if char == '.':
                        k = parse_num(source, n + 1, NUMS)
                        if k == n + 1:  # a plain dot, not a number
                            n += 1
                            continue
                        k = parse_exponent(source, k)
                    elif char == '0' and source[n + 1] in {'x', 'X'}:
                        # Hex number
                        k = parse_num(source, n + 2, HEX)
                        if k == n + 2 or source[k] in IDENTIFIER_PART:
                            raise SyntaxError('Invalid hex literal!')
                    else:  # int, float, optionally with an exponent
                        k = parse_num(source, n + 1, NUMS)
                        if source[k] == '.':
                            k = parse_num(source, k + 1, NUMS)
                        k = parse_exponent(source, k)
                    comments.append((n, k, 3))
                    n = k
                    continue
        n += 1

    # Rebuild the source, swapping every recorded span for its placeholder.
    pieces = []
    start = 0
    count = 0
    constants = {}
    for end, next_start, typ in comments:
        pieces.append(source[start:end])
        start = next_start
        if typ == 0:  # String
            name = StringName
        elif typ == 1:  # comment
            continue
        elif typ == 2:  # regexp
            name = RegExpName
        elif typ == 3:  # number
            name = NumberName
        else:
            raise RuntimeError()
        identifier = name % count
        pieces.append(' ' + identifier + ' ')
        constants[identifier] = source[end:next_start]
        count += 1
    pieces.append(source[start:])
    res = ''.join(pieces)
    # Normalise white space and line terminators.
    for e in WHITE:
        res = res.replace(e, ' ')
    res = res.replace(CR + LF, '\n')
    for e in LINE_TERMINATOR:
        res = res.replace(e, '\n')
    return res.strip(), constants


def recover_constants(py_source, replacements):
    """Substitute placeholder identifiers with Python constant expressions.

    ``replacements`` maps identifiers to the literal text they stand for.
    Regexp placeholders become ``JsRegExp(<repr of literal>)``, string
    placeholders become ``Js(u<string literal>)`` and every other placeholder
    becomes ``Js(<literal>)``.  Returns the updated source.

    NOTE: one ``str.replace`` pass is made per constant, so the cost grows
    with (number of constants) x (source length).
    """
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for identifier, value in replacements.items():
        if identifier.startswith('PyJsConstantRegExp'):
            py_source = py_source.replace(identifier,
                                          'JsRegExp(%s)' % repr(value))
        elif identifier.startswith('PyJsConstantString'):
            py_source = py_source.replace(
                identifier, 'Js(u%s)' % unify_string_literals(value))
        else:
            py_source = py_source.replace(identifier, 'Js(%s)' % value)
    return py_source


def unify_string_literals(js_string):
    r"""Rewrite a JavaScript string literal so Python reads the same value.

    JavaScript ignores the backslash of an unknown escape (the literal '\d'
    is just 'd') while Python keeps it, so every escape sequence is passed
    through ``do_escape`` and all other characters are copied unchanged.
    """
    pieces = []
    n = 0
    limit = len(js_string)
    while n < limit:
        char = js_string[n]
        if char == '\\':
            new, n = do_escape(js_string, n)
            pieces.append(new)
        else:
            pieces.append(char)
            n += 1
    return ''.join(pieces)


def unify_regexp_literals(js):
    """Placeholder; not implemented, returns None."""
    pass


def do_escape(source, n):
    """Translate the JavaScript escape sequence starting at source[n].

    ``source[n]`` must be a backslash.  Returns ``(text, next_index)`` where
    ``text`` is the Python-source equivalent of the escape and ``next_index``
    is the index of the first character after it.

    Raises SyntaxError for a truncated hex or unicode escape.
    """
    if not n + 1 < len(source):
        # Lone trailing backslash: drop it, but still return a pair because
        # callers unpack the result.
        return '', n + 1
    following = source[n + 1]
    if following in LINE_TERMINATOR:
        # Line continuation; keep CR LF together.
        if following == CR and n + 2 < len(source) and source[n + 2] == LF:
            return source[n:n + 3], n + 3
        return source[n:n + 2], n + 2
    if following in ESCAPE_CHARS:
        return source[n:n + 2], n + 2
    if following in {'x', 'u'}:
        length = 4 if following == 'u' else 2
        digits_begin = n + 2
        digits_end = parse_num(source, digits_begin, HEX)
        if digits_end - digits_begin < length:
            raise SyntaxError('Invalid escape sequence!')
        stop = digits_begin + length
        return source[n:stop], stop
    if following in OCTAL:
        n += 1
        end = parse_num(source, n, OCTAL)
        end = min(end, n + 3)  # cant be longer than 3
        # now the max allowed is 377 ( in octal) and 255 in decimal
        max_num = 255
        num = 0
        len_parsed = 0
        for e in source[n:end]:
            cand = 8 * num + int(e)
            if cand > max_num:
                break
            num = cand
            len_parsed += 1
        # Emit a fixed-width hex escape: python could otherwise parse more
        # digits than javascript does (python reads '\\777' as a whole while
        # js uses only \\77), and '\\x' needs exactly two hex digits.
        return '\\x%02x' % num, n + len_parsed
    # Unknown escape: javascript ignores the backslash.
    return following, n + 2


#####TEST######

if __name__ == '__main__':
    test = (''' ''')

    t, d = remove_constants(test)
    print(t, d)