# bazarr/libs/js2py/legecy_translators/constants.py
# (310 lines, 11 KiB, Python)

from __future__ import print_function
from string import ascii_lowercase, digits
##################################
# Placeholder name templates. remove_constants() fills in %d with a running
# counter and recover_constants() recognises the placeholders by prefix.
StringName = u'PyJsConstantString%d_'
NumberName = u'PyJsConstantNumber%d_'
RegExpName = u'PyJsConstantRegExp%d_'
##################################
# Character classes used by the scanners in this module.
ALPHAS = set(ascii_lowercase + ascii_lowercase.upper())  # ASCII letters, both cases
NUMS = set(digits)  # decimal digits
# Letters plus digits; remove_constants() uses this set to find where the
# flags that trail a regexp literal end.
IDENTIFIER_START = ALPHAS.union(NUMS)
# Characters that do_escape() copies through verbatim (together with the
# backslash in front of them).
ESCAPE_CHARS = {'n', '0', 'b', 'f', 'r', 't', 'v', '"', "'", '\\'}
OCTAL = {'0', '1', '2', '3', '4', '5', '6', '7'}  # octal digits
HEX = set('0123456789abcdefABCDEF')  # hexadecimal digits, either case
from utils import *
# NOTE(review): IDENTIFIER_PART is presumably provided by the star import
# above (it is not defined in this file) -- confirm. It is widened here so
# that '.' also counts as an identifier character for the number checks.
IDENTIFIER_PART = IDENTIFIER_PART.union({'.'})
def _is_cancelled(source, n):
cancelled = False
k = 0
while True:
k += 1
if source[n - k] != '\\':
break
cancelled = not cancelled
return cancelled
def _ensure_regexp(source, n): #<- this function has to be improved
'''returns True if regexp starts at n else returns False
checks whether it is not a division '''
markers = '(+~"\'=[%:?!*^|&-,;/\\'
k = 0
while True:
k += 1
if n - k < 0:
return True
char = source[n - k]
if char in markers:
return True
if char != ' ' and char != '\n':
break
return False
def parse_num(source, start, charset):
    """Return the first index >= start whose character is not in charset.

    If every character from start onwards belongs to charset the length of
    source is returned; if start is already past the end it is returned
    unchanged."""
    for pos in range(start, len(source)):
        if source[pos] not in charset:
            return pos
    return max(start, len(source))
def parse_exponent(source, start):
    """Return the index just past an optional exponent beginning at start.

    When source[start] is not 'e' or 'E' there is no exponent and start is
    returned as is. Otherwise an optional sign and at least one digit must
    follow. SyntaxError is raised if the digits are missing, or if the
    number is immediately followed by a character from IDENTIFIER_PART."""
    pos = start
    if source[pos] not in ('e', 'E'):
        # No exponent at all; just make sure the literal ends cleanly.
        if source[pos] in IDENTIFIER_PART:
            raise SyntaxError('Invalid number literal!')
        return pos
    pos += 1
    if source[pos] in ('-', '+'):
        pos += 1
    digits_begin = pos
    while source[pos] in NUMS:
        pos += 1
    # At least one digit is required after the exponent marker.
    if pos == digits_begin or source[pos] in IDENTIFIER_PART:
        raise SyntaxError('Invalid number literal!')
    return pos
def remove_constants(source):
    '''Replace string, regexp and number literals in JavaScript source with
    placeholder identifiers and *remove comments*.

    Placeholders are built from the module-level name templates:
        PyJsConstantStringN_ - for strings
        PyJsConstantRegExpN_ - for regexps
        PyJsConstantNumberN_ - for numbers
    where N is a single counter shared by all three kinds.

    Single line and multiline comments are removed from the source.
    Pseudo comments (inside strings) will not be removed.
    For example this line:
        var x = "/*PSEUDO COMMENT*/ TEXT //ANOTHER PSEUDO COMMENT"
    will be unaltered (apart from the string becoming a placeholder).

    Returns a tuple (new_source, constants) where constants maps every
    placeholder to the exact text of the literal it replaced.

    Raises SyntaxError for an unescaped line terminator inside a string,
    a line terminator inside a regexp, or a malformed number literal.'''
    # Pad both ends so the source[n - 1] / source[n + 1] neighbour checks
    # below always have a character to look at.
    source = ' ' + source + '\n'
    # Collected spans as (start, end, kind):
    # kind 0 = string, 1 = comment, 2 = regexp, 3 = number.
    comments = []
    inside_comment, single_comment = False, False
    inside_single, inside_double = False, False
    inside_regexp = False
    regexp_class_count = 0
    n = 0
    while n < len(source):
        char = source[n]
        if char == '"' and not (inside_comment or inside_single
                                or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_double:
                    # Closing quote: record the finished string span.
                    inside_double[1] = n + 1
                    comments.append(inside_double)
                    inside_double = False
                else:
                    inside_double = [n, None, 0]
        elif char == "'" and not (inside_comment or inside_double
                                  or inside_regexp):
            if not _is_cancelled(source, n):
                if inside_single:
                    inside_single[1] = n + 1
                    comments.append(inside_single)
                    inside_single = False
                else:
                    inside_single = [n, None, 0]
        elif (inside_single or inside_double):
            if char in LINE_TERMINATOR:
                if _is_cancelled(source, n):
                    # Escaped line break inside a string; CR LF counts as
                    # a single terminator.
                    if char == CR and source[n + 1] == LF:
                        n += 1
                    n += 1
                    continue
                else:
                    raise SyntaxError(
                        'Invalid string literal. Line terminators must be escaped!'
                    )
        else:
            if inside_comment:
                if single_comment:
                    if char in LINE_TERMINATOR:
                        # The terminator itself stays in the output.
                        inside_comment[1] = n
                        comments.append(inside_comment)
                        inside_comment = False
                        single_comment = False
                else:  # Multiline
                    # The '*' that closes the comment must not be the one
                    # that opened it, otherwise '/*/' would be taken for a
                    # complete comment.
                    if (char == '/' and source[n - 1] == '*'
                            and n - 1 > inside_comment[0] + 1):
                        inside_comment[1] = n + 1
                        comments.append(inside_comment)
                        inside_comment = False
            elif inside_regexp:
                if not quiting_regexp:
                    if char in LINE_TERMINATOR:
                        raise SyntaxError(
                            'Invalid regexp literal. Line terminators cant appear!'
                        )
                    if _is_cancelled(source, n):
                        n += 1
                        continue
                    if char == '[':
                        regexp_class_count += 1
                    elif char == ']':
                        regexp_class_count = max(regexp_class_count - 1, 0)
                    elif char == '/' and not regexp_class_count:
                        # Closing slash seen; what follows are the flags.
                        quiting_regexp = True
                else:
                    # Consuming flags; the first character that is neither
                    # a letter nor a digit ends the literal.
                    if char not in IDENTIFIER_START:
                        inside_regexp[1] = n
                        comments.append(inside_regexp)
                        inside_regexp = False
            elif char == '/' and source[n - 1] == '/':
                single_comment = True
                inside_comment = [n - 1, None, 1]
            elif char == '*' and source[n - 1] == '/':
                inside_comment = [n - 1, None, 1]
            elif char == '/' and source[n + 1] not in ('/', '*'):
                if not _ensure_regexp(source, n):  #<- improve this one
                    n += 1
                    continue  # Probably just a division
                quiting_regexp = False
                inside_regexp = [n, None, 2]
            elif not (inside_comment or inside_regexp):
                if (char in NUMS and
                        source[n - 1] not in IDENTIFIER_PART) or char == '.':
                    if char == '.':
                        k = parse_num(source, n + 1, NUMS)
                        if k == n + 1:  # a plain dot, not a number
                            n += 1
                            continue
                        k = parse_exponent(source, k)
                    elif char == '0' and source[n + 1] in {'x', 'X'}:
                        # Hex number probably
                        k = parse_num(source, n + 2, HEX)
                        if k == n + 2 or source[k] in IDENTIFIER_PART:
                            raise SyntaxError('Invalid hex literal!')
                    else:  # int, float, either with optional exponent
                        k = parse_num(source, n + 1, NUMS)
                        if source[k] == '.':
                            k = parse_num(source, k + 1, NUMS)
                        k = parse_exponent(source, k)
                    comments.append((n, k, 3))
                    n = k
                    continue
        n += 1
    # Stitch the output together: untouched text between spans, plus a
    # space-padded placeholder for every span that is not a comment.
    pieces = []
    start = 0
    count = 0
    constants = {}
    for end, next_start, typ in comments:
        pieces.append(source[start:end])
        start = next_start
        if typ == 0:  # String
            name = StringName
        elif typ == 1:  # comment
            continue
        elif typ == 2:  # regexp
            name = RegExpName
        elif typ == 3:  # number
            name = NumberName
        else:
            raise RuntimeError()
        placeholder = name % count
        pieces.append(' ' + placeholder + ' ')
        constants[placeholder] = source[end:next_start]
        count += 1
    pieces.append(source[start:])
    res = ''.join(pieces)
    # Normalise white space and line terminators.
    for e in WHITE:
        res = res.replace(e, ' ')
    res = res.replace(CR + LF, '\n')
    for e in LINE_TERMINATOR:
        res = res.replace(e, '\n')
    return res.strip(), constants
def recover_constants(py_source, replacements):
    '''Expand constant placeholders in translated source back into
    PyJs constructor calls.

    py_source is the translated Python source text. replacements maps
    placeholder identifiers (as produced by remove_constants) to the text
    of the original JavaScript literal.

    Each placeholder found in py_source is rewritten according to its
    prefix:
        PyJsConstantRegExp...  ->  JsRegExp(<repr of the literal>)
        PyJsConstantString...  ->  Js(u<unified string literal>)
        anything else          ->  Js(<literal text>)
    For example a number placeholder whose literal is 5 becomes Js(5).

    The substitution is done in one pass over py_source, so text that has
    just been inserted is never scanned again. A literal that happens to
    contain something that looks like another placeholder stays intact.

    Returns the rewritten source.'''
    import re  # local import: keeps the module's import block untouched

    if not replacements:
        return py_source

    def expand(match):
        identifier = match.group(0)
        value = replacements[identifier]
        if identifier.startswith('PyJsConstantRegExp'):
            return 'JsRegExp(%s)' % repr(value)
        if identifier.startswith('PyJsConstantString'):
            return 'Js(u%s)' % unify_string_literals(value)
        return 'Js(%s)' % value

    # Longest keys first so that a key can never be shadowed by a shorter
    # key that happens to be its prefix.
    keys = sorted(replacements, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))
    # A callable replacement is used so that backslashes in the literals
    # are inserted verbatim instead of being treated as group references.
    return pattern.sub(expand, py_source)
def unify_string_literals(js_string):
    r"""Rewrite a JavaScript string literal so that Python reads it the
    same way JavaScript would.

    For example the literal '\d' means 'd' in JavaScript (the backslash is
    ignored), while Python would read it as '\\d'. Every backslash
    sequence is handed to do_escape(); all other characters are copied
    unchanged."""
    pieces = []
    pos = 0
    total = len(js_string)
    while pos < total:
        current = js_string[pos]
        if current != '\\':
            pieces.append(current)
            pos += 1
            continue
        # do_escape reports both the translated text and where to resume.
        translated, pos = do_escape(js_string, pos)
        pieces.append(translated)
    return ''.join(pieces)
def unify_regexp_literals(js):
    """Placeholder: currently does nothing with js and returns None."""
    return None
def do_escape(source, n):
    r"""Translate the JavaScript escape sequence that starts at source[n]
    into text suitable for a Python string literal.

    Its actually quite complicated to cover every case :)
    http://www.javascriptkit.com/jsref/escapesequence.shtml

    source[n] is taken to be the backslash opening the sequence (this is
    not checked).

    Returns a pair (text, next_index): the replacement text and the index
    of the first character after the escape sequence.
      * backslash + line terminator (CR LF counted as one) and the simple
        escapes in ESCAPE_CHARS are copied through unchanged,
      * \xHH and \uHHHH are copied through unchanged,
      * octal escapes (at most 3 digits, value at most 255) are re-emitted
        as a two digit \xHH escape,
      * any other escaped character is returned without its backslash,
      * a backslash that is the last character of source yields ''.

    Raises SyntaxError when \x or \u is followed by too few hex digits."""
    if not n + 1 < len(source):
        # Lone trailing backslash. Not possible for a well formed literal,
        # but still return a pair so callers can always unpack the result.
        return '', n + 1
    if source[n + 1] in LINE_TERMINATOR:
        if source[n + 1] == CR and n + 2 < len(source) and source[n + 2] == LF:
            return source[n:n + 3], n + 3
        return source[n:n + 2], n + 2
    if source[n + 1] in ESCAPE_CHARS:
        return source[n:n + 2], n + 2
    if source[n + 1] in {'x', 'u'}:
        length = 4 if source[n + 1] == 'u' else 2
        n += 2
        end = parse_num(source, n, HEX)
        if end - n < length:
            raise SyntaxError('Invalid escape sequence!')
        # Backslash, marker and exactly `length` hex digits, verbatim.
        return source[n - 2:n + length], n + length
    if source[n + 1] in OCTAL:
        n += 1
        end = parse_num(source, n, OCTAL)
        end = min(end, n + 3)  # cant be longer than 3
        # now the max allowed is 377 ( in octal) and 255 in decimal
        max_num = 255
        num = 0
        len_parsed = 0
        for e in source[n:end]:
            cand = 8 * num + int(e)
            if cand > max_num:
                break
            num = cand
            len_parsed += 1
        # we have to return in a different form because python may want to parse more...
        # for example '\777' will be parsed by python as a whole while js will use only \77
        # Always two hex digits: Python rejects a one digit escape such as '\x5'.
        return '\\x%02x' % num, n + len_parsed
    return source[n + 1], n + 2
#####TEST######
# Manual smoke test: run remove_constants on a sample snippet and print
# the stripped source together with the extracted constants.
if __name__ == '__main__':
    sample = ('''
''')
    stripped, table = remove_constants(sample)
    print(stripped, table)