diff --git a/lslopt/lsldeadcode.py b/lslopt/lsldeadcode.py index 49ee4a6..aec0ac7 100644 --- a/lslopt/lsldeadcode.py +++ b/lslopt/lsldeadcode.py @@ -19,6 +19,7 @@ from lslopt import lslfuncs from lslopt.lslcommon import nr +from strutil import xrange class deadcode(object): @@ -530,7 +531,7 @@ class deadcode(object): self.MarkReferences(statedef) # Track removal of global lines, to reasign locations later. - LocMap = range(len(self.tree)) + LocMap = list(range(len(self.tree))) GlobalDeletions = [] diff --git a/lslopt/lslfoldconst.py b/lslopt/lslfoldconst.py index 69fa504..5e87250 100644 --- a/lslopt/lslfoldconst.py +++ b/lslopt/lslfoldconst.py @@ -23,6 +23,7 @@ from lslopt import lslfuncs from lslopt.lslfuncs import ZERO_VECTOR, ZERO_ROTATION import math from lslopt.lslfuncopt import OptimizeFunc, OptimizeArgs, FuncOptSetup +from strutil import xrange, unicode # TODO: Remove special handling of @ within IF,WHILE,FOR,DO diff --git a/lslopt/lsllastpass.py b/lslopt/lsllastpass.py index 81d8a31..5f77874 100644 --- a/lslopt/lsllastpass.py +++ b/lslopt/lsllastpass.py @@ -25,6 +25,7 @@ from lslopt.lslcommon import nr #import math #from lslparse import warning #from lslfuncopt import OptimizeFunc, OptimizeArgs, FuncOptSetup +from strutil import xrange class rec: def __init__(self, **init): diff --git a/lslopt/lslloadlib.py b/lslopt/lslloadlib.py index 9d3a2ab..58fecb6 100644 --- a/lslopt/lslloadlib.py +++ b/lslopt/lslloadlib.py @@ -20,6 +20,7 @@ import sys, re from lslopt.lslcommon import types, warning, Vector, Quaternion from lslopt import lslcommon, lslfuncs +from strutil import * def LoadLibrary(builtins = None, fndata = None): """Load builtins.txt and fndata.txt (or the given filenames) and return @@ -40,27 +41,27 @@ def LoadLibrary(builtins = None, fndata = None): # Library read code parse_lin_re = re.compile( - br'^\s*([a-z]+)\s+' - br'([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*(' - br'[a-z]+\s+[a-zA-Z_][a-zA-Z0-9_]*' - br'(?:\s*,\s*[a-z]+\s+[a-zA-Z_][a-zA-Z0-9_]*)*' - br')?\s*\)\s*$' - br'|' - br'^\s*const\s+([a-z]+)' - br'\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*(.*?)\s*$' - br'|' - br'^\s*(?:#.*|//.*)?$') - parse_arg_re = re.compile(br'^\s*([a-z]+)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*$') - parse_fp_re = re.compile(br'^\s*(-?(?=[0-9]|\.[0-9])[0-9]*' - br'((?:\.[0-9]*)?(?:[Ee][+-]?[0-9]+)?))\s*$') - parse_int_re = re.compile(br'^\s*(-?0x[0-9A-Fa-f]+|-?[0-9]+)\s*$') + r'^\s*([a-z]+)\s+' + r'([a-zA-Z_][a-zA-Z0-9_]*)\s*\(\s*(' + r'[a-z]+\s+[a-zA-Z_][a-zA-Z0-9_]*' + r'(?:\s*,\s*[a-z]+\s+[a-zA-Z_][a-zA-Z0-9_]*)*' + r')?\s*\)\s*$' + r'|' + r'^\s*const\s+([a-z]+)' + r'\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*=\s*(.*?)\s*$' + r'|' + r'^\s*(?:#.*|//.*)?$') + parse_arg_re = re.compile(r'^\s*([a-z]+)\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*$') + parse_fp_re = re.compile(r'^\s*(-?(?=[0-9]|\.[0-9])[0-9]*' + r'((?:\.[0-9]*)?(?:[Ee][+-]?[0-9]+)?))\s*$') + parse_int_re = re.compile(r'^\s*(-?0x[0-9A-Fa-f]+|-?[0-9]+)\s*$') parse_str_re = re.compile(u'^"((?:[^"\\\\]|\\\\.)*)"$') - f = open(builtins, 'rb') + f = open(builtins, 'r') try: linenum = 0 try: - ubuiltins = builtins.decode(sys.getfilesystemencoding()) + ubuiltins = str2u(builtins, sys.getfilesystemencoding()) except UnicodeDecodeError: # This is just a guess at the filename encoding. ubuiltins = builtins.decode('iso-8859-15') @@ -70,7 +71,7 @@ def LoadLibrary(builtins = None, fndata = None): if not line: break if line[-1] == '\n': line = line[:-1] try: - uline = line.decode('utf8') + uline = str2u(line, 'utf8') except UnicodeDecodeError: warning(u"Bad Unicode in %s line %d" % (ubuiltins, linenum)) continue @@ -153,7 +154,7 @@ def LoadLibrary(builtins = None, fndata = None): elif typ == 'float': value = lslfuncs.F32(float(value)) elif typ == 'string': - value = value.decode('utf8') + value = str2u(value, 'utf8') if parse_str_re.search(value): esc = False tmp = value[1:-1] @@ -242,14 +243,14 @@ def LoadLibrary(builtins = None, fndata = None): # TODO: "quaternion" doesn't compare equal to "rotation" even if they are # equivalent. Canonicalize it before comparison, to avoid false # reports of mismatches. - f = open(fndata, 'rb') + f = open(fndata, 'r') try: linenum = 0 curr_fn = None curr_ty = None skipping = False try: - ufndata = fndata.decode(sys.getfilesystemencoding()) + ufndata = str2u(fndata, sys.getfilesystemencoding()) except UnicodeDecodeError: # This is just a guess at the filename encoding. ufndata = fndata.decode('iso-8859-15') @@ -259,7 +260,7 @@ def LoadLibrary(builtins = None, fndata = None): if not line: break if line[-1] == '\n': line = line[:-1] try: - uline = line.decode('utf8') + uline = str2u(line, 'utf8') except UnicodeDecodeError: warning(u"Bad Unicode in %s line %d" % (ufndata, linenum)) continue @@ -272,7 +273,7 @@ def LoadLibrary(builtins = None, fndata = None): if match_fn and (rettype in ('void', 'event') or rettype in types): skipping = True # until proven otherwise name = match_fn.group(2) - uname = name.decode('utf8') + uname = str2u(name, 'utf8') if (rettype == 'event' and name not in events or rettype != 'event' and name not in functions ): @@ -347,7 +348,7 @@ def LoadLibrary(builtins = None, fndata = None): skipping = True continue if not skipping: - ucurr_fn = curr_fn.decode('utf8') + ucurr_fn = str2u(curr_fn, 'utf8') if match_flag.group(1): # SEF # We don't handle conditions yet. Take the @@ -438,7 +439,7 @@ def LoadLibrary(builtins = None, fndata = None): # Post-checks for i in functions: - ui = i.decode('utf8') + ui = str2u(i, 'utf8') if 'NeedsData' in functions[i]: del functions[i]['NeedsData'] warning(u"Library data, file %s: Function %s has no data." @@ -455,7 +456,7 @@ def LoadLibrary(builtins = None, fndata = None): u" delay. Removing SEF." % ui) del functions[i]['SEF'] for i in events: - ui = i.decode('utf8') + ui = str2u(i, 'utf8') if 'NeedsData' in events[i]: del events[i]['NeedsData'] warning(u"Library data, file %s: Event %s has no data." diff --git a/lslopt/lsloutput.py b/lslopt/lsloutput.py index 136e2a5..38c5e67 100644 --- a/lslopt/lsloutput.py +++ b/lslopt/lsloutput.py @@ -21,6 +21,7 @@ from lslopt import lslfuncs from lslopt import lslcommon from lslopt.lslcommon import Key, Vector, Quaternion, warning from math import copysign +from strutil import * debugScopes = False @@ -62,7 +63,7 @@ class outscript(object): " spaces by the viewer when copy-pasting the code" " (disable this warning by disabling the 'warntabs'" " option).") - return pfx + '"' + value.encode('utf8').replace('\\','\\\\') \ + return pfx + '"' + any2str(value, 'utf8').replace('\\','\\\\') \ .replace('"','\\"').replace('\n','\\n') + '"' + sfx if tvalue == int: if value < 0 and not self.globalmode and self.optsigns: diff --git a/lslopt/lslparse.py b/lslopt/lslparse.py index 2cf9ad4..1935908 100644 --- a/lslopt/lslparse.py +++ b/lslopt/lslparse.py @@ -29,6 +29,10 @@ import re # Note this module was basically written from bottom to top, which may help # reading it. +WHITESPACE_CHARS = frozenset({' ', '\r', '\n', '\x0B', '\x0C'}) +SINGLE_SYMBOLS = frozenset({'.', ';', '{', '}', ',', '=', '(', ')', '-', '+', + '*', '/', '%', '@', ':', '<', '>', '[', ']', '&', '|', '^', '~', '!'}) + def isdigit(c): return '0' <= c <= '9' @@ -48,7 +52,7 @@ def GetErrLineCol(parser): # Find start of current line lstart = parser.script.rfind('\n', 0, errorpos) + 1 # Find zero-based column number in characters - cno = len(parser.script[lstart:errorpos].decode('utf8')) + cno = len(any2u(parser.script[lstart:errorpos], 'utf8')) # Find in #line directives list i = len(parser.linedir) filename = '' # value to return if there's no #line before lno @@ -75,7 +79,7 @@ class EParse(Exception): if parser.emap and filename == '': filename = parser.filename - filename = (filename.decode('utf8', 'replace') + filename = (str2u(filename, 'utf8') .replace(u'\\', u'\\\\') .replace(u'"', u'\\"') ) @@ -543,7 +547,7 @@ class parser(object): # self.linestart is related to the preprocessor, therefore we # check the characters that are relevant for standard C. - if c not in ' \n\r\x0B\x0C': + if c not in WHITESPACE_CHARS: self.linestart = False # Process strings @@ -584,7 +588,7 @@ class parser(object): if is_string: self.pos += 1 - return ('STRING_VALUE', lslfuncs.zstr(strliteral.decode('utf8'))) + return ('STRING_VALUE', lslfuncs.zstr(str2u(strliteral, 'utf8'))) # fall through (to consider the L or to ignore the ") if isalpha_(c): @@ -705,7 +709,7 @@ class parser(object): return (self.script[self.pos-3:self.pos],) return (self.script[self.pos-2:self.pos],) - if c in '.;{},=()-+*/%@:<>[]&|^~!' and c != '': + if c in SINGLE_SYMBOLS: return (c,) if c == '\n': @@ -2801,8 +2805,7 @@ list lazy_list_set(list L, integer i, list v) self.filename = filename - if type(script) is unicode: - script = script.encode('utf8') + script = any2str(script, 'utf8') self.script = script self.length = len(script) diff --git a/lslopt/lslrenamer.py b/lslopt/lslrenamer.py index f3a7073..32946c8 100644 --- a/lslopt/lslrenamer.py +++ b/lslopt/lslrenamer.py @@ -23,6 +23,8 @@ # # A side effect of this change is that the script becomes unreadable gibberish. +from strutil import xrange + class renamer(object): CharSet1 = '_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' CharSet2 = '0123456789_ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' diff --git a/main.py b/main.py index d9274a0..256452c 100755 --- a/main.py +++ b/main.py @@ -188,15 +188,15 @@ def PreparePreproc(script): def ScriptHeader(script, avname): if avname: - avname = b' - ' + avname - return (b'//start_unprocessed_text\n/*' + avname = ' - ' + avname + return ('//start_unprocessed_text\n/*' # + re.sub(r'([*/])(?=[*|/])', r'\1|', script) # FS's algorithm # HACK: This won't break strings containing ** or /* or // like URLs, # while still being compatible with FS. - + re.sub(br'([*/]\||\*(?=/))', br'\1|', script) - + b'*/\n//end_unprocessed_text\n//nfo_preprocessor_version 0\n' - b'//program_version LSL PyOptimizer v' + str2b(VERSION) - + str2b(avname) + b'\n//mono\n\n') + + re.sub(r'([*/]\||\*(?=/))', r'\1|', script) + + '*/\n//end_unprocessed_text\n//nfo_preprocessor_version 0\n' + '//program_version LSL PyOptimizer v' + VERSION + + avname + '\n//mono\n\n') def Usage(progname, about = None): if about is None: @@ -453,7 +453,7 @@ def main(argv): if chgfix[1:] not in validoptions: Usage(argv[0], 'optimizer-options') werr(u"\nError: Unrecognized" - u" optimizer option: %s\n" % chg.decode('utf8')) + u" optimizer option: %s\n" % str2u(chg, 'utf8')) return 1 if chgfix[0] == '-': options.discard(chgfix[1:]) @@ -591,6 +591,28 @@ def main(argv): f.close() del f + # Transform to str and check Unicode validity + if type(script) is unicode: + script = u2str(script, 'utf8') + else: + try: + # Try converting the script to Unicode, to report any encoding + # errors with accurate line information. + tmp = UniConvScript(script, options, + fname if fname != '-' else '', + emap).to_unicode() + # For Python 2, just report any errors and ignore the result. + # For Python 3, use the Unicode. + if python3: + script = tmp + del tmp + except EParse as e: + # We don't call ReportError to prevent problems due to + # displaying invalid UTF-8 + werr(e.args[0] + u"\n") + return 1 + # Now script is in native str format. + if script_header: script_header = ScriptHeader(script, avname) @@ -598,7 +620,7 @@ def main(argv): import time tmp = time.time() script_timestamp = time.strftime( - b'// Generated on %Y-%m-%dT%H:%M:%S.{0:06d}Z\n' + '// Generated on %Y-%m-%dT%H:%M:%S.{0:06d}Z\n' .format(int(tmp % 1 * 1000000)), time.gmtime(tmp)) del tmp @@ -642,27 +664,11 @@ def main(argv): # Append user arguments at the end to allow them to override defaults preproc_cmdline += preproc_user_postargs - # Transform to bytes and check Unicode validity - if type(script) is unicode: - script = script.encode('utf8') - else: - try: - # Try converting the script to Unicode, to report any encoding - # errors with accurate line information. At this point we don't - # need the result. - UniConvScript(script, options, - fname if fname != '-' else '', - emap).to_unicode() - except EParse as e: - # We don't call ReportError to prevent problems due to - # displaying invalid UTF-8 - werr(e.args[0] + u"\n") - return 1 - if preproc != 'none': + # PreparePreproc uses and returns Unicode string encoding. + script = u2b(PreparePreproc(any2u(script, 'utf8')), 'utf8') # At this point, for the external preprocessor to work we need the # script as a byte array, not as unicode, but it should be UTF-8. - script = PreparePreproc(script.decode('utf8')).encode('utf8') if preproc == 'mcpp': # As a special treatment for mcpp, we force it to output its # macros so we can read if USE_xxx are defined. With GCC that @@ -680,6 +686,8 @@ def main(argv): return status del p, status + script = any2str(script, 'utf8') + # This method is very imperfect, in several senses. However, since # it's applied to the output of the preprocessor, all of the # concerns should be addressed: @@ -687,13 +695,13 @@ def main(argv): # - Comments preceding the directive should not cause problems. # e.g.: /* test */ #directive # - #directive within a comment or string should be ignored. - for x in re.findall(br'(?:(?<=\n)|^)\s*#\s*define\s+(' - br'USE_SWITCHES' - br'|USE_LAZY_LISTS' - br')(?:$|[^A-Za-z0-9_])', script, re.S): - if x == b'USE_SWITCHES': + for x in re.findall(r'(?:(?<=\n)|^)\s*#\s*define\s+(' + r'USE_SWITCHES' + r'|USE_LAZY_LISTS' + r')(?:$|[^A-Za-z0-9_])', script, re.S): + if x == 'USE_SWITCHES': options.add('enableswitch') - elif x == b'USE_LAZY_LISTS': + elif x == 'USE_LAZY_LISTS': options.add('lazylists') if not preshow: @@ -703,9 +711,10 @@ def main(argv): lib = lslopt.lslloadlib.LoadLibrary(builtins, libdata) p = parser(lib) + assert type(script) == str try: ts = p.parse(script, options, - fname if fname != '-' else '') + 'stdin' if fname == '-' else fname) except EParse as e: ReportError(script, e) return 1 diff --git a/run-tests.py b/run-tests.py index 0c6e12f..c420c41 100755 --- a/run-tests.py +++ b/run-tests.py @@ -213,9 +213,9 @@ def invokeMain(argv, stdin = None): lslcommon.IsCalc = False lslcommon.Bugs.clear() lslcommon.Bugs.add(6495) - save_stdin = sys.stdin - save_stdout = sys.stdout - save_stderr = sys.stderr + lslcommon.save_stdin = sys.stdin + lslcommon.save_stdout = sys.stdout + lslcommon.save_stderr = sys.stderr stdout_output = None stderr_output = None try: @@ -231,9 +231,9 @@ def invokeMain(argv, stdin = None): stdout_output = sys.stdout.getvalue() stderr_output = sys.stderr.getvalue() finally: - sys.stdin = save_stdin - sys.stdout = save_stdout - sys.stderr = save_stderr + sys.stdin = lslcommon.save_stdin + sys.stdout = lslcommon.save_stdout + sys.stderr = lslcommon.save_stderr lslcommon.LSO = False lslcommon.IsCalc = False lslcommon.Bugs.clear() @@ -721,10 +721,9 @@ def generateScriptTests(): try: if expected_stderr.startswith(b'REGEX\n'): - self.assertIsNotNone( - re.search(expected_stderr[6:], - actual_stderr.decode('utf8') - ) + self.assertIsNotNone(re.search( + b2u(expected_stderr[6:], 'utf8'), + b2u(actual_stderr, 'utf8')) ) else: self.assertTrue(expected_stderr == actual_stderr) @@ -734,6 +733,7 @@ def generateScriptTests(): werr(expected_stderr) werr(u'\n************ actual stderr:\n') werr(actual_stderr) +# werr(('1' if difflib else '0')+('1' if expected_stderr else '0') + ('1' if actual_stderr else '0')) if difflib and expected_stderr and actual_stderr \ and not expected_stderr.startswith(b'REGEX\n'): werr(u'\n************ diff:\n' @@ -746,8 +746,9 @@ def generateScriptTests(): raise try: if expected_stdout.startswith(b'REGEX\n'): - self.assertIsNotNone(re.search(expected_stdout[6:], - actual_stdout)) + self.assertIsNotNone(re.search( + b2u(expected_stdout[6:], 'utf8'), + b2u(actual_stdout, 'utf8'))) else: self.assertTrue(expected_stdout == actual_stdout) except AssertionError: diff --git a/unit_tests/coverage.suite/invalid-file.err b/unit_tests/coverage.suite/invalid-file.err index be3f8c7..f5f09d7 100644 --- a/unit_tests/coverage.suite/invalid-file.err +++ b/unit_tests/coverage.suite/invalid-file.err @@ -1,2 +1,2 @@ REGEX -IOError: (?:\[Errno 21\] Is a directory|\[Errno 13\] Permission denied): 'unit_tests/coverage.suite/actually-a-dir.d' +Error: (?:\[Errno 21\] Is a directory|\[Errno 13\] Permission denied): 'unit_tests/coverage.suite/actually-a-dir.d'