From ed05a2e0225ddd6bd9f3e86c436c98306827e56c Mon Sep 17 00:00:00 2001
From: Sei Lisa <sei-lisa@email.fake>
Date: Sun, 26 Nov 2017 14:10:33 +0100
Subject: [PATCH] Make PreparePreproc Unicode-aware.

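Fixes column number mismatches in the output after a multiline string
whose last line contains non-ASCII Unicode characters. PreparePreproc
used to receive the script as a UTF-8 byte string, so len() counted
bytes rather than characters when computing the padding column. The
script is now decoded to unicode before the call and re-encoded as
UTF-8 afterwards, and the regexes and literals inside PreparePreproc
are unicode accordingly.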
---
 main.py | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/main.py b/main.py
index f24ed43..95b707f 100755
--- a/main.py
+++ b/main.py
@@ -106,27 +106,27 @@ def PreparePreproc(script):
     # instead of reproducing that C quirk. This also matches what FS is doing
     # currently, so it's good for compatibility.
     tok = re.compile(
-        r'(?:'
-            r'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/'
-            r'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n'
-            r'|[^"]'
-        r')+'
-        r'|"'
+        ur'(?:'
+            ur'/(?:\?\?/\n|\\\n)*\*.*?\*(?:\?\?/\n|\\\n)*/'
+            ur'|/(?:\?\?/\n|\\\n)*/(?:\?\?/\n|\\\n|[^\n])*\n'
+            ur'|[^"]'
+        ur')+'
+        ur'|"'
         , re.S)
     # RE used inside strings.
     tok2 = re.compile(
-        r'(?:'
-            r"\?\?[='()!<>-]"   # valid trigraph except ??/ (backslash)
-            r"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])"
+        ur'(?:'
+            ur"\?\?[='()!<>-]"  # valid trigraph except ??/ (backslash)
+            ur"|(?:\?\?/|\\)(?:\?\?[/='()!<>-]|[^\n])"
                                 # backslash trigraph or actual backslash,
                                 # followed by any trigraph or non-newline
-            r'|(?!\?\?/\n|\\\n|"|\n).'
+            ur'|(?!\?\?/\n|\\\n|"|\n).'
                                 # any character that doesn't start a trigraph/
                                 # backslash escape followed by a newline
                                 # or is a newline or double quote, as we're
                                 # interested in all those individually.
-        r')+'                   # as many of those as possible
-        r'|\?\?/\n|\\\n|\n|"'   # or any of those individually
+        ur')+'                  # as many of those as possible
+        ur'|\?\?/\n|\\\n|\n|"'  # or any of those individually
         )
 
     pos = 0
@@ -134,7 +134,7 @@ def PreparePreproc(script):
     while match:
         matched = match.group(0)
         pos += len(matched)
-        if matched == '"':
+        if matched == u'"':
             s += matched
             nlines = col = 0
             match2 = tok2.search(script, pos)
@@ -142,24 +142,24 @@ def PreparePreproc(script):
                 matched2 = match2.group(0)
                 pos += len(matched2)
 
-                if matched2 == '\\\n' or matched2 == '??/\n':
+                if matched2 == u'\\\n' or matched2 == u'??/\n':
                     nlines += 1
                     col = 0
                     match2 = tok2.search(script, pos)
                     continue
-                if matched2 == '"':
+                if matched2 == u'"':
                     if nlines:
-                        if script[pos:pos+1] == '\n':
+                        if script[pos:pos+1] == u'\n':
                             col = -1 # don't add spaces if not necessary
                         # col misses the quote added here, so add 1
-                        s += '"' + '\n'*nlines + ' '*(col+1)
+                        s += u'"' + u'\n'*nlines + u' '*(col+1)
                     else:
-                        s += '"'
+                        s += u'"'
                     break
-                if matched2 == '\n':
+                if matched2 == u'\n':
                     nlines += 1
                     col = 0
-                    s += '\\n'
+                    s += u'\\n'
                 else:
                     col += len(matched2)
                     s += matched2
@@ -628,7 +628,7 @@ def main(argv):
         if preproc != 'none':
             # At this point, for the external preprocessor to work we need the
             # script as a byte array, not as unicode, but it should be UTF-8.
-            script = PreparePreproc(script)
+            script = PreparePreproc(script.decode('utf8')).encode('utf8')
             if preproc == 'mcpp':
                 # As a special treatment for mcpp, we force it to output its
                 # macros so we can read if USE_xxx are defined. With GCC that