From 0b7d04e5ff7c936283e121bbb8a6fc82ded0b0f3 Mon Sep 17 00:00:00 2001
From: Sei Lisa <sei-lisa@email.fake>
Date: Thu, 8 Dec 2022 13:22:54 +0100
Subject: [PATCH] Deal with changes in llChar, strengthen some tests

Previously, llChar formed a UTF-8-1993 string with the given code and converted that, resulting in multiple question marks when the conversion to Unicode forced by Mono caused errors in multiple characters. They have changed the implementation: it now also considers U+FFFF invalid and returns only a single U+FFFD character if the input is invalid, and LSO behaves the same as Mono (no UTF-8-1993 anymore).

We've also detected problems on Windows (who else would it be) with the Unicode "astral" planes (planes beyond the Basic Multilingual Plane), so now there are new tests that include characters > U+FFFF. And since some builds of Python 2 use UTF-16 internally, we also check llGetSubString and friends with positions after an astral plane character. These tests are currently failing under Windows, as there are numerous encoding and line ending problems happening on that OS, especially with Python 3.
---
 lslopt/lslbasefuncs.py                    | 46 ++---------------------
 unit_tests/expr.suite/llord-char-hash.lsl |  3 ++
 unit_tests/expr.suite/llord-char-hash.out | 25 ++++++------
 unit_tests/expr.suite/string-funcs.lsl    |  7 +++-
 unit_tests/expr.suite/string-funcs.out    |  7 +++-
 5 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/lslopt/lslbasefuncs.py b/lslopt/lslbasefuncs.py
index 1ba6375..683b15d 100644
--- a/lslopt/lslbasefuncs.py
+++ b/lslopt/lslbasefuncs.py
@@ -1153,50 +1153,10 @@ def llCeil(f):
 
 def llChar(code):
     code = fi(code)
-    # The result is consistent with a conversion of the codepoint to
-    # UTF-8-1993, then using InternalUTF8toString on the result.
-    # A thorough test shows that llChar(n) equals llUnescapeURL(utf8_1993)
-    # up to codepoint 0x13FFFF. Furthermore llChar(0x200000) returns "?????",
-    # and llChar(0x7FFFFFFF) returns "??????", which are also consistent with
-    # that. LSO also returns UTF-8-1993 for codepoints > 0x10FFFF. So, the
-    # internal implementation is likely to form a UTF8-1993 string from the
-    # codepoint and then convert that to string, like this:
-#    if code < 0:
-#       return u'?'
-#    if code < 0x80:
-#        s = (code,)
-#    elif code < 0x800:
-#        s = (0xC0+(code >> 6), 0x80+(code&0x3F))
-#    elif code < 0x10000:
-#        s = (0xE0+(code >> 12), 0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    elif code < 0x200000:
-#        s = (0xF0+(code >> 18), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    elif code < 0x4000000:
-#        s = (0xF8+(code >> 24),
-#            0x80+((code >> 18)&0x3F), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    else:
-#        s = (0xFC+(code >> 30), 0x80+((code >> 24)&0x3F),
-#            0x80+((code >> 18)&0x3F), 0x80+((code >> 12)&0x3F),
-#            0x80+((code >> 6)&0x3F), 0x80+(code&0x3F))
-#    return zstr(InternalUTF8toString(bytearray(s)))
 
-    # Here's an alternative, simpler implementation that only works for Mono:
-    if lslcommon.LSO:
-        raise ELSLCantCompute
-    if code <= 0 or code > 0x10FFFF:
-        if code == 0:
-            return u''
-        if code < 0:
-            return u'?'
-        if code >= 0x4000000:
-            return u'??????'
-        if code >= 0x200000:
-            return u'?????'
-        return u'????'
-    if (0xD800 <= code <= 0xDFFF) or code == 0xFFFE:
-        return u'???'
+    if (not 1 <= code <= 0x10FFFF or 0xD800 <= code <= 0xDFFF
+            or code == 0xFFFE or code == 0xFFFF):
+        return u'' if code == 0 else u'\uFFFD'
     return unichr(code)
 
 def llCos(f):
diff --git a/unit_tests/expr.suite/llord-char-hash.lsl b/unit_tests/expr.suite/llord-char-hash.lsl
index 1a8a3dc..85cf564 100644
--- a/unit_tests/expr.suite/llord-char-hash.lsl
+++ b/unit_tests/expr.suite/llord-char-hash.lsl
@@ -5,12 +5,15 @@
 , llOrd(".", 1)
 , llOrd(".", 2)
 , llOrd("ð", 0)
+, llOrd("𝄞𝐀", -1)
 , llOrd("𝄞𝐀", 0)
 , llOrd("𝄞𝐀", 1)
 , llOrd("𝄞𝐀", 2)
 , llOrd("𝄞𝐀", 3)
+, llOrd("𝄞𝐀", 4)
 , llOrd(JSON_TRUE, 0)
 , llOrd(llUnescapeURL("%EF%BF%BF"), 0)
+, llEscapeURL(llChar(-123456789))
 , llEscapeURL(llChar(-123))
 , llEscapeURL(llChar(-1))
 , llEscapeURL(llChar(0))
diff --git a/unit_tests/expr.suite/llord-char-hash.out b/unit_tests/expr.suite/llord-char-hash.out
index c33b750..117d99f 100644
--- a/unit_tests/expr.suite/llord-char-hash.out
+++ b/unit_tests/expr.suite/llord-char-hash.out
@@ -5,30 +5,33 @@
 , 0
 , 0
 , 240
+, 119808
 , 119070
 , 119808
 , 0
 , 0
+, 0
 , 64982
 , 65535
-, "%3F"
-, "%3F"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , ""
 , "%01"
 , "%C2%A9"
 , "%C5%8D"
 , "%E2%80%90"
-, "%3F%3F%3F"
-, "%3F%3F%3F"
-, "%3F%3F%3F"
-, "%EF%BF%BF"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , "%F0%9F%98%80"
 , "%F4%8F%BF%BF"
-, "%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F%3F"
-, "%3F%3F%3F%3F%3F%3F"
-, "%3F"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
+, "%EF%BF%BD"
 , 1203819346
 , 0
 , 1172851538
diff --git a/unit_tests/expr.suite/string-funcs.lsl b/unit_tests/expr.suite/string-funcs.lsl
index 5662a17..156d2ab 100644
--- a/unit_tests/expr.suite/string-funcs.lsl
+++ b/unit_tests/expr.suite/string-funcs.lsl
@@ -182,9 +182,14 @@
 , llGetSubString("abcd",  9,  3)
 , llGetSubString("abcd",  9,  4)
 , llGetSubString("abcd",  9,  5)
+, llGetSubString("😀bcd",  0,  0)
+, llGetSubString("😀bcd",  1,  1)
+, llGetSubString("😀bcd",  2,  2)
+, llGetSubString("😀bcd",  3,  3)
+, llGetSubString("😀bcd",  4,  4)
 , llGetSubString("", 0, -1)
 , llStringLength("")
-, llStringLength("÷½¬⅛⅜⅝⅞±°z")
+, llStringLength("÷½¬⅛⅜⅝⅞😀±°z")
 , llSubStringIndex("x", "blah")
 , llSubStringIndex("", "")
 , llSubStringIndex("", "x")
diff --git a/unit_tests/expr.suite/string-funcs.out b/unit_tests/expr.suite/string-funcs.out
index 1d19099..09466a3 100644
--- a/unit_tests/expr.suite/string-funcs.out
+++ b/unit_tests/expr.suite/string-funcs.out
@@ -182,9 +182,14 @@
 , "abcd"
 , "abcd"
 , "abcd"
+, "😀"
+, "b"
+, "c"
+, "d"
+, ""
 , ""
 , 0
-, 10
+, 11
 , -1
 , 0
 , -1