From dc655e3501a16b5fcf3fc91de025f0fd9506e912 Mon Sep 17 00:00:00 2001 From: Sei Lisa Date: Sun, 11 Dec 2022 20:39:44 +0100 Subject: [PATCH] Monkey-patch some string functions in narrow string builds The official distribution of Python 2.7 on Windows is built with "narrow strings" (UTF-16 strings with no proper indexing or length). This makes some tests fail. "Fix" this by monkey-patching a few functions and using a wrapping unicode class, as we can't monkey-patch the actual unicode type. This is very fragile code, but it's the best we could do given the limitations. --- lslopt/lslbasefuncs.py | 6 ++-- strutil.py | 67 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/lslopt/lslbasefuncs.py b/lslopt/lslbasefuncs.py index 683b15d..cdd0cb2 100644 --- a/lslopt/lslbasefuncs.py +++ b/lslopt/lslbasefuncs.py @@ -674,6 +674,8 @@ def InternalUTF8toString(s): # type check. Same for llGetSubString and llList2List. They are all joined into # one single function. def InternalGetDeleteSubSequence(val, start, end, isGet): + if type(val) == unicode: + val = uniwrap(val) start = fi(start) end = fi(end) L = len(val) @@ -1298,7 +1300,7 @@ def llGetSubString(s, start, end): return InternalGetDeleteSubSequence(s, start, end, isGet=True) def llHash(s): - s = fs(s) + s = uniwrap(fs(s)) hash = 0 for i in s: hash = (hash * 65599 + ord(i)) & 0xFFFFFFFF @@ -1718,7 +1720,7 @@ def llModPow(base, exp, mod): return S32(ret) def llOrd(val, index): - val = fs(val) + val = uniwrap(fs(val)) index = fi(index) L = len(val) if -L <= index < L: diff --git a/strutil.py b/strutil.py index 85bfa06..4a6e4fe 100644 --- a/strutil.py +++ b/strutil.py @@ -22,12 +22,15 @@ import codecs codecs.register(lambda x: codecs.lookup('utf8') if x == 'cp65001' else None) import sys + if sys.version_info.major >= 3: unicode = str unichr = chr xrange = range python3 = True python2 = False + python2Narrow = False + uniwrap = unicode def str2u(s, enc=None): """Convert a native Python3 str to Unicode. This is a NOP.""" @@ -57,6 +60,8 @@ else: xrange = xrange python2 = True python3 = False + python2Narrow = False + uniwrap = unicode def str2u(s, enc=None): """Convert a native Python2 str to Unicode.""" @@ -80,6 +85,68 @@ else: """Convert Bytes or Unicode to native Python 2 str.""" return s if type(s) == str else u2str(s, enc) + if len(u'\U00010001') == 2: + # Narrow character build (UTF-16 strings) + # Monkey-patch the relevant functions + python2Narrow = True + _unichr = unichr + _ord = ord + _len = len + + def unichr(n): + if not (65536 <= n < 0x110000): + return _unichr(n) + return ('\\U%08X' % n).decode('unicode-escape') + + def ord(x): + if isinstance(x, unicode) and _len(x) == 2: + x = unicode(x) + if 0xD800 <= _ord(x[0]) < 0xDC00: + return 65536 + ((_ord(x[0]) & 0x3FF) << 10 + | (_ord(x[1]) & 0x3FF)) + return _ord(x) + + def len(x): + if isinstance(x, unicode): + return _len(x.encode('utf-32le')) >> 2 + return _len(x) + + # Alas, we can't monkey-patch the unicode class' __getitem__ and + # __getslice__ methods; we need a workaround. + class uniwrap(unicode): + def __getslice__(self, start, stop): + lim = sys.maxint >> 2 + if start < 0: start = 0 + if stop < 0: stop = 0 + if start < lim: + start <<= 2 + else: + start = sys.maxint + if stop < lim: + stop <<= 2 + else: + stop = sys.maxint + return self.encode('utf-32le')[start:stop].decode( + 'utf-32le') + def __getitem__(self, item): + if type(item) == slice: + start = item.start + stop = item.stop + step = item.step + if start is not None: + start <<= 2 + if stop is not None: + stop <<= 2 + if step is not None: + step <<= 2 + return self.encode('utf-32le')[start:stop:step].decode( + 'utf-32le') + u = self.encode('utf-32le') + item <<= 2 + if item >= _len(u): + return u[item] # raise IndexError, as slicing doesn't + return u[item:(item+4 if item != -4 else None)].decode( + 'utf-32le') def b2u(s, enc=None): """Bytes to Unicode"""