Getting to the way it's supposed to be!
modules/ufbx/misc/unicode_test_gen.py (new file, 169 lines)
@@ -0,0 +1,169 @@
import math
import random
import argparse
import transmute_fbx as tfbx

Node = tfbx.Node
Value = tfbx.Value

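# Generates a binary FBX test file whose string properties contain a large set
# of valid and invalid UTF-8 byte sequences (assembled into fuzz_encodings below).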
def max_codepoint(width):
    if width == 0:
        return -1
    elif width == 1:
        return 0x7f
    elif width == 2:
        return 0x7ff
    elif width == 3:
        return 0xffff
    elif width == 4:
        return 0x10_ffff
    else:
        raise ValueError(f"Unsupported width: {width}")

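# codepoint_to_utf8() performs no UTF-8 well-formedness checks beyond the width
# limit, so it can emit overlong and surrogate encodings, and with
# allow_overflow=True, codepoints past U+10FFFF.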
def codepoint_to_utf8(codepoint, width, *, allow_overflow=False):
    """Unrestricted codepoint to UTF-8"""

    if not allow_overflow:
        assert codepoint <= max_codepoint(width)

    c = codepoint
    if width == 1:
        return bytes([c])
    elif width == 2:
        return bytes([
            0b1100_0000 | ((c >> 6) & 0b0001_1111),
            0b1000_0000 | ((c >> 0) & 0b0011_1111),
        ])
    elif width == 3:
        return bytes([
            0b1110_0000 | ((c >> 12) & 0b0000_1111),
            0b1000_0000 | ((c >> 6) & 0b0011_1111),
            0b1000_0000 | ((c >> 0) & 0b0011_1111),
        ])
    elif width == 4:
        return bytes([
            0b1111_0000 | ((c >> 18) & 0b0000_0111),
            0b1000_0000 | ((c >> 12) & 0b0011_1111),
            0b1000_0000 | ((c >> 6) & 0b0011_1111),
            0b1000_0000 | ((c >> 0) & 0b0011_1111),
        ])
    else:
        raise ValueError(f"Unsupported width: {width}")

def int_to_bytes(value):
    num_bytes = int(math.ceil(math.log2(value + 1) / 8))
    return value.to_bytes(num_bytes, "big", signed=False)

def valid_utf8(utf8):
    try:
        utf8.decode("utf-8")
        return True
    except UnicodeDecodeError:
        return False

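# Hand-picked seed cases; the loops below extend the set with boundary,
# overlong, out-of-range, and random byte sequences.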
fuzz_encodings = {
    b"",
    b"\x00",
    b"\xff",
    b"\xff\xff",
    b"\xff\xff\xff",
    b"\xff\xff\xff\xff",
    b"Hello world",
    b"Hello\xffworld",
}

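# Walk every codepoint at every width and keep adjacent pairs whose UTF-8
# validity differs, capturing the exact valid/invalid boundaries.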
for width in range(1, 4+1):
    for codepoint in range(max_codepoint(width) - 1):
        prev = codepoint_to_utf8(codepoint, width)
        next = codepoint_to_utf8(codepoint + 1, width)
        if valid_utf8(prev) != valid_utf8(next):
            fuzz_encodings.add(prev)
            fuzz_encodings.add(next)

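# Per-width extremes: the smallest codepoint that requires each width and the
# largest codepoint that width can represent.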
for width in range(1, 4+1):
    fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width - 1) + 1, width))
    fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width), width))

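# Sparse quadratic sweep of the codepoint space at every width; widths wider
# than a codepoint needs intentionally produce overlong encodings.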
for width in range(1, 4+1):
    for n in range(0x10ffff):
        codepoint = (n*n)//7 + n
        if codepoint > max_codepoint(width):
            break
        fuzz_encodings.add(codepoint_to_utf8(codepoint, width))

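# Raw big-endian integer byte strings: dense for small values, strided for
# larger ones, to cover byte patterns outside the codepoint-derived cases.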
for n in range(0x400):
    fuzz_encodings.add(int_to_bytes(n))

for n in range(0, 0x1_00_00, 64):
    fuzz_encodings.add(int_to_bytes(n))

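# Codepoints beyond U+10FFFF, up to the 4-byte encoding limit of U+1FFFFF:
# structurally well-formed sequences that are never valid UTF-8.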
fuzz_encodings.add(codepoint_to_utf8(max_codepoint(4) + 1, 4, allow_overflow=True))
for n in range(32):
    codepoint = 0x10FFFF + n**4
    assert codepoint <= 0x1FFFFF
    fuzz_encodings.add(codepoint_to_utf8(codepoint, 4, allow_overflow=True))
fuzz_encodings.add(codepoint_to_utf8(0x1FFFFF, 4, allow_overflow=True))

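# Fixed-seed random byte strings of length 1..4, kept deterministic so the
# generated file is reproducible.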
random.seed(1)
for n in range(200):
    for k in range(1, 4+1):
        fuzz_encodings.add(bytes(random.choices(range(256), k=k)))

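# Split the collected encodings into valid ("good") and invalid ("bad") UTF-8,
# sorted by length and contents for a stable output order.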
good = []
bad = []
for enc in sorted(fuzz_encodings, key=lambda e: (len(e), e)):
    if valid_utf8(enc):
        good.append(enc)
    else:
        bad.append(enc)

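# Each test string becomes a Properties70 "P" node: the first value (the
# property name) is the hex dump of the bytes, the last value is the raw bytes.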
def fmt_fbx_props(encodings, ascii):
    for enc in encodings:
        hex = b"".join(f"{x:02x}".encode("ascii") for x in enc)
        if ascii:
            # escape double quotes for ASCII FBX output
            string = enc.replace(b"\"", b"&quot;")
        else:
            string = enc
        yield Node(b"P", [Value(b"S", hex), Value(b"S", b""), Value(b"S", b""), Value(b"S", b""), Value(b"S", string)], [])

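# Model name formats: ASCII FBX uses "Model::Name", binary FBX stores the name
# and class separated by the bytes 0x00 0x01 ("Name\x00\x01Model").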
def fmt_fbx_model_name(name, ascii):
    if ascii:
        return Value(b"S", f"Model::{name}".encode("utf-8"))
    else:
        return Value(b"S", f"{name}\x00\x01Model".encode("utf-8"))

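# Scene layout: three Model nodes under Objects. "Good" carries valid UTF-8
# properties, "Bad" carries invalid ones, and "Ok" carries valid strings
# prefixed with a single invalid 0xff byte.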
def fmt_fbx_root(ascii):
    fbx_root = Node(b"", [], [])

    fbx_objects = Node(b"Objects", [], [])
    fbx_root.children.append(fbx_objects)

    fbx_good = Node(b"Model", [Value(b"L", 1), fmt_fbx_model_name("Good", ascii), Value(b"S", b"Mesh")], [])
    fbx_objects.children.append(fbx_good)

    fbx_good_props = Node(b"Properties70", [], list(fmt_fbx_props(good, ascii)))
    fbx_good.children.append(fbx_good_props)

    fbx_bad = Node(b"Model", [Value(b"L", 2), fmt_fbx_model_name("Bad", ascii), Value(b"S", b"Mesh")], [])
    fbx_objects.children.append(fbx_bad)

    fbx_bad_props = Node(b"Properties70", [], list(fmt_fbx_props(bad, ascii)))
    fbx_bad.children.append(fbx_bad_props)

    ok = [b"\xff" + enc for enc in good]

    fbx_ok = Node(b"Model", [Value(b"L", 3), fmt_fbx_model_name("Ok", ascii), Value(b"S", b"Mesh")], [])
    fbx_objects.children.append(fbx_ok)

    fbx_ok_props = Node(b"Properties70", [], list(fmt_fbx_props(ok, ascii)))
    fbx_ok.children.append(fbx_ok_props)

    return fbx_root

parser = argparse.ArgumentParser("unicode_test_gen.py")
parser.add_argument("outfile", help="Output filename")
argv = parser.parse_args()

root = fmt_fbx_root(ascii=False)
with open(argv.outfile, "wb") as f:
    tfbx.binary_dump_root(f, root, tfbx.BinaryFormat(7500, False), b"")

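# Usage sketch (assuming transmute_fbx.py is importable from this directory;
# the output filename is only an example):
#   python unicode_test_gen.py unicode.fbx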