# coven/modules/ufbx/misc/unicode_test_gen.py

import math
import random
import argparse
import transmute_fbx as tfbx

Node = tfbx.Node
Value = tfbx.Value

def max_codepoint(width):
    """Return the largest codepoint this script encodes in `width` bytes.

    Width 0 maps to -1, so that `max_codepoint(width - 1) + 1` is the
    smallest codepoint of `width` even for one-byte sequences.

    Raises:
        ValueError: if `width` is not one of 0, 1, 2, 3, 4.
    """
    limits = (
        (0, -1),
        (1, 0x7f),
        (2, 0x7ff),
        (3, 0xffff),
        (4, 0x10_ffff),
    )
    for known_width, limit in limits:
        if width == known_width:
            return limit
    raise ValueError(f"Unsupported width: {width}")

def codepoint_to_utf8(codepoint, width, *, allow_overflow=False):
    """Encode `codepoint` as a UTF-8 shaped sequence of exactly `width` bytes.

    No validity checks are made on the result: the caller picks the width,
    so the output may be a sequence that a strict decoder rejects.

    Args:
        codepoint: Integer value to encode.
        width: Number of output bytes, 1 to 4.
        allow_overflow: When false, assert that `codepoint` does not exceed
            `max_codepoint(width)`. When true the check is skipped; for
            widths 2-4 any bits that do not fit the payload are masked off.

    Returns:
        The encoded `bytes`.

    Raises:
        ValueError: if `width` is not 1, 2, 3 or 4.
    """

    if not allow_overflow:
        assert codepoint <= max_codepoint(width)

    # A single byte carries the value as-is, without tag bits or masking.
    if width == 1:
        return bytes([codepoint])

    # (width, lead byte tag, lead byte payload mask, shift for each byte)
    layouts = (
        (2, 0b1100_0000, 0b0001_1111, (6, 0)),
        (3, 0b1110_0000, 0b0000_1111, (12, 6, 0)),
        (4, 0b1111_0000, 0b0000_0111, (18, 12, 6, 0)),
    )
    for layout_width, lead_tag, lead_mask, shifts in layouts:
        if width == layout_width:
            lead_shift, *tail_shifts = shifts
            encoded = [lead_tag | ((codepoint >> lead_shift) & lead_mask)]
            # Every continuation byte is 0b10xx_xxxx with six payload bits.
            for shift in tail_shifts:
                encoded.append(0b1000_0000 | ((codepoint >> shift) & 0b0011_1111))
            return bytes(encoded)

    raise ValueError(f"Unsupported width: {width}")

def int_to_bytes(value):
    """Return `value` as the shortest unsigned big-endian byte string.

    Zero needs no bytes at all and is returned as `b""`.

    The length is derived from `int.bit_length()`, which is exact for any
    size of integer. A floating-point logarithm rounds for large inputs and
    can come out one byte short (for example for `2**64`).

    Args:
        value: Non-negative integer to convert.

    Returns:
        The big-endian `bytes` of `value` without leading zero bytes.

    Raises:
        ValueError: if `value` is negative.
    """
    if value < 0:
        raise ValueError(f"Cannot convert negative value to bytes: {value}")
    num_bytes = (value.bit_length() + 7) // 8
    return value.to_bytes(num_bytes, "big", signed=False)

def valid_utf8(utf8):
    """Return True if `utf8` decodes with Python's strict UTF-8 codec."""
    try:
        utf8.decode("utf-8")
    except UnicodeDecodeError:
        return False
    else:
        return True

# Byte strings to embed in the generated file. A set is used so the
# generators below may overlap without producing duplicate entries.
fuzz_encodings = {
    b"",
    b"\x00",
    b"\xff",
    b"\xff\xff",
    b"\xff\xff\xff",
    b"\xff\xff\xff\xff",
    b"Hello world",
    b"Hello\xffworld",
}

# Boundary scan: for every width, walk consecutive codepoints and keep both
# sides of each place where the encoding flips between valid and invalid.
# The previous encoding and its validity are carried forward so that each
# codepoint is encoded and decoded only once.
for width in range(1, 4+1):
    prev_enc = codepoint_to_utf8(0, width)
    prev_valid = valid_utf8(prev_enc)
    for codepoint in range(1, max_codepoint(width)):
        cur_enc = codepoint_to_utf8(codepoint, width)
        cur_valid = valid_utf8(cur_enc)
        if cur_valid != prev_valid:
            fuzz_encodings.add(prev_enc)
            fuzz_encodings.add(cur_enc)
        prev_enc, prev_valid = cur_enc, cur_valid

# For every width: the first codepoint past the previous width's maximum
# and the largest codepoint of this width.
for width in range(1, 4+1):
    fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width - 1) + 1, width))
    fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width), width))

# Quadratically spaced sample of codepoints, encoded at every width that
# can hold them (including widths wider than necessary).
for width in range(1, 4+1):
    for n in range(0x10ffff):
        codepoint = (n*n)//7 + n
        if codepoint > max_codepoint(width):
            break
        fuzz_encodings.add(codepoint_to_utf8(codepoint, width))

# Raw big-endian integers: every value below 0x400 ...
for n in range(0x400):
    fuzz_encodings.add(int_to_bytes(n))

# ... and every 64th value below 0x10000.
for n in range(0, 0x1_00_00, 64):
    fuzz_encodings.add(int_to_bytes(n))

# Four-byte sequences for values above 0x10FFFF, up to 0x1FFFFF, the
# largest value the 21 payload bits of a four-byte sequence can carry.
fuzz_encodings.add(codepoint_to_utf8(max_codepoint(4) + 1, 4, allow_overflow=True))
for n in range(32):
    codepoint = 0x10FFFF + n**4
    assert codepoint <= 0x1FFFFF
    fuzz_encodings.add(codepoint_to_utf8(codepoint, 4, allow_overflow=True))
fuzz_encodings.add(codepoint_to_utf8(0x1FFFFF, 4, allow_overflow=True))

# Random byte strings of length 1 to 4. The fixed seed keeps the generated
# set reproducible between runs.
random.seed(1)
for _ in range(200):
    for k in range(1, 4+1):
        fuzz_encodings.add(bytes(random.choices(range(256), k=k)))

# Split the encodings into those that decode as UTF-8 (`good`) and those
# that do not (`bad`). Iteration is ordered by length first and then by
# byte value, so both lists come out in a deterministic order.
good, bad = [], []
for enc in sorted(fuzz_encodings, key=lambda raw: (len(raw), raw)):
    (good if valid_utf8(enc) else bad).append(enc)

def fmt_fbx_props(encodings, ascii):
    """Yield one `P` node per encoding.

    Each node carries five string values: the lowercase hex dump of the
    encoding, three empty strings, and the encoding itself.

    Args:
        encodings: Iterable of byte strings.
        ascii: When true, every double quote in the final value is replaced
            with `&quot;`; otherwise the bytes are passed through untouched.
    """
    for enc in encodings:
        # The hex dump identifies the entry independently of its raw bytes.
        hex_name = "".join(format(byte, "02x") for byte in enc).encode("ascii")
        payload = enc.replace(b"\"", b"&quot;") if ascii else enc

        values = [Value(b"S", hex_name)]
        values.extend(Value(b"S", b"") for _ in range(3))
        values.append(Value(b"S", payload))
        yield Node(b"P", values, [])

def fmt_fbx_model_name(name, ascii):
    """Return the string `Value` naming a model called `name`.

    With `ascii` true the name is written as `Model::<name>`; otherwise as
    `<name>` followed by the bytes 0x00 0x01 and `Model`.
    """
    full_name = f"Model::{name}" if ascii else f"{name}\x00\x01Model"
    return Value(b"S", full_name.encode("utf-8"))

def fmt_fbx_root(ascii):
    """Build the node tree of the generated test file.

    The root holds a single `Objects` node with three `Model` children,
    each owning one `Properties70` node:

    * id 1, "Good": the module-level `good` encodings,
    * id 2, "Bad": the module-level `bad` encodings,
    * id 3, "Ok": every `good` encoding prefixed with a 0xff byte.

    Args:
        ascii: Forwarded to `fmt_fbx_model_name` and `fmt_fbx_props`.

    Returns:
        The root `Node`.
    """
    root_node = Node(b"", [], [])

    objects_node = Node(b"Objects", [], [])
    root_node.children.append(objects_node)

    # (object id, model name, encodings stored as its properties)
    model_specs = (
        (1, "Good", good),
        (2, "Bad", bad),
        (3, "Ok", [b"\xff" + enc for enc in good]),
    )
    for object_id, label, encodings in model_specs:
        header = [
            Value(b"L", object_id),
            fmt_fbx_model_name(label, ascii),
            Value(b"S", b"Mesh"),
        ]
        model_node = Node(b"Model", header, [])
        objects_node.children.append(model_node)

        props_node = Node(b"Properties70", [], list(fmt_fbx_props(encodings, ascii)))
        model_node.children.append(props_node)

    return root_node

# Command line: a single positional argument naming the file to write.
parser = argparse.ArgumentParser(prog="unicode_test_gen.py")
parser.add_argument("outfile", help="Output filename")
argv = parser.parse_args()

# Build the binary flavour of the tree and write it to the requested path.
root = fmt_fbx_root(ascii=False)
with open(argv.outfile, mode="wb") as f:
    # NOTE(review): 7500 / False are presumably the FBX version and a
    # big-endian flag -- confirm against transmute_fbx.BinaryFormat.
    binary_format = tfbx.BinaryFormat(7500, False)
    tfbx.binary_dump_root(f, root, binary_format, b"")