170 lines
5.0 KiB
Python
170 lines
5.0 KiB
Python
import math
|
|
import random
|
|
import argparse
|
|
import transmute_fbx as tfbx
|
|
|
|
Node = tfbx.Node
|
|
Value = tfbx.Value
|
|
|
|
def max_codepoint(width):
|
|
if width == 0:
|
|
return -1
|
|
elif width == 1:
|
|
return 0x7f
|
|
elif width == 2:
|
|
return 0x7ff
|
|
elif width == 3:
|
|
return 0xffff
|
|
elif width == 4:
|
|
return 0x10_ffff
|
|
else:
|
|
raise ValueError(f"Unsupported width: {width}")
|
|
|
|
def codepoint_to_utf8(codepoint, width, *, allow_overflow=False):
|
|
"""Unrestricted codepoint to UTF-8"""
|
|
|
|
if not allow_overflow:
|
|
assert codepoint <= max_codepoint(width)
|
|
|
|
c = codepoint
|
|
if width == 1:
|
|
return bytes([c])
|
|
elif width == 2:
|
|
return bytes([
|
|
0b1100_0000 | ((c >> 6) & 0b0001_1111),
|
|
0b1000_0000 | ((c >> 0) & 0b0011_1111),
|
|
])
|
|
elif width == 3:
|
|
return bytes([
|
|
0b1110_0000 | ((c >> 12) & 0b0000_1111),
|
|
0b1000_0000 | ((c >> 6) & 0b0011_1111),
|
|
0b1000_0000 | ((c >> 0) & 0b0011_1111),
|
|
])
|
|
elif width == 4:
|
|
return bytes([
|
|
0b1111_0000 | ((c >> 18) & 0b0000_0111),
|
|
0b1000_0000 | ((c >> 12) & 0b0011_1111),
|
|
0b1000_0000 | ((c >> 6) & 0b0011_1111),
|
|
0b1000_0000 | ((c >> 0) & 0b0011_1111),
|
|
])
|
|
else:
|
|
raise ValueError(f"Unsupported width: {width}")
|
|
|
|
def int_to_bytes(value):
|
|
num_bytes = int(math.ceil(math.log2(value + 1) / 8))
|
|
return value.to_bytes(num_bytes, "big", signed=False)
|
|
|
|
def valid_utf8(utf8):
|
|
try:
|
|
utf8.decode("utf-8")
|
|
return True
|
|
except UnicodeDecodeError:
|
|
return False
|
|
|
|
fuzz_encodings = {
|
|
b"",
|
|
b"\x00",
|
|
b"\xff",
|
|
b"\xff\xff",
|
|
b"\xff\xff\xff",
|
|
b"\xff\xff\xff\xff",
|
|
b"Hello world",
|
|
b"Hello\xffworld",
|
|
}
|
|
|
|
for width in range(1, 4+1):
|
|
for codepoint in range(max_codepoint(width) - 1):
|
|
prev = codepoint_to_utf8(codepoint, width)
|
|
next = codepoint_to_utf8(codepoint + 1, width)
|
|
if valid_utf8(prev) != valid_utf8(next):
|
|
fuzz_encodings.add(prev)
|
|
fuzz_encodings.add(next)
|
|
|
|
for width in range(1, 4+1):
|
|
fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width - 1) + 1, width))
|
|
fuzz_encodings.add(codepoint_to_utf8(max_codepoint(width), width))
|
|
|
|
for width in range(1, 4+1):
|
|
for n in range(0x10ffff):
|
|
codepoint = (n*n)//7 + n
|
|
if codepoint > max_codepoint(width):
|
|
break
|
|
fuzz_encodings.add(codepoint_to_utf8(codepoint, width))
|
|
|
|
for n in range(0x400):
|
|
fuzz_encodings.add(int_to_bytes(n))
|
|
|
|
for n in range(0, 0x1_00_00, 64):
|
|
fuzz_encodings.add(int_to_bytes(n))
|
|
|
|
fuzz_encodings.add(codepoint_to_utf8(max_codepoint(4) + 1, 4, allow_overflow=True))
|
|
for n in range(32):
|
|
codepoint = 0x10FFFF + n**4
|
|
assert codepoint <= 0x1FFFFF
|
|
fuzz_encodings.add(codepoint_to_utf8(codepoint, 4, allow_overflow=True))
|
|
fuzz_encodings.add(codepoint_to_utf8(0x1FFFFF, 4, allow_overflow=True))
|
|
|
|
random.seed(1)
|
|
for n in range(200):
|
|
for k in range(1, 4+1):
|
|
fuzz_encodings.add(bytes(random.choices(range(256), k=k)))
|
|
|
|
good = []
|
|
bad = []
|
|
for enc in sorted(fuzz_encodings, key=lambda e: (len(e), e)):
|
|
if valid_utf8(enc):
|
|
good.append(enc)
|
|
else:
|
|
bad.append(enc)
|
|
|
|
def fmt_fbx_props(encodings, ascii):
|
|
for enc in encodings:
|
|
hex = b"".join(f"{x:02x}".encode("ascii") for x in enc)
|
|
if ascii:
|
|
string = enc.replace(b"\"", b""")
|
|
else:
|
|
string = enc
|
|
yield Node(b"P", [Value(b"S", hex), Value(b"S", b""), Value(b"S", b""), Value(b"S", b""), Value(b"S", string)], [])
|
|
|
|
def fmt_fbx_model_name(name, ascii):
|
|
if ascii:
|
|
return Value(b"S", f"Model::{name}".encode("utf-8"))
|
|
else:
|
|
return Value(b"S", f"{name}\x00\x01Model".encode("utf-8"))
|
|
|
|
def fmt_fbx_root(ascii):
|
|
fbx_root = Node(b"", [], [])
|
|
|
|
fbx_objects = Node(b"Objects", [], [])
|
|
fbx_root.children.append(fbx_objects)
|
|
|
|
fbx_good = Node(b"Model", [Value(b"L", 1), fmt_fbx_model_name("Good", ascii), Value(b"S", b"Mesh")], [])
|
|
fbx_objects.children.append(fbx_good)
|
|
|
|
fbx_good_props = Node(b"Properties70", [], list(fmt_fbx_props(good, ascii)))
|
|
fbx_good.children.append(fbx_good_props)
|
|
|
|
fbx_bad = Node(b"Model", [Value(b"L", 2), fmt_fbx_model_name("Bad", ascii), Value(b"S", b"Mesh")], [])
|
|
fbx_objects.children.append(fbx_bad)
|
|
|
|
fbx_bad_props = Node(b"Properties70", [], list(fmt_fbx_props(bad, ascii)))
|
|
fbx_bad.children.append(fbx_bad_props)
|
|
|
|
ok = [b"\xff" + enc for enc in good]
|
|
|
|
fbx_ok = Node(b"Model", [Value(b"L", 3), fmt_fbx_model_name("Ok", ascii), Value(b"S", b"Mesh")], [])
|
|
fbx_objects.children.append(fbx_ok)
|
|
|
|
fbx_ok_props = Node(b"Properties70", [], list(fmt_fbx_props(ok, ascii)))
|
|
fbx_ok.children.append(fbx_ok_props)
|
|
|
|
return fbx_root
|
|
|
|
parser = argparse.ArgumentParser("unicode_test_gen.py")
|
|
parser.add_argument("outfile", help="Output filename")
|
|
argv = parser.parse_args()
|
|
|
|
root = fmt_fbx_root(ascii=False)
|
|
with open(argv.outfile, "wb") as f:
|
|
tfbx.binary_dump_root(f, root, tfbx.BinaryFormat(7500, False), b"")
|