Fix MIL syntax for cross-generation ANE compatibility

The MIL scalar types used a shorthand syntax (string("x"), int32(1)) that only works on M4. They now use the canonical verbose format that CoreML's own compiler emits (tensor<string, []>("x"), tensor<int32, []>(1)). The generated programs also target program(1.0) with <ios16> instead of program(1.3)/<ios18>, and buildInfo is simplified to just coremlc-version. For conv-based kernels, this adds a runtime fp16 I/O fallback: the M1/M2 ANE doesn't support the cast op (fp32<->fp16), so on the first compile failure the code retries with native fp16 inputs/outputs and does the conversion on the CPU side. The fallback is persisted across exec() restarts. Note: the matmul and scaled_dot_product_attention ops still fail on M1/M2 because they are M4+ ANE ops, so the attention tests (test_ane_causal_attn, test_ane_sdpa5, and the attention part of test_full_fused) require M4 hardware. Conv-based kernels (training, QKV projections, FFN) work on all generations. Tested on M1 Pro, macOS 26.3 (Tahoe).
2026-03-02 22:00:45 +01:00 · 2026-03-02 22:00:45 +01:00 · 709b60208f
parent 893f58e725
commit 709b60208f
15 changed files with 1145 additions and 716 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,7 @@
+*.o
+ane_probe
+api_explore
+inmem_basic
+tiny_train
+tiny_train_m1
+train_large
--- a/training/ane_mil_gen.h
+++ b/training/ane_mil_gen.h
@@ -5,6 +5,9 @@
 #include <string.h>
 #include <math.h>

+// Set by caller: 1 = fp16 I/O (M1/M2 fallback, no cast ops), 0 = fp32 I/O with cast (M4+)
+extern int g_fp16_io;
+
 // Build an FP16 weight blob with the required header structure.
 // weights_f32: source weights in row-major [out_ch, in_ch]
 // Returns NSData with header + FP16 weights
@@ -30,21 +33,32 @@ static NSData *mil_build_weight_blob(const float *weights_f32, int out_ch, int i
 // Input W: [1, out_ch, in_ch] fp32
 // Output:  [1, out_ch, spatial] fp32
 static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {
+    if (g_fp16_io) {
+        return [NSString stringWithFormat:
+            @"program(1.0)\n"
+            "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
+            "{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, %d]> x, tensor<fp16, [1, %d, %d]> W) {\n"
+            "        tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
+            "        tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
+            "        tensor<fp16, [1, %d, %d]> y = matmul(transpose_x = tx, transpose_y = ty, x = W, y = x)[name = tensor<string, []>(\"mm\")];\n"
+            "    } -> (y);\n"
+            "}\n",
+            in_ch, spatial, out_ch, in_ch, out_ch, spatial];
+    }
    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
+        @"program(1.0)\n"
+        "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_x\")];\n"
-        "        tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = string(\"cast_W\")];\n"
-        "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
-        "        bool ty = const()[name = string(\"ty\"), val = bool(false)];\n"
-        "        tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = string(\"mm\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, %d]> x, tensor<fp32, [1, %d, %d]> W) {\n"
+        "        tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_x\")];\n"
+        "        tensor<fp16, [1, %d, %d]> W16 = cast(dtype = to_fp16, x = W)[name = tensor<string, []>(\"cast_W\")];\n"
+        "        tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
+        "        tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(false)];\n"
+        "        tensor<fp16, [1, %d, %d]> y16 = matmul(transpose_x = tx, transpose_y = ty, x = W16, y = x16)[name = tensor<string, []>(\"mm\")];\n"
+        "        tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
        "    } -> (y);\n"
        "}\n",
        in_ch, spatial, out_ch, in_ch,
@@ -54,26 +68,45 @@ static NSString *mil_gen_matmul(int in_ch, int out_ch, int spatial) {

 // Keep the baked-weight version for reference (used in inference-only scenarios)
 static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
+    if (g_fp16_io) {
+        return [NSString stringWithFormat:
+            @"program(1.0)\n"
+            "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
+            "{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x)[name = tensor<string, []>(\"conv\")];\n"
+            "    } -> (y);\n"
+            "}\n",
+            in_ch, spatial,
+            out_ch, in_ch, out_ch, in_ch,
+            out_ch, spatial];
+    }
    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
+        @"program(1.0)\n"
+        "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+        "        tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = string(\"conv\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = string(\"cast_out\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W, x = x16)[name = tensor<string, []>(\"conv\")];\n"
+        "        tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = to_fp32, x = y16)[name = tensor<string, []>(\"cast_out\")];\n"
        "    } -> (y);\n"
        "}\n",
        in_ch, spatial, in_ch, spatial,
@@ -88,36 +121,65 @@ static NSString *mil_gen_conv(int in_ch, int out_ch, int spatial) {
 // where cs = 64 + dim*dim*2
 static NSString *mil_gen_qkv(int dim, int spatial) {
    NSUInteger cs = 64 + (NSUInteger)dim * dim * 2;
+    if (g_fp16_io) {
+        return [NSString stringWithFormat:
+            @"program(1.0)\n"
+            "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
+            "{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x)[name = tensor<string, []>(\"conv_q\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x)[name = tensor<string, []>(\"conv_k\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x)[name = tensor<string, []>(\"conv_v\")];\n"
+            "    } -> (q, k, v);\n"
+            "}\n",
+            dim, spatial,
+            dim, dim, dim, dim,
+            dim, dim, dim, dim, (unsigned long)(64 + cs),
+            dim, dim, dim, dim, (unsigned long)(64 + 2*cs),
+            dim, spatial, dim, spatial, dim, spatial];
+    }
    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
+        @"program(1.0)\n"
+        "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+        "        tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
        "        tensor<fp16, [1, %d, 1, %d]> q16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = string(\"conv_q\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wq, x = x16)[name = tensor<string, []>(\"conv_q\")];\n"
        "        tensor<fp16, [1, %d, 1, %d]> k16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = string(\"conv_k\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wk, x = x16)[name = tensor<string, []>(\"conv_k\")];\n"
        "        tensor<fp16, [1, %d, 1, %d]> v16 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = string(\"conv_v\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = string(\"cast_q\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = string(\"cast_k\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = string(\"cast_v\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = Wv, x = x16)[name = tensor<string, []>(\"conv_v\")];\n"
+        "        tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> q = cast(dtype = to_fp32, x = q16)[name = tensor<string, []>(\"cast_q\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> k = cast(dtype = to_fp32, x = k16)[name = tensor<string, []>(\"cast_k\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> v = cast(dtype = to_fp32, x = v16)[name = tensor<string, []>(\"cast_v\")];\n"
        "    } -> (q, k, v);\n"
        "}\n",
        dim, spatial, dim, spatial,
@@ -173,31 +235,55 @@ static NSData *mil_build_ffn_up_weight_blob(const float *w1, const float *w3, in
 // Generate MIL for fused FFN up: w1 + w3 parallel convs
 static NSString *mil_gen_ffn_up(int dim, int hidden_dim, int spatial) {
    NSUInteger cs = 64 + (NSUInteger)hidden_dim * dim * 2;
+    if (g_fp16_io) {
+        return [NSString stringWithFormat:
+            @"program(1.0)\n"
+            "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
+            "{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> out1 = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x)[name = tensor<string, []>(\"conv_w1\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> out3 = conv(dilations = c_dilations, groups = c_groups, "
+            "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x)[name = tensor<string, []>(\"conv_w3\")];\n"
+            "    } -> (out1, out3);\n"
+            "}\n",
+            dim, spatial,
+            hidden_dim, dim, hidden_dim, dim,
+            hidden_dim, dim, hidden_dim, dim, (unsigned long)(64 + cs),
+            hidden_dim, spatial, hidden_dim, spatial];
+    }
    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
+        @"program(1.0)\n"
+        "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n"
        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string c_pad_type = const()[name = string(\"c_pad_type\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> c_strides = const()[name = string(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> c_pad = const()[name = string(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> c_dilations = const()[name = string(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 c_groups = const()[name = string(\"c_groups\"), val = int32(1)];\n"
-        "        string to_fp16 = const()[name = string(\"to_fp16\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = string(\"cast_in\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(%lu)))];\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> c_pad_type = const()[name = tensor<string, []>(\"c_pad_type\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> c_strides = const()[name = tensor<string, []>(\"c_strides\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> c_pad = const()[name = tensor<string, []>(\"c_pad\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> c_dilations = const()[name = tensor<string, []>(\"c_dilations\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> c_groups = const()[name = tensor<string, []>(\"c_groups\"), val = tensor<int32, []>(1)];\n"
+        "        tensor<string, []> to_fp16 = const()[name = tensor<string, []>(\"to_fp16\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = to_fp16, x = x)[name = tensor<string, []>(\"cast_in\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(%lu)))];\n"
        "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = string(\"conv_w1\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W1, x = x16)[name = tensor<string, []>(\"conv_w1\")];\n"
        "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = c_dilations, groups = c_groups, "
-        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = string(\"conv_w3\")];\n"
-        "        string to_fp32 = const()[name = string(\"to_fp32\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = string(\"cast_h1\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = string(\"cast_h3\")];\n"
+        "pad = c_pad, pad_type = c_pad_type, strides = c_strides, weight = W3, x = x16)[name = tensor<string, []>(\"conv_w3\")];\n"
+        "        tensor<string, []> to_fp32 = const()[name = tensor<string, []>(\"to_fp32\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out1 = cast(dtype = to_fp32, x = h1)[name = tensor<string, []>(\"cast_h1\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> out3 = cast(dtype = to_fp32, x = h3)[name = tensor<string, []>(\"cast_h3\")];\n"
        "    } -> (out1, out3);\n"
        "}\n",
        dim, spatial, dim, spatial,
--- a/training/stories_mil.h
+++ b/training/stories_mil.h
@@ -4,15 +4,13 @@
 #include "stories_io.h"

 #define MIL_HDR \
-    @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, " \
-    "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, " \
-    "{\"coremltools-version\", \"9.0\"}})]\n{\n"
+    @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
 #define CONV_CONST \
-    "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n" \
-    "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
-    "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
-    "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
+    "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n" \
+    "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n" \
+    "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n" \
+    "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"

 // SDPA forward + taps: x_in → rmsnorm → QKV+SDPA+Wo → concat(o_out, Q, K, V, attn_out, xnorm)
 static NSString *gen_sdpa_fwd_taps(void) {
@ -20,53 +18,53 @@ static NSString *gen_sdpa_fwd_taps(void) {
    float invd = 1.0f/(float)DIM;
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms1.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms1.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=string(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wq.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=string(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wk.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=string(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wv.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=string(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wo.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=string(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool tx = const()[name=string(\"tx\"), val=bool(false)];\n"];
-    [m appendString:@"        bool ty = const()[name=string(\"ty\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=string(\"mm2\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=string(\"ta\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=string(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=string(\"ra\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=string(\"co\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=string(\"cat\")];\n", 6*DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wq = const()[name=tensor<string, []>(\"Wq\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wq.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wk = const()[name=tensor<string, []>(\"Wk\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wk.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wv = const()[name=tensor<string, []>(\"Wv\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wv.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wo = const()[name=tensor<string, []>(\"Wo\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wo.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wq,x=xn)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wk,x=xn)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wv,x=xn)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> qsh = const()[name=tensor<string, []>(\"qsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q4 = reshape(shape=qsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=q4)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k4 = reshape(shape=qsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=k4)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v4 = reshape(shape=qsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=v4)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        tensor<bool, []> tx = const()[name=tensor<string, []>(\"tx\"), val=tensor<bool, []>(false)];\n"];
+    [m appendString:@"        tensor<bool, []> ty = const()[name=tensor<string, []>(\"ty\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=tx,transpose_y=ty,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> aw = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> a4 = matmul(transpose_x=tx,transpose_y=tx,x=aw,y=v)[name=tensor<string, []>(\"mm2\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> at = transpose(perm=pm,x=a4)[name=tensor<string, []>(\"ta\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> os = const()[name=tensor<string, []>(\"os\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> af = reshape(shape=os,x=at)[name=tensor<string, []>(\"ra\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oo = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wo,x=af)[name=tensor<string, []>(\"co\")];\n", DIM,SEQ];
+    [m appendString:@"        tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
+    [m appendString:@"        tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(oo,qf,kf,vf,af,xn))[name=tensor<string, []>(\"cat\")];\n", 6*DIM,SEQ];
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
@ -76,33 +74,33 @@ static NSString *gen_ffn_fwd_taps(void) {
    float invd = 1.0f/(float)DIM;
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=string(\"sq\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
-    [m appendFormat:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=string(\"ss\")];\n", SEQ];
-    [m appendFormat:@"        fp16 invd = const()[name=string(\"invd\"), val=fp16(%f)];\n", invd];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=string(\"ss2\")];\n", SEQ];
-    [m appendFormat:@"        fp16 eps = const()[name=string(\"eps\"), val=fp16(0.00001)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=string(\"ss3\")];\n", SEQ];
-    [m appendFormat:@"        fp16 nhalf = const()[name=string(\"nhalf\"), val=fp16(-0.5)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=string(\"rrms\")];\n", SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=string(\"xr\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=string(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/rms2.bin\"), offset=uint64(64)))];\n", DIM, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=string(\"xn\")];\n", DIM, SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sq = mul(x=x,y=x)[name=tensor<string, []>(\"sq\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([1])];\n"];
+    [m appendFormat:@"        tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss = reduce_sum(x=sq,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"ss\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> invd = const()[name=tensor<string, []>(\"invd\"), val=tensor<fp16, []>(%f)];\n", invd];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss2 = mul(x=ss,y=invd)[name=tensor<string, []>(\"ss2\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> eps = const()[name=tensor<string, []>(\"eps\"), val=tensor<fp16, []>(0.00001)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> ss3 = add(x=ss2,y=eps)[name=tensor<string, []>(\"ss3\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, []> nhalf = const()[name=tensor<string, []>(\"nhalf\"), val=tensor<fp16, []>(-0.5)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,1,1,%d]> rrms = pow(x=ss3,y=nhalf)[name=tensor<string, []>(\"rrms\")];\n", SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xr = mul(x=x,y=rrms)[name=tensor<string, []>(\"xr\")];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,1]> rw = const()[name=tensor<string, []>(\"rw\"), val=tensor<fp16, [1,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/rms2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> xn = mul(x=xr,y=rw)[name=tensor<string, []>(\"xn\")];\n", DIM, SEQ];
    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=string(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=string(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3.bin\"), offset=uint64(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=string(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2.bin\"), offset=uint64(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=string(\"c1\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=string(\"c3\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=string(\"si\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=string(\"gt\")];\n", HIDDEN,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=string(\"c2\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=string(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1 = const()[name=tensor<string, []>(\"W1\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3 = const()[name=tensor<string, []>(\"W3\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN,DIM,HIDDEN,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2 = const()[name=tensor<string, []>(\"W2\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,HIDDEN,DIM,HIDDEN];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1,x=xn)[name=tensor<string, []>(\"c1\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3,x=xn)[name=tensor<string, []>(\"c3\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> silu = mul(x=h1,y=sig)[name=tensor<string, []>(\"si\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> gate = mul(x=silu,y=h3)[name=tensor<string, []>(\"gt\")];\n", HIDDEN,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2,x=gate)[name=tensor<string, []>(\"c2\")];\n", DIM,SEQ];
+    [m appendString:@"        tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
+    [m appendString:@"        tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(y,h1,h3,gate,xn))[name=tensor<string, []>(\"cat\")];\n", 2*DIM+3*HIDDEN,SEQ];
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
@ -111,36 +109,36 @@ static NSString *gen_ffn_fwd_taps(void) {
 static NSString *gen_ffn_bwd(void) {
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", DIM+2*HIDDEN, SEQ];  // gen_ffn_bwd: returns the MIL program text for the FFN backward pass. The single fp16 input x packs DIM+2*HIDDEN channels, sliced below into dffn | h1 | h3; the output packs dx | dh1 | dh3 the same way.
    [m appendString:@CONV_CONST];
-    [m appendString:@"        tensor<int32, [4]> bd = const()[name=string(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=string(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=string(\"s0\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=string(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=string(\"s1x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=string(\"s3x\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=string(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w2t.bin\"), offset=uint64(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=string(\"cw2\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=string(\"sg\")];\n", HIDDEN, SEQ];
-    [m appendString:@"        fp16 one = const()[name=string(\"one\"), val=fp16(1.0)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=string(\"oms\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=string(\"homs\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=string(\"brk\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=string(\"dsd\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=string(\"t1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=string(\"dh1\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=string(\"slh\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=string(\"dh3\")];\n", HIDDEN, SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=string(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w1t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=string(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/w3t.bin\"), offset=uint64(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=string(\"cw1\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=string(\"cw3\")];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=string(\"adx\")];\n", DIM, SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=string(\"cat\")];\n", DIM+2*HIDDEN, SEQ];
+    [m appendString:@"        tensor<int32, [4]> bd = const()[name=tensor<string, []>(\"bd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];  // slice origin for dffn
+    [m appendFormat:@"        tensor<int32, [4]> sd = const()[name=tensor<string, []>(\"sd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];  // slice size for dffn: DIM channels
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dffn = slice_by_size(x=x,begin=bd,size=sd)[name=tensor<string, []>(\"s0\")];\n", DIM, SEQ];  // dffn = channels [0, DIM) of x; presumably the gradient arriving at the FFN output -- confirm against caller
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];  // h1 starts at channel DIM
+    [m appendFormat:@"        tensor<int32, [4]> s1 = const()[name=tensor<string, []>(\"s1\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", HIDDEN, SEQ];  // HIDDEN-channel slice size, shared by h1 and h3
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h1 = slice_by_size(x=x,begin=b1,size=s1)[name=tensor<string, []>(\"s1x\")];\n", HIDDEN, SEQ];  // h1 = channels [DIM, DIM+HIDDEN) of x
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM+HIDDEN];  // h3 starts at channel DIM+HIDDEN
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> h3 = slice_by_size(x=x,begin=b3,size=s1)[name=tensor<string, []>(\"s3x\")];\n", HIDDEN, SEQ];  // h3 = last HIDDEN channels of x
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W2t = const()[name=tensor<string, []>(\"W2t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w2t.bin\"), offset=tensor<uint64, []>(64)))];\n", HIDDEN, DIM, HIDDEN, DIM];  // [HIDDEN, DIM, 1, 1] weight read from w2t.bin at offset 64; presumably W2 transposed -- confirm against the blob writer
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsilu = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W2t,x=dffn)[name=tensor<string, []>(\"cw2\")];\n", HIDDEN, SEQ];  // dsilu = conv(W2t, dffn): maps DIM -> HIDDEN channels
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> sig = sigmoid(x=h1)[name=tensor<string, []>(\"sg\")];\n", HIDDEN, SEQ];  // sig = sigmoid(h1)
+    [m appendString:@"        tensor<fp16, []> one = const()[name=tensor<string, []>(\"one\"), val=tensor<fp16, []>(1.0)];\n"];  // scalar 1.0 used by the derivative terms below
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> oms = sub(x=one,y=sig)[name=tensor<string, []>(\"oms\")];\n", HIDDEN, SEQ];  // oms = 1 - sig
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> homs = mul(x=h1,y=oms)[name=tensor<string, []>(\"homs\")];\n", HIDDEN, SEQ];  // homs = h1 * (1 - sig)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> brk = add(x=one,y=homs)[name=tensor<string, []>(\"brk\")];\n", HIDDEN, SEQ];  // brk = 1 + h1 * (1 - sig)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dsd = mul(x=sig,y=brk)[name=tensor<string, []>(\"dsd\")];\n", HIDDEN, SEQ];  // dsd = sig * (1 + h1*(1 - sig)), i.e. the derivative of h1*sigmoid(h1) with respect to h1
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> t1 = mul(x=dsilu,y=h3)[name=tensor<string, []>(\"t1\")];\n", HIDDEN, SEQ];  // t1 = dsilu * h3
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh1 = mul(x=t1,y=dsd)[name=tensor<string, []>(\"dh1\")];\n", HIDDEN, SEQ];  // dh1 = dsilu * h3 * dsd
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> slh = mul(x=h1,y=sig)[name=tensor<string, []>(\"slh\")];\n", HIDDEN, SEQ];  // slh = h1 * sigmoid(h1)
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dh3 = mul(x=dsilu,y=slh)[name=tensor<string, []>(\"dh3\")];\n", HIDDEN, SEQ];  // dh3 = dsilu * slh
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W1t = const()[name=tensor<string, []>(\"W1t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];  // [DIM, HIDDEN, 1, 1] weight read from w1t.bin at offset 64
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> W3t = const()[name=tensor<string, []>(\"W3t\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM, HIDDEN, DIM, HIDDEN];  // [DIM, HIDDEN, 1, 1] weight read from w3t.bin at offset 64
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx1 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W1t,x=dh1)[name=tensor<string, []>(\"cw1\")];\n", DIM, SEQ];  // dx1 = conv(W1t, dh1): HIDDEN -> DIM channels
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx3 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W3t,x=dh3)[name=tensor<string, []>(\"cw3\")];\n", DIM, SEQ];  // dx3 = conv(W3t, dh3): HIDDEN -> DIM channels
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx = add(x=dx1,y=dx3)[name=tensor<string, []>(\"adx\")];\n", DIM, SEQ];  // dx = dx1 + dx3
+    [m appendString:@"        tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];  // concat axis 1, the same axis the input was sliced along
+    [m appendString:@"        tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];  // interleave flag for concat, set to false
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dx,dh1,dh3))[name=tensor<string, []>(\"cat\")];\n", DIM+2*HIDDEN, SEQ];  // out = dx | dh1 | dh3 packed along axis 1 (DIM+2*HIDDEN channels); callers must split it in this order
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
@ -149,23 +147,23 @@ static NSString *gen_ffn_bwd(void) {
 static NSString *gen_qkvb(void) {  // Builds MIL text: split x into three DIM-channel slices, conv each with Wqt/Wkt/Wvt, sum the three results into `out`.
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 3*DIM, SEQ];  // single fp16 input, 3*DIM channels by SEQ
    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=string(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wqt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=string(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wkt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=string(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wvt.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=string(\"cq\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=string(\"ck\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=string(\"cv\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=string(\"aqk\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=string(\"out\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];  // slice size shared by all three slices
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dq = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];  // channels [0, DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dk = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];  // channels [DIM, 2*DIM)
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dv = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];  // channels [2*DIM, 3*DIM)
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wqt = const()[name=tensor<string, []>(\"Wqt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wqt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];  // the three [DIM,DIM,1,1] weights are loaded from blob files at offset 64
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wkt = const()[name=tensor<string, []>(\"Wkt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wkt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wvt = const()[name=tensor<string, []>(\"Wvt\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wvt.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxq = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wqt,x=dq)[name=tensor<string, []>(\"cq\")];\n", DIM,SEQ];  // NOTE(review): dl/gr/pd/pt/st are not declared in this function; presumably emitted by CONV_CONST — confirm
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxk = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wkt,x=dk)[name=tensor<string, []>(\"ck\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxv = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wvt,x=dv)[name=tensor<string, []>(\"cv\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dxqk = add(x=dxq,y=dxk)[name=tensor<string, []>(\"aqk\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = add(x=dxqk,y=dxv)[name=tensor<string, []>(\"out\")];\n", DIM,SEQ];  // out = dxq + dxk + dxv
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
@ -175,49 +173,49 @@ static NSString *gen_sdpa_bwd1(void) {
    float sc = 1.0f/sqrtf((float)HD);
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", 4*DIM, SEQ];
    [m appendString:@CONV_CONST];
-    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=string(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=string(\"s0\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=string(\"s1\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=string(\"s3\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=string(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/wot.bin\"), offset=uint64(64)))];\n", DIM,DIM,DIM,DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=string(\"cwo\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=string(\"rv\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=string(\"tv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=string(\"rd\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=string(\"td\")];\n", HEADS,SEQ,HD];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=string(\"mm1\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=string(\"scl\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=string(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=string(\"@model_path/weights/mask.bin\"), offset=uint64(64)))];\n", SEQ,SEQ,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=string(\"msk\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        int32 sax = const()[name=string(\"sax\"), val=int32(-1)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=string(\"sm\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=string(\"dv\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=string(\"dp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=string(\"dvt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=string(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=string(\"dvf\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=string(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=string(\"pf\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=string(\"dpf\")];\n", SCORE_CH,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=string(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz = const()[name=tensor<string, []>(\"sz\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b0,size=sz)[name=tensor<string, []>(\"s0\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b1,size=sz)[name=tensor<string, []>(\"s1\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> vf = slice_by_size(x=x,begin=b2,size=sz)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 3*DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dx2f = slice_by_size(x=x,begin=b3,size=sz)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [%d,%d,1,1]> Wot = const()[name=tensor<string, []>(\"Wot\"), val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/wot.bin\"), offset=tensor<uint64, []>(64)))];\n", DIM,DIM,DIM,DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> df = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=Wot,x=dx2f)[name=tensor<string, []>(\"cwo\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> vr = reshape(shape=rsh,x=vf)[name=tensor<string, []>(\"rv\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> v = transpose(perm=pm,x=vr)[name=tensor<string, []>(\"tv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dr = reshape(shape=rsh,x=df)[name=tensor<string, []>(\"rd\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> da = transpose(perm=pm,x=dr)[name=tensor<string, []>(\"td\")];\n", HEADS,SEQ,HD];
+    [m appendString:@"        tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
+    [m appendString:@"        tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc1 = matmul(transpose_x=bF,transpose_y=bT,x=q,y=k)[name=tensor<string, []>(\"mm1\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> sc2 = mul(x=sc1,y=scv)[name=tensor<string, []>(\"scl\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,1,%d,%d]> cm = const()[name=tensor<string, []>(\"cm\"), val=tensor<fp16, [1,1,%d,%d]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/mask.bin\"), offset=tensor<uint64, []>(64)))];\n", SEQ,SEQ,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ms = add(x=sc2,y=cm)[name=tensor<string, []>(\"msk\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, []> sax = const()[name=tensor<string, []>(\"sax\"), val=tensor<int32, []>(-1)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = softmax(axis=sax,x=ms)[name=tensor<string, []>(\"sm\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dv4 = matmul(transpose_x=bT,transpose_y=bF,x=probs,y=da)[name=tensor<string, []>(\"dv\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp4 = matmul(transpose_x=bF,transpose_y=bT,x=da,y=v)[name=tensor<string, []>(\"dp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dvt = transpose(perm=pm,x=dv4)[name=tensor<string, []>(\"dvt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> dvs = const()[name=tensor<string, []>(\"dvs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dvf = reshape(shape=dvs,x=dvt)[name=tensor<string, []>(\"dvf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> scs = const()[name=tensor<string, []>(\"scs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = reshape(shape=scs,x=probs)[name=tensor<string, []>(\"pf\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = reshape(shape=scs,x=dp4)[name=tensor<string, []>(\"dpf\")];\n", SCORE_CH,SEQ];
+    [m appendString:@"        tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
+    [m appendString:@"        tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dvf,pf,dpf))[name=tensor<string, []>(\"cat\")];\n", DIM+2*SCORE_CH,SEQ];
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
@ -228,46 +226,46 @@ static NSString *gen_sdpa_bwd2(void) {
    int bwd2_in = 2*SCORE_CH + 2*DIM;
    NSMutableString *m = [NSMutableString string];
    [m appendString:MIL_HDR];
-    [m appendFormat:@"    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=string(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
-    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=string(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=string(\"s0\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=string(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=string(\"s1\")];\n", SCORE_CH,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=string(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=string(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=string(\"s2\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=string(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=string(\"s3\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=string(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=string(\"rp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=string(\"rdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=string(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
-    [m appendString:@"        tensor<int32, [4]> pm = const()[name=string(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=string(\"rq\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=string(\"tq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=string(\"rk\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=string(\"tk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=string(\"pdp\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        tensor<int32, [1]> rax = const()[name=string(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
-    [m appendString:@"        bool kd = const()[name=string(\"kd\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=string(\"rs\")];\n", HEADS,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=string(\"dps\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=string(\"ds0\")];\n", HEADS,SEQ,SEQ];
-    [m appendFormat:@"        fp16 scv = const()[name=string(\"scv\"), val=fp16(%f)];\n", sc];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=string(\"ds\")];\n", HEADS,SEQ,SEQ];
-    [m appendString:@"        bool bF = const()[name=string(\"bF\"), val=bool(false)];\n"];
-    [m appendString:@"        bool bT = const()[name=string(\"bT\"), val=bool(true)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=string(\"dq\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=string(\"dk\")];\n", HEADS,SEQ,HD];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=string(\"dqt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=string(\"dkt\")];\n", HEADS,HD,SEQ];
-    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=string(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=string(\"dqf\")];\n", DIM,SEQ];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=string(\"dkf\")];\n", DIM,SEQ];
-    [m appendString:@"        int32 cax = const()[name=string(\"cax\"), val=int32(1)];\n"];
-    [m appendString:@"        bool cid = const()[name=string(\"cid\"), val=bool(false)];\n"];
-    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=string(\"cat\")];\n", 2*DIM,SEQ];
+    [m appendFormat:@"    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n", bwd2_in, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz_sc = const()[name=tensor<string, []>(\"szsc\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", SCORE_CH, SEQ];
+    [m appendString:@"        tensor<int32, [4]> b0 = const()[name=tensor<string, []>(\"b0\"), val=tensor<int32, [4]>([0,0,0,0])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> pf = slice_by_size(x=x,begin=b0,size=sz_sc)[name=tensor<string, []>(\"s0\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b1 = const()[name=tensor<string, []>(\"b1\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dpf = slice_by_size(x=x,begin=b1,size=sz_sc)[name=tensor<string, []>(\"s1\")];\n", SCORE_CH,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> sz_d = const()[name=tensor<string, []>(\"szd\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM, SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b2 = const()[name=tensor<string, []>(\"b2\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> qf = slice_by_size(x=x,begin=b2,size=sz_d)[name=tensor<string, []>(\"s2\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> b3 = const()[name=tensor<string, []>(\"b3\"), val=tensor<int32, [4]>([0,%d,0,0])];\n", 2*SCORE_CH+DIM];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> kf = slice_by_size(x=x,begin=b3,size=sz_d)[name=tensor<string, []>(\"s3\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> ssh = const()[name=tensor<string, []>(\"ssh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> probs = reshape(shape=ssh,x=pf)[name=tensor<string, []>(\"rp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dp = reshape(shape=ssh,x=dpf)[name=tensor<string, []>(\"rdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> rsh = const()[name=tensor<string, []>(\"rsh\"), val=tensor<int32, [4]>([1,%d,%d,%d])];\n", HEADS,HD,SEQ];
+    [m appendString:@"        tensor<int32, [4]> pm = const()[name=tensor<string, []>(\"pm\"), val=tensor<int32, [4]>([0,1,3,2])];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> qr = reshape(shape=rsh,x=qf)[name=tensor<string, []>(\"rq\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> q = transpose(perm=pm,x=qr)[name=tensor<string, []>(\"tq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> kr = reshape(shape=rsh,x=kf)[name=tensor<string, []>(\"rk\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> k = transpose(perm=pm,x=kr)[name=tensor<string, []>(\"tk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> pdp = mul(x=probs,y=dp)[name=tensor<string, []>(\"pdp\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<int32, [1]> rax = const()[name=tensor<string, []>(\"rax\"), val=tensor<int32, [1]>([-1])];\n"];
+    [m appendString:@"        tensor<bool, []> kd = const()[name=tensor<string, []>(\"kd\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,1]> spdp = reduce_sum(x=pdp,axes=rax,keep_dims=kd)[name=tensor<string, []>(\"rs\")];\n", HEADS,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dps = sub(x=dp,y=spdp)[name=tensor<string, []>(\"dps\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds0 = mul(x=probs,y=dps)[name=tensor<string, []>(\"ds0\")];\n", HEADS,SEQ,SEQ];
+    [m appendFormat:@"        tensor<fp16, []> scv = const()[name=tensor<string, []>(\"scv\"), val=tensor<fp16, []>(%f)];\n", sc];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> ds = mul(x=ds0,y=scv)[name=tensor<string, []>(\"ds\")];\n", HEADS,SEQ,SEQ];
+    [m appendString:@"        tensor<bool, []> bF = const()[name=tensor<string, []>(\"bF\"), val=tensor<bool, []>(false)];\n"];
+    [m appendString:@"        tensor<bool, []> bT = const()[name=tensor<string, []>(\"bT\"), val=tensor<bool, []>(true)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dq4 = matmul(transpose_x=bF,transpose_y=bF,x=ds,y=k)[name=tensor<string, []>(\"dq\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dk4 = matmul(transpose_x=bT,transpose_y=bF,x=ds,y=q)[name=tensor<string, []>(\"dk\")];\n", HEADS,SEQ,HD];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dqt = transpose(perm=pm,x=dq4)[name=tensor<string, []>(\"dqt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,%d,%d]> dkt = transpose(perm=pm,x=dk4)[name=tensor<string, []>(\"dkt\")];\n", HEADS,HD,SEQ];
+    [m appendFormat:@"        tensor<int32, [4]> fs = const()[name=tensor<string, []>(\"fs\"), val=tensor<int32, [4]>([1,%d,1,%d])];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dqf = reshape(shape=fs,x=dqt)[name=tensor<string, []>(\"dqf\")];\n", DIM,SEQ];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> dkf = reshape(shape=fs,x=dkt)[name=tensor<string, []>(\"dkf\")];\n", DIM,SEQ];
+    [m appendString:@"        tensor<int32, []> cax = const()[name=tensor<string, []>(\"cax\"), val=tensor<int32, []>(1)];\n"];
+    [m appendString:@"        tensor<bool, []> cid = const()[name=tensor<string, []>(\"cid\"), val=tensor<bool, []>(false)];\n"];
+    [m appendFormat:@"        tensor<fp16, [1,%d,1,%d]> out = concat(axis=cax,interleave=cid,values=(dqf,dkf))[name=tensor<string, []>(\"cat\")];\n", 2*DIM,SEQ];
    [m appendString:@"    } -> (out);\n}\n"];
    return m;
 }
--- a/training/test_ane_advanced.m
+++ b/training/test_ane_advanced.m
@ -50,6 +50,8 @@ static IOSurfaceRef make_surface(size_t bytes) {
        (id)kIOSurfaceAllocSize:@(bytes), (id)kIOSurfacePixelFormat:@0});
 }

+static int g_fp16_io = 0;  // 0 = fp32 I/O with cast ops (initial); main() sets it to 1 after a failed compile to retry with fp16 I/O and no cast (M1/M2 fallback)
+
 int main() {
    @autoreleasepool {
        setbuf(stdout, NULL);
@ -106,28 +108,43 @@ int main() {
        memcpy(blob+128, w, ws);
        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];

-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+        NSFileManager *fm = [NSFileManager defaultManager];
+
+        retry_compile:;
+        NSString *mil;
+        if (g_fp16_io) {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
+        } else {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
+                "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "        tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
+                "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+        }

        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
@ -135,23 +152,33 @@ int main() {
        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        NSFileManager *fm = [NSFileManager defaultManager];
        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"]
            withIntermediateDirectories:YES attributes:nil error:nil];
        [md writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

        NSError *e = nil;
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!compiled && !g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            [fm removeItemAtPath:td error:nil];
+            goto retry_compile;
+        }
        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);

-        int ioBytes = CH * SP * 4;
+        int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
        IOSurfaceRef ioIn = make_surface(ioBytes);
        IOSurfaceRef ioOut = make_surface(ioBytes);

        IOSurfaceLock(ioIn, 0, NULL);
-        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
+        if (g_fp16_io) {
+            _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (_Float16)((float)(s+1) * 0.1f);
+        } else {
+            float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++) for (int s = 0; s < SP; s++) inp[c*SP+s] = (float)(s+1) * 0.1f;
+        }
        IOSurfaceUnlock(ioIn, 0, NULL);

        // Baseline eval
@ -165,9 +192,16 @@ int main() {
        printf("  Baseline eval (weightsBuffer=nil, procIdx=0): %s\n", ok ? "OK" : "FAIL");

        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
-        float baseline_0 = out0[0], baseline_1 = out0[1];
-        printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
+        float baseline_0, baseline_1;
+        if (g_fp16_io) {
+            _Float16 *out0 = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+            baseline_0 = (float)out0[0]; baseline_1 = (float)out0[1];
+            printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)out0[0], (float)out0[1], (float)out0[2], (float)out0[3]);
+        } else {
+            float *out0 = (float*)IOSurfaceGetBaseAddress(ioOut);
+            baseline_0 = out0[0]; baseline_1 = out0[1];
+            printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", out0[0], out0[1], out0[2], out0[3]);
+        }
        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);

        // Test weightsBuffer: IOSurface with 3x identity weights
@ -194,10 +228,18 @@ int main() {
            printf("  Eval with weightsBuffer: %s\n", ok ? "OK" : e ? [[e description] UTF8String] : "FAIL");
            if (ok) {
                IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-                float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
-                printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
-                bool changed = fabsf(outW[0] - baseline_0) > 0.001f;
-                bool is_3x = fabsf(outW[0] - baseline_0 * 3.0f) < 0.1f;
+                float outW_0;
+                if (g_fp16_io) {
+                    _Float16 *outW = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+                    outW_0 = (float)outW[0];
+                    printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", (float)outW[0], (float)outW[1], (float)outW[2], (float)outW[3]);
+                } else {
+                    float *outW = (float*)IOSurfaceGetBaseAddress(ioOut);
+                    outW_0 = outW[0];
+                    printf("  Output[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outW[0], outW[1], outW[2], outW[3]);
+                }
+                bool changed = fabsf(outW_0 - baseline_0) > 0.001f;
+                bool is_3x = fabsf(outW_0 - baseline_0 * 3.0f) < 0.1f;
                printf("  weightsBuffer: output %s", changed ? "CHANGED" : "unchanged");
                if (changed) printf(" (%s)", is_3x ? "matches 3x — WORKS!" : "but not 3x as expected");
                printf("\n");
--- a/training/test_ane_causal_attn.m
+++ b/training/test_ane_causal_attn.m
@ -81,13 +81,11 @@ int main() {
        // === Approach 1: Non-causal SDPA (baseline) ===
        printf("=== Non-causal SDPA (baseline) ===\n");
        NSString *sdpa_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
            "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
            "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-            "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+            "query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
            "    } -> (att);\n}\n",
            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
        Kern kSDPA = compile_mil(sdpa_mil);
@ -100,13 +98,11 @@ int main() {
        // scores = Q @ K^T → [1, HEADS, SEQ, SEQ]
        printf("\n=== Decomposed causal attention ===\n");
        NSString *qkt_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
            "tensor<fp16, [1, %d, %d, %d]> k) {\n"
            "        tensor<fp16, [1, %d, %d, %d]> scores = matmul("
-            "x = q, y = k, transpose_y = true)[name = string(\"qkt\")];\n"
+            "x = q, y = k, transpose_y = true)[name = tensor<string, []>(\"qkt\")];\n"
            "    } -> (scores);\n}\n",
            HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, SEQ];
        Kern kQKT = compile_mil(qkt_mil);
@ -114,13 +110,11 @@ int main() {

        // Step 3: scores_softmax @ V → output [1, HEADS, SEQ, HD]
        NSString *sv_mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> s, "
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> s, "
            "tensor<fp16, [1, %d, %d, %d]> v) {\n"
            "        tensor<fp16, [1, %d, %d, %d]> out = matmul("
-            "x = s, y = v)[name = string(\"sv\")];\n"
+            "x = s, y = v)[name = tensor<string, []>(\"sv\")];\n"
            "    } -> (out);\n}\n",
            HEADS, SEQ, SEQ, HEADS, SEQ, HD, HEADS, SEQ, HD];
        Kern kSV = compile_mil(sv_mil);
--- a/training/test_ane_sdpa5.m
+++ b/training/test_ane_sdpa5.m
@ -187,13 +187,11 @@ int main() {
        printf("Test 1: no mask\n");
        {
            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v)[name = string(\"sdpa\")];\n"
+                "query = q, key = k, value = v)[name = tensor<string, []>(\"sdpa\")];\n"
                "    } -> (att);\n}\n",
                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD];
            Model m = compile_model(mil, nil);
@ -209,14 +207,12 @@ int main() {
        {
            NSString *maskStr = build_inline_causal_mask(SEQ);
            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-                "        %@ mask = const()[name = string(\"mask\"), val = %@];\n"
+                "        %@ mask = const()[name = tensor<string, []>(\"mask\"), val = %@];\n"
                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
                "    } -> (att);\n}\n",
                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
                [NSString stringWithFormat:@"tensor<fp16, [1, 1, %d, %d]>", SEQ, SEQ], maskStr,
@ -233,15 +229,13 @@ int main() {
        printf("\nTest 3: BLOBFILE causal mask\n");
        {
            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v) {\n"
-                "        tensor<fp16, [1, 1, %d, %d]> mask = const()[name = string(\"mask\"), "
-                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [1, 1, %d, %d]> mask = const()[name = tensor<string, []>(\"mask\"), "
+                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
                "    } -> (att);\n}\n",
                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
                SEQ, SEQ, SEQ, SEQ, HEADS, SEQ, HD];
@ -258,14 +252,12 @@ int main() {
        printf("\nTest 4: mask as runtime input\n");
        {
            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, %d, %d]> q, "
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, %d, %d]> q, "
                "tensor<fp16, [1, %d, %d, %d]> k, tensor<fp16, [1, %d, %d, %d]> v, "
                "tensor<fp16, [1, 1, %d, %d]> mask) {\n"
                "        tensor<fp16, [1, %d, %d, %d]> att = scaled_dot_product_attention("
-                "query = q, key = k, value = v, attn_mask = mask)[name = string(\"sdpa\")];\n"
+                "query = q, key = k, value = v, attn_mask = mask)[name = tensor<string, []>(\"sdpa\")];\n"
                "    } -> (att);\n}\n",
                HEADS, SEQ, HD, HEADS, SEQ, HD, HEADS, SEQ, HD,
                SEQ, SEQ, HEADS, SEQ, HD];
--- a/training/test_conv_attn3.m
+++ b/training/test_conv_attn3.m
@ -82,19 +82,17 @@ static void cleanup_kern(Kern *k) {

 static NSString *gen_conv_mil(int ic, int oc, int icg, int groups, int sp) {
    return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(%d)];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(%d)];\n"
        "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x)[name = string(\"cv\")];\n"
+        "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
        "    } -> (y);\n}\n", ic, sp, oc, icg, oc, icg, groups, oc, sp];
 }

--- a/training/test_full_fused.m
+++ b/training/test_full_fused.m
@ -130,64 +130,62 @@ int main() {
            float scale_val = 1.0f / sqrtf((float)HD);

            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
                // Conv boilerplate
-                "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-                "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-                "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-                "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-                "        int32 gr1 = const()[name = string(\"g1\"), val = int32(1)];\n"
+                "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, []> gr1 = const()[name = tensor<string, []>(\"g1\"), val = tensor<int32, []>(1)];\n"
                // QKV weights
-                "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = string(\"Wo\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wo.bin\"), offset = uint64(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> Wout = const()[name = tensor<string, []>(\"Wo\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wo.bin\"), offset = tensor<uint64, []>(64)))];\n"
                // QKV projections
                "        tensor<fp16, [1, %d, 1, %d]> q_flat = conv(dilations = dl, groups = gr1, pad = pd, "
-                "pad_type = pt, strides = st, weight = Wq, x = x)[name = string(\"cq\")];\n"
+                "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
                "        tensor<fp16, [1, %d, 1, %d]> k_flat = conv(dilations = dl, groups = gr1, pad = pd, "
-                "pad_type = pt, strides = st, weight = Wk, x = x)[name = string(\"ck\")];\n"
+                "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
                "        tensor<fp16, [1, %d, 1, %d]> v_flat = conv(dilations = dl, groups = gr1, pad = pd, "
-                "pad_type = pt, strides = st, weight = Wv, x = x)[name = string(\"cv\")];\n"
+                "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
                // Reshape: [1, DIM, 1, SEQ] → [1, HEADS, HD, SEQ] → transpose → [1, HEADS, SEQ, HD]
-                "        tensor<int32, [4]> qsh = const()[name = string(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = string(\"rq\")];\n"
-                "        tensor<int32, [4]> perm = const()[name = string(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = string(\"tq\")];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = string(\"rk\")];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = string(\"tk\")];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = string(\"rv\")];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = string(\"tv\")];\n"
+                "        tensor<int32, [4]> qsh = const()[name = tensor<string, []>(\"qsh\"), val = tensor<int32, [4]>([1, %d, %d, %d])];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> q_4d = reshape(shape = qsh, x = q_flat)[name = tensor<string, []>(\"rq\")];\n"
+                "        tensor<int32, [4]> perm = const()[name = tensor<string, []>(\"pm\"), val = tensor<int32, [4]>([0, 1, 3, 2])];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> q = transpose(perm = perm, x = q_4d)[name = tensor<string, []>(\"tq\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> k_4d = reshape(shape = qsh, x = k_flat)[name = tensor<string, []>(\"rk\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> k = transpose(perm = perm, x = k_4d)[name = tensor<string, []>(\"tk\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> v_4d = reshape(shape = qsh, x = v_flat)[name = tensor<string, []>(\"rv\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> v = transpose(perm = perm, x = v_4d)[name = tensor<string, []>(\"tv\")];\n"
                // Q @ K^T
-                "        bool ty = const()[name = string(\"ty\"), val = bool(true)];\n"
-                "        bool tx = const()[name = string(\"tx\"), val = bool(false)];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = string(\"mm1\")];\n"
+                "        tensor<bool, []> ty = const()[name = tensor<string, []>(\"ty\"), val = tensor<bool, []>(true)];\n"
+                "        tensor<bool, []> tx = const()[name = tensor<string, []>(\"tx\"), val = tensor<bool, []>(false)];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> scores = matmul(transpose_x = tx, transpose_y = ty, x = q, y = k)[name = tensor<string, []>(\"mm1\")];\n"
                // Scale
-                "        fp16 sc = const()[name = string(\"sc\"), val = fp16(%f)];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = string(\"scl\")];\n"
+                "        tensor<fp16, []> sc = const()[name = tensor<string, []>(\"sc\"), val = tensor<fp16, []>(%f)];\n"  // value in verbose tensor<T, []> form like every other scalar const; shorthand fp16(...) is the M4-only syntax
+                "        tensor<fp16, [1, %d, %d, %d]> scaled = mul(x = scores, y = sc)[name = tensor<string, []>(\"scl\")];\n"
                // Causal mask
-                "        tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = string(\"cm\"), "
-                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = string(\"@model_path/weights/mask.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = string(\"msk\")];\n"
+                "        tensor<fp16, [1, 1, %d, %d]> cmask = const()[name = tensor<string, []>(\"cm\"), "
+                "val = tensor<fp16, [1, 1, %d, %d]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/mask.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> masked = add(x = scaled, y = cmask)[name = tensor<string, []>(\"msk\")];\n"
                // Softmax
-                "        int32 sax = const()[name = string(\"sax\"), val = int32(-1)];\n"
-                "        tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = string(\"sm\")];\n"
+                "        tensor<int32, []> sax = const()[name = tensor<string, []>(\"sax\"), val = tensor<int32, []>(-1)];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> attn_w = softmax(axis = sax, x = masked)[name = tensor<string, []>(\"sm\")];\n"
                // scores @ V
-                "        tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = string(\"mm2\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> attn_4d = matmul(transpose_x = tx, transpose_y = tx, x = attn_w, y = v)[name = tensor<string, []>(\"mm2\")];\n"
                // Reshape back: [1, HEADS, SEQ, HD] → transpose → [1, HEADS, HD, SEQ] → reshape → [1, DIM, 1, SEQ]
-                "        tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = string(\"ta\")];\n"
-                "        tensor<int32, [4]> osh = const()[name = string(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
-                "        tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = string(\"ra\")];\n"
+                "        tensor<fp16, [1, %d, %d, %d]> attn_t = transpose(perm = perm, x = attn_4d)[name = tensor<string, []>(\"ta\")];\n"
+                "        tensor<int32, [4]> osh = const()[name = tensor<string, []>(\"osh\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> attn_flat = reshape(shape = osh, x = attn_t)[name = tensor<string, []>(\"ra\")];\n"
                // Wo projection
                "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr1, pad = pd, "
-                "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = string(\"co\")];\n"
+                "pad_type = pt, strides = st, weight = Wout, x = attn_flat)[name = tensor<string, []>(\"co\")];\n"
                "    } -> (out);\n}\n",
                DIM, SEQ,                              // input
                DIM,DIM,DIM,DIM, DIM,DIM,DIM,DIM,      // Wq, Wk
@ -317,30 +315,28 @@ int main() {
        printf("\n=== Test 2: Fused FFN benchmark ===\n");
        {
            NSString *mil = [NSString stringWithFormat:
-                @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-                "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-                "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-                "    func main<ios18>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
-                "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-                "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-                "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-                "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-                "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = string(\"W1\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = string(\"W3\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3.bin\"), offset = uint64(64)))];\n"
-                "        tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = string(\"W2\"), "
-                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w2.bin\"), offset = uint64(64)))];\n"
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W1 = const()[name = tensor<string, []>(\"W1\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W3 = const()[name = tensor<string, []>(\"W3\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W2 = const()[name = tensor<string, []>(\"W2\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w2.bin\"), offset = tensor<uint64, []>(64)))];\n"
                "        tensor<fp16, [1, %d, 1, %d]> h1 = conv(dilations = dl, groups = gr, pad = pd, "
-                "pad_type = pt, strides = st, weight = W1, x = x)[name = string(\"c1\")];\n"
+                "pad_type = pt, strides = st, weight = W1, x = x)[name = tensor<string, []>(\"c1\")];\n"
                "        tensor<fp16, [1, %d, 1, %d]> h3 = conv(dilations = dl, groups = gr, pad = pd, "
-                "pad_type = pt, strides = st, weight = W3, x = x)[name = string(\"c3\")];\n"
-                "        tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = string(\"sg\")];\n"
-                "        tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = string(\"si\")];\n"
-                "        tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = string(\"gt\")];\n"
+                "pad_type = pt, strides = st, weight = W3, x = x)[name = tensor<string, []>(\"c3\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> sig = sigmoid(x = h1)[name = tensor<string, []>(\"sg\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> silu = mul(x = h1, y = sig)[name = tensor<string, []>(\"si\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> gate = mul(x = silu, y = h3)[name = tensor<string, []>(\"gt\")];\n"
                "        tensor<fp16, [1, %d, 1, %d]> out = conv(dilations = dl, groups = gr, pad = pd, "
-                "pad_type = pt, strides = st, weight = W2, x = gate)[name = string(\"c2\")];\n"
+                "pad_type = pt, strides = st, weight = W2, x = gate)[name = tensor<string, []>(\"c2\")];\n"
                "    } -> (out);\n}\n",
                DIM, SEQ,
                HIDDEN,DIM,HIDDEN,DIM, HIDDEN,DIM,HIDDEN,DIM, DIM,HIDDEN,DIM,HIDDEN,
--- a/training/test_fused_bwd.m
+++ b/training/test_fused_bwd.m
@ -15,6 +15,8 @@
 #define HIDDEN 2048
 #define SEQ 64

+static int g_fp16_io = 0;  // 0 = fp32 I/O with cast ops; set to 1 when that compile fails (M1/M2: cast op unsupported) to use fp16 I/O directly
+
 static Class g_D, g_I, g_AR, g_AIO;
 static void ane_init(void) {
    dlopen("/System/Library/PrivateFrameworks/AppleNeuralEngine.framework/AppleNeuralEngine", RTLD_NOW);
@ -58,47 +60,77 @@ int main() {
        // MIL: slice input → 2 convs → add
        printf("=== Fused W1b+W3b backward (slice+conv+add) ===\n");

-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"  // [1, HIDDEN*2, 1, SEQ]
-            "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-            // Slice: dh1 = x16[:, 0:HIDDEN, :, :], dh3 = x16[:, HIDDEN:2*HIDDEN, :, :]
-            "        tensor<int32, [4]> b1 = const()[name = string(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-            "        tensor<int32, [4]> s1 = const()[name = string(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = string(\"sl1\")];\n"
-            "        tensor<int32, [4]> b3 = const()[name = string(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
-            "        tensor<int32, [4]> s3 = const()[name = string(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = string(\"sl3\")];\n"
-            // Conv: W1^T @ dh1, W3^T @ dh3
-            "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-            "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-            "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-            "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-            // W1^T: [DIM, HIDDEN, 1, 1]  (transposed from [HIDDEN, DIM])
-            "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = string(\"W1t\"), "
-            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w1t.bin\"), offset = uint64(64)))];\n"
-            "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = string(\"W3t\"), "
-            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w3t.bin\"), offset = uint64(64)))];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
-            "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = string(\"cv1\")];\n"
-            "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
-            "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = string(\"cv3\")];\n"
-            // Add
-            "        tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = string(\"ad\")];\n"
-            "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-            "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = string(\"co\")];\n"
-            "    } -> (y);\n}\n",
-            HIDDEN*2, SEQ, HIDDEN*2, SEQ,
-            HIDDEN, SEQ, HIDDEN, SEQ,  // slice1
-            HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,  // slice3
-            DIM, HIDDEN, DIM, HIDDEN,   // W1t
-            DIM, HIDDEN, DIM, HIDDEN,   // W3t
-            DIM, SEQ, DIM, SEQ,         // dx1, dx3
-            DIM, SEQ, DIM, SEQ];        // sum, y
+        retry_compile:;
+        NSString *mil;
+        if (g_fp16_io) {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
+                "        tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
+                "        tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
+                "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> y = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
+                "    } -> (y);\n}\n",
+                HIDDEN*2, SEQ,
+                HIDDEN, SEQ, HIDDEN, SEQ,
+                HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
+                DIM, HIDDEN, DIM, HIDDEN,
+                DIM, HIDDEN, DIM, HIDDEN,
+                DIM, SEQ, DIM, SEQ,
+                DIM, SEQ];
+        } else {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
+                "        tensor<int32, [4]> b1 = const()[name = tensor<string, []>(\"b1\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [4]> s1 = const()[name = tensor<string, []>(\"s1\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dh1 = slice_by_size(x = x16, begin = b1, size = s1)[name = tensor<string, []>(\"sl1\")];\n"
+                "        tensor<int32, [4]> b3 = const()[name = tensor<string, []>(\"b3\"), val = tensor<int32, [4]>([0, %d, 0, 0])];\n"
+                "        tensor<int32, [4]> s3 = const()[name = tensor<string, []>(\"s3\"), val = tensor<int32, [4]>([1, %d, 1, %d])];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dh3 = slice_by_size(x = x16, begin = b3, size = s3)[name = tensor<string, []>(\"sl3\")];\n"
+                "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+                "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+                "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W1t = const()[name = tensor<string, []>(\"W1t\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w1t.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [%d, %d, 1, 1]> W3t = const()[name = tensor<string, []>(\"W3t\"), "
+                "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w3t.bin\"), offset = tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dx1 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W1t, x = dh1)[name = tensor<string, []>(\"cv1\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> dx3 = conv(dilations = dl, groups = gr, pad = pd, "
+                "pad_type = pt, strides = st, weight = W3t, x = dh3)[name = tensor<string, []>(\"cv3\")];\n"
+                "        tensor<fp16, [1, %d, 1, %d]> sum = add(x = dx1, y = dx3)[name = tensor<string, []>(\"ad\")];\n"
+                "        tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
+                "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = sum)[name = tensor<string, []>(\"co\")];\n"
+                "    } -> (y);\n}\n",
+                HIDDEN*2, SEQ, HIDDEN*2, SEQ,
+                HIDDEN, SEQ, HIDDEN, SEQ,
+                HIDDEN, HIDDEN, SEQ, HIDDEN, SEQ,
+                DIM, HIDDEN, DIM, HIDDEN,
+                DIM, HIDDEN, DIM, HIDDEN,
+                DIM, SEQ, DIM, SEQ,
+                DIM, SEQ, DIM, SEQ];
+        }

        NSDictionary *wd = @{
            @"@model_path/weights/w1t.bin": @{@"offset":@0, @"data":build_blob_t(W1, HIDDEN, DIM)},
@ -119,6 +151,12 @@ int main() {

        NSError *e = nil;
        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!ok && !g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+            goto retry_compile;
+        }
        printf("Compile: %s\n", ok?"OK":"FAIL");
        if (!ok) { printf("  %s\n", e?[[e description] UTF8String]:""); return 1; }
        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
@ -130,13 +168,21 @@ int main() {
        float *dh3 = (float*)malloc(SEQ*HIDDEN*sizeof(float));
        for (int i = 0; i < SEQ*HIDDEN; i++) { dh1[i]=0.01f*sinf(i*0.007f); dh3[i]=0.01f*cosf(i*0.011f); }

-        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*4), ioO = make_surface(DIM*SEQ*4);
+        size_t bpe = g_fp16_io ? 2 : 4;
+        IOSurfaceRef ioI = make_surface(HIDDEN*2*SEQ*bpe), ioO = make_surface(DIM*SEQ*bpe);
        IOSurfaceLock(ioI, 0, NULL);
-        float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
-        // Channel-first: channels 0..HIDDEN-1 = dh1, channels HIDDEN..2*HIDDEN-1 = dh3
-        for (int t = 0; t < SEQ; t++) {
-            for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
-            for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
+        if (g_fp16_io) {
+            _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioI);
+            for (int t = 0; t < SEQ; t++) {
+                for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = (_Float16)dh1[t*HIDDEN+c];
+                for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = (_Float16)dh3[t*HIDDEN+c];
+            }
+        } else {
+            float *dst = (float*)IOSurfaceGetBaseAddress(ioI);
+            for (int t = 0; t < SEQ; t++) {
+                for (int c = 0; c < HIDDEN; c++) dst[c*SEQ+t] = dh1[t*HIDDEN+c];
+                for (int c = 0; c < HIDDEN; c++) dst[(HIDDEN+c)*SEQ+t] = dh3[t*HIDDEN+c];
+            }
        }
        IOSurfaceUnlock(ioI, 0, NULL);

@ -164,13 +210,22 @@ int main() {
            }

        IOSurfaceLock(ioO, kIOSurfaceLockReadOnly, NULL);
-        float *src = (float*)IOSurfaceGetBaseAddress(ioO);
        float maxd = 0;
-        for (int t = 0; t < SEQ; t++)
-            for (int c = 0; c < DIM; c++) {
-                float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
-                if (d > maxd) maxd = d;
-            }
+        if (g_fp16_io) {
+            _Float16 *src = (_Float16*)IOSurfaceGetBaseAddress(ioO);
+            for (int t = 0; t < SEQ; t++)
+                for (int c = 0; c < DIM; c++) {
+                    float d = fabsf((float)src[c*SEQ+t] - ref[t*DIM+c]);
+                    if (d > maxd) maxd = d;
+                }
+        } else {
+            float *src = (float*)IOSurfaceGetBaseAddress(ioO);
+            for (int t = 0; t < SEQ; t++)
+                for (int c = 0; c < DIM; c++) {
+                    float d = fabsf(src[c*SEQ+t] - ref[t*DIM+c]);
+                    if (d > maxd) maxd = d;
+                }
+        }
        IOSurfaceUnlock(ioO, kIOSurfaceLockReadOnly, NULL);
        printf("dx max diff: %.6f\n", maxd);

--- a/training/test_fused_qkv.m
+++ b/training/test_fused_qkv.m
@ -12,6 +12,8 @@
 #define DIM 768
 #define SEQ 64

+static int g_fp16_io = 0;  // 0 = fp32 I/O with cast ops; set to 1 when that compile fails (M1/M2: cast op unsupported) to use fp16 I/O directly
+
 static Class g_D, g_I, g_AR, g_AIO;
 static mach_timebase_info_data_t g_tb;
 static void ane_init(void) {
@ -56,7 +58,10 @@ static Kern compile_mil(NSString *mil, NSDictionary *wd) {
    }
    NSError *e = nil;
    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
-        printf("compile FAIL: %s\n", e?[[e localizedDescription] UTF8String]:""); return k;
+        printf("compile %s: %s\n", g_fp16_io ? "FAIL" : "failed (will retry)",
+               e ? [[e localizedDescription] UTF8String] : "");
+        [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+        return k;
    }
    ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
    k.model = mdl; k.td = td;
@ -85,67 +90,108 @@ static void cleanup_kern(Kern *k) {

 // Fused QKV: 3 convs + concat in one MIL
 static NSString *gen_fused_qkv_mil(void) {
+    if (g_fp16_io) {  // fp16 I/O variant: x and y are fp16, no cast ops are emitted
+        return [NSString stringWithFormat:
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = Wq, x = x)[name = tensor<string, []>(\"cq\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = Wk, x = x)[name = tensor<string, []>(\"ck\")];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = Wv, x = x)[name = tensor<string, []>(\"cv\")];\n"
+            "        tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> y = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"  // concat on axis 1 is the function output here
+            "    } -> (y);\n}\n",
+            DIM, SEQ,  // x
+            DIM, DIM, DIM, DIM,  // Wq (declared type + val type)
+            DIM, DIM, DIM, DIM,  // Wk
+            DIM, DIM, DIM, DIM,  // Wv
+            DIM, SEQ, DIM, SEQ, DIM, SEQ,  // q, k, v
+            DIM*3, SEQ];  // y = concat(q, k, v)
+    }
     return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = string(\"Wq\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wq.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = string(\"Wk\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wk.bin\"), offset = uint64(64)))];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = string(\"Wv\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/wv.bin\"), offset = uint64(64)))];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"  // fp32 input -> fp16 for the convs
+        "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wq = const()[name = tensor<string, []>(\"Wq\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wq.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wk = const()[name = tensor<string, []>(\"Wk\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wk.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> Wv = const()[name = tensor<string, []>(\"Wv\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/wv.bin\"), offset = tensor<uint64, []>(64)))];\n"
         "        tensor<fp16, [1, %d, 1, %d]> q = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = Wq, x = x16)[name = string(\"cq\")];\n"
+        "pad_type = pt, strides = st, weight = Wq, x = x16)[name = tensor<string, []>(\"cq\")];\n"
         "        tensor<fp16, [1, %d, 1, %d]> k = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = Wk, x = x16)[name = string(\"ck\")];\n"
+        "pad_type = pt, strides = st, weight = Wk, x = x16)[name = tensor<string, []>(\"ck\")];\n"
         "        tensor<fp16, [1, %d, 1, %d]> v = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = Wv, x = x16)[name = string(\"cv\")];\n"
-        "        int32 ax = const()[name = string(\"ax\"), val = int32(1)];\n"
-        "        bool inter = const()[name = string(\"il\"), val = bool(false)];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = string(\"cat\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = string(\"co\")];\n"
+        "pad_type = pt, strides = st, weight = Wv, x = x16)[name = tensor<string, []>(\"cv\")];\n"
+        "        tensor<int32, []> ax = const()[name = tensor<string, []>(\"ax\"), val = tensor<int32, []>(1)];\n"
+        "        tensor<bool, []> inter = const()[name = tensor<string, []>(\"il\"), val = tensor<bool, []>(false)];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> qkv = concat(axis = ax, interleave = inter, values = (q, k, v))[name = tensor<string, []>(\"cat\")];\n"
+        "        tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = qkv)[name = tensor<string, []>(\"co\")];\n"  // fp16 concat result -> fp32 output
         "    } -> (y);\n}\n",
         DIM, SEQ, DIM, SEQ,
-        DIM, DIM, DIM, DIM,  // Wq
-        DIM, DIM, DIM, DIM,  // Wk
-        DIM, DIM, DIM, DIM,  // Wv
-        DIM, SEQ,  // q
-        DIM, SEQ,  // k
-        DIM, SEQ,  // v
-        DIM*3, SEQ,  // concat
-        DIM*3, SEQ]; // output
+        DIM, DIM, DIM, DIM,  // Wq (declared type + val type)
+        DIM, DIM, DIM, DIM,  // Wk
+        DIM, DIM, DIM, DIM,  // Wv
+        DIM, SEQ, DIM, SEQ, DIM, SEQ,  // q, k, v
+        DIM*3, SEQ, DIM*3, SEQ];  // qkv (concat), y (fp32 output)
 }

 // Single conv MIL for comparison
 static NSString *gen_single_mil(void) {
+    if (g_fp16_io) {  // fp16 I/O variant: x and y are fp16, no cast ops are emitted
+        return [NSString stringWithFormat:
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"  // conv result is the function output here
+            "    } -> (y);\n}\n",
+            DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ];  // x, W (declared type + val type), y
+    }
     return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/w.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"  // fp32 input -> fp16 for the conv
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/w.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
         "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
+        "        tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"  // fp16 conv result -> fp32 output
         "    } -> (y);\n}\n",
         DIM, SEQ, DIM, SEQ, DIM, DIM, DIM, DIM, DIM, SEQ, DIM, SEQ];
 }
@ -170,12 +216,18 @@ int main() {
        for (int i = 0; i < SEQ*DIM; i++) x[i] = 0.1f*(2*drand48()-1);

        // === Compile fused QKV ===
+        retry_compile:;
        NSDictionary *fused_wd = @{
            @"@model_path/weights/wq.bin": @{@"offset":@0, @"data":build_blob(Wq, DIM, DIM)},
            @"@model_path/weights/wk.bin": @{@"offset":@0, @"data":build_blob(Wk, DIM, DIM)},
            @"@model_path/weights/wv.bin": @{@"offset":@0, @"data":build_blob(Wv, DIM, DIM)},
        };
        Kern kFused = compile_mil(gen_fused_qkv_mil(), fused_wd);
+        if (!kFused.model && !g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            goto retry_compile;
+        }
        printf("Fused QKV: %s\n", kFused.model ? "OK" : "FAIL");

        // === Compile 3 separate ===
@ -187,16 +239,24 @@ int main() {
        if (!kFused.model || !kQ.model) goto done;

        // IOSurfaces
-        size_t in_bytes = DIM*SEQ*4, out1_bytes = DIM*SEQ*4, out3_bytes = DIM*3*SEQ*4;
+        size_t bpe = g_fp16_io ? 2 : 4;
+        size_t in_bytes = DIM*SEQ*bpe, out1_bytes = DIM*SEQ*bpe, out3_bytes = DIM*3*SEQ*bpe;
        IOSurfaceRef ioIn = make_surface(in_bytes);
        IOSurfaceRef ioFused = make_surface(out3_bytes);
        IOSurfaceRef ioQ = make_surface(out1_bytes), ioK = make_surface(out1_bytes), ioV = make_surface(out1_bytes);

        IOSurfaceLock(ioIn, 0, NULL);
-        float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int t = 0; t < SEQ; t++)
-            for (int c = 0; c < DIM; c++)
-                dst[c*SEQ+t] = x[t*DIM+c];
+        if (g_fp16_io) {
+            _Float16 *dst = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+            for (int t = 0; t < SEQ; t++)
+                for (int c = 0; c < DIM; c++)
+                    dst[c*SEQ+t] = (_Float16)x[t*DIM+c];
+        } else {
+            float *dst = (float*)IOSurfaceGetBaseAddress(ioIn);
+            for (int t = 0; t < SEQ; t++)
+                for (int c = 0; c < DIM; c++)
+                    dst[c*SEQ+t] = x[t*DIM+c];
+        }
        IOSurfaceUnlock(ioIn, 0, NULL);

        // Eval fused
@ -212,17 +272,30 @@ int main() {
        IOSurfaceLock(ioQ, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceLock(ioK, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceLock(ioV, kIOSurfaceLockReadOnly, NULL);
-        float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
-        float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
-        float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
-        float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
        float dq=0, dk=0, dv=0;
-        for (int c = 0; c < DIM; c++)
-            for (int t = 0; t < SEQ; t++) {
-                float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
-                float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
-                float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
-            }
+        if (g_fp16_io) {
+            _Float16 *fo = (_Float16*)IOSurfaceGetBaseAddress(ioFused);
+            _Float16 *qo = (_Float16*)IOSurfaceGetBaseAddress(ioQ);
+            _Float16 *ko = (_Float16*)IOSurfaceGetBaseAddress(ioK);
+            _Float16 *vo = (_Float16*)IOSurfaceGetBaseAddress(ioV);
+            for (int c = 0; c < DIM; c++)
+                for (int t = 0; t < SEQ; t++) {
+                    float d1 = fabsf((float)fo[c*SEQ+t] - (float)qo[c*SEQ+t]); if(d1>dq) dq=d1;
+                    float d2 = fabsf((float)fo[(DIM+c)*SEQ+t] - (float)ko[c*SEQ+t]); if(d2>dk) dk=d2;
+                    float d3 = fabsf((float)fo[(DIM*2+c)*SEQ+t] - (float)vo[c*SEQ+t]); if(d3>dv) dv=d3;
+                }
+        } else {
+            float *fo = (float*)IOSurfaceGetBaseAddress(ioFused);
+            float *qo = (float*)IOSurfaceGetBaseAddress(ioQ);
+            float *ko = (float*)IOSurfaceGetBaseAddress(ioK);
+            float *vo = (float*)IOSurfaceGetBaseAddress(ioV);
+            for (int c = 0; c < DIM; c++)
+                for (int t = 0; t < SEQ; t++) {
+                    float d1 = fabsf(fo[c*SEQ+t] - qo[c*SEQ+t]); if(d1>dq) dq=d1;
+                    float d2 = fabsf(fo[(DIM+c)*SEQ+t] - ko[c*SEQ+t]); if(d2>dk) dk=d2;
+                    float d3 = fabsf(fo[(DIM*2+c)*SEQ+t] - vo[c*SEQ+t]); if(d3>dv) dv=d3;
+                }
+        }
        IOSurfaceUnlock(ioFused, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceUnlock(ioQ, kIOSurfaceLockReadOnly, NULL);
        IOSurfaceUnlock(ioK, kIOSurfaceLockReadOnly, NULL);
--- a/training/test_perf_stats.m
+++ b/training/test_perf_stats.m
@ -10,6 +10,8 @@
 static mach_timebase_info_data_t g_tb;
 static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }

+static int g_fp16_io = 0;  // M1/M2: cast op unsupported, use fp16 I/O directly
+
 static void dump_class(const char *name) {
    Class cls = NSClassFromString([NSString stringWithUTF8String:name]);
    if (!cls) { printf("  %s: NOT FOUND\n", name); return; }
@ -118,28 +120,43 @@ int main() {
        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];
        free(w);

-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+        retry_compile:;
+        NSString *mil;
+        if (g_fp16_io) {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n"
+                "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
+        } else {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n"
+                "[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
+                "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "        tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
+                "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+        }

        NSData *md = [mil dataUsingEncoding:NSUTF8StringEncoding];
        id desc = ((id(*)(Class,SEL,id,id,id))objc_msgSend)(g_D, @selector(modelWithMILText:weights:optionsPlist:),
@ -153,10 +170,19 @@ int main() {
        [wdata writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

        NSError *e = nil;
-        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        BOOL compiled = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!compiled && !g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            // Remove the failed attempt's temp dir before retrying (same cleanup as the sibling tests).
+            [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+            goto retry_compile;
+        }
+        // Still failing after the fallback: report it instead of proceeding to load silently.
+        if (!compiled) printf("[ANE] compile failed: %s\n", e ? [[e description] UTF8String] : "?");
        ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);

-        int ioBytes = CH * SP * 4; // fp32
+        int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
        IOSurfaceRef ioIn = make_surface(ioBytes);
        IOSurfaceRef ioOut = make_surface(ioBytes);
        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -174,8 +200,13 @@ int main() {

            if (req) {
                IOSurfaceLock(ioIn, 0, NULL);
-                float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-                for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
+                if (g_fp16_io) {
+                    _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+                    for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)1.0f;
+                } else {
+                    float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+                    for (int i = 0; i < CH*SP; i++) inp[i] = 1.0f;
+                }
                IOSurfaceUnlock(ioIn, 0, NULL);

                BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
--- a/training/test_qos_sweep.m
+++ b/training/test_qos_sweep.m
@ -10,6 +10,8 @@
 static mach_timebase_info_data_t g_tb;
 static double tb_ms(uint64_t t) { return (double)t * g_tb.numer / g_tb.denom / 1e6; }

+static int g_fp16_io = 0;  // M1/M2: cast op unsupported, use fp16 I/O directly
+
 static IOSurfaceRef make_surface(size_t bytes) {
    return IOSurfaceCreate((__bridge CFDictionaryRef)@{
        (id)kIOSurfaceWidth:@(bytes), (id)kIOSurfaceHeight:@1,
@ -38,37 +40,49 @@ int main() {
        for (int i = 0; i < CH*CH; i++) wp[i] = (_Float16)(0.01f * (i % 100 - 50));
        NSData *wdata = [NSData dataWithBytesNoCopy:blob length:tot freeWhenDone:YES];

-        NSString *mil = [NSString stringWithFormat:
-            @"program(1.3)\n"
-            "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-            "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-            "{\"coremltools-version\", \"9.0\"}})]\n"
-            "{\n"
-            "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-            "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-            "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-            "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-            "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-            "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-            "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
-            "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-            "[name=string(\"conv\")];\n"
-            "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-            "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-            "    } -> (y);\n"
-            "}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
-
-        NSDictionary *weights = @{@"@model_path/weights/weight.bin": @{@"offset":@0, @"data":wdata}};
-        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
        NSFileManager *fm = [NSFileManager defaultManager];

        printf("=== QoS Sweep: compile/load/eval with varying QoS ===\n");
        printf("Kernel: %dx%d conv, spatial=%d (%.1f MFLOPS)\n", CH, CH, SP, 2.0*CH*CH*SP/1e6);
        printf("%4s %10s %10s %10s %10s  %s\n", "QoS", "Compile", "Load", "Eval(1)", "Eval(avg10)", "Status");

+        retry_mil:;
+        NSString *mil;
+        if (g_fp16_io) {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, CH, CH, CH, CH, SP];
+        } else {
+            mil = [NSString stringWithFormat:
+                @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+                "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+                "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+                "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+                "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+                "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+                "        tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
+                "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
+                "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+                "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+                "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
+                "[name=tensor<string, []>(\"conv\")];\n"
+                "        tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
+                "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
+                "    } -> (y);\n}\n", CH, SP, CH, SP, CH, CH, CH, CH, CH, SP, CH, SP];
+        }
+        NSData *milData = [mil dataUsingEncoding:NSUTF8StringEncoding];
+
        unsigned int qos_values[] = {0, 1, 5, 10, 15, 17, 19, 21, 25, 31, 33, 40, 47, 50, 55, 60, 63};
        int n_qos = sizeof(qos_values)/sizeof(qos_values[0]);

@ -98,6 +112,12 @@ int main() {
            double cms = tb_ms(mach_absolute_time() - t0);

            if (!cok) {
+                if (!g_fp16_io) {
+                    printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+                    g_fp16_io = 1;
+                    [fm removeItemAtPath:td error:nil];
+                    goto retry_mil;
+                }
                printf("%4u %10s %10s %10s %10s  COMPILE_FAIL\n", qos, "-", "-", "-", "-");
                [fm removeItemAtPath:td error:nil];
                continue;
@ -115,7 +135,7 @@ int main() {
                continue;
            }

-            int ioBytes = CH * SP * 4;
+            int ioBytes = CH * SP * (g_fp16_io ? 2 : 4);
            IOSurfaceRef ioIn = make_surface(ioBytes);
            IOSurfaceRef ioOut = make_surface(ioBytes);
            id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -125,8 +145,13 @@ int main() {
                @[wI], @[@0], @[wO], @[@0], nil, nil, @0);

            IOSurfaceLock(ioIn, 0, NULL);
-            float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-            for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
+            if (g_fp16_io) {
+                _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+                for (int i = 0; i < CH*SP; i++) inp[i] = (_Float16)0.5f;
+            } else {
+                float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+                for (int i = 0; i < CH*SP; i++) inp[i] = 0.5f;
+            }
            IOSurfaceUnlock(ioIn, 0, NULL);

            t0 = mach_absolute_time();
--- a/training/test_weight_reload.m
+++ b/training/test_weight_reload.m
@ -34,30 +34,42 @@ static NSData *build_weight_blob(_Float16 *w, int rows, int cols) {
    return [NSData dataWithBytesNoCopy:b length:tot freeWhenDone:YES];
 }

-// Generate MIL for a simple conv: fp32 in → cast fp16 → conv → cast fp32 out
+static int g_fp16_io = 0;  // M1/M2: cast op unsupported, use fp16 I/O directly
+
+// Generate MIL for a simple conv (fp16 I/O when g_fp16_io, else fp32 with casts)
 static NSString *gen_mil(int ch, int sp) {
+    if (g_fp16_io) {
+        return [NSString stringWithFormat:
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+            "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+            "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+            "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
+            "        tensor<fp16, [1,%d,1,%d]> y = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x)"
+            "[name=tensor<string, []>(\"conv\")];\n"
+            "    } -> (y);\n}\n", ch, sp, ch, ch, ch, ch, ch, sp];
+    }
    return [NSString stringWithFormat:
-        @"program(1.3)\n"
-        "[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n"
-        "{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string pt = const()[name=string(\"pt\"), val=string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name=string(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
-        "        tensor<int32, [4]> pd = const()[name=string(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
-        "        tensor<int32, [2]> dl = const()[name=string(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
-        "        int32 gr = const()[name=string(\"gr\"), val=int32(1)];\n"
-        "        string to16 = const()[name=string(\"to16\"), val=string(\"fp16\")];\n"
-        "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=string(\"cin\")];\n"
-        "        tensor<fp16, [%d,%d,1,1]> W = const()[name=string(\"W\"), "
-        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=string(\"@model_path/weights/weight.bin\"), offset=uint64(64)))];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> pt = const()[name=tensor<string, []>(\"pt\"), val=tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name=tensor<string, []>(\"st\"), val=tensor<int32, [2]>([1,1])];\n"
+        "        tensor<int32, [4]> pd = const()[name=tensor<string, []>(\"pd\"), val=tensor<int32, [4]>([0,0,0,0])];\n"
+        "        tensor<int32, [2]> dl = const()[name=tensor<string, []>(\"dl\"), val=tensor<int32, [2]>([1,1])];\n"
+        "        tensor<int32, []> gr = const()[name=tensor<string, []>(\"gr\"), val=tensor<int32, []>(1)];\n"
+        "        tensor<string, []> to16 = const()[name=tensor<string, []>(\"to16\"), val=tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1,%d,1,%d]> x16 = cast(dtype=to16,x=x)[name=tensor<string, []>(\"cin\")];\n"
+        "        tensor<fp16, [%d,%d,1,1]> W = const()[name=tensor<string, []>(\"W\"), "
+        "val=tensor<fp16, [%d,%d,1,1]>(BLOBFILE(path=tensor<string, []>(\"@model_path/weights/weight.bin\"), offset=tensor<uint64, []>(64)))];\n"
        "        tensor<fp16, [1,%d,1,%d]> y16 = conv(dilations=dl,groups=gr,pad=pd,pad_type=pt,strides=st,weight=W,x=x16)"
-        "[name=string(\"conv\")];\n"
-        "        string to32 = const()[name=string(\"to32\"), val=string(\"fp32\")];\n"
-        "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=string(\"cout\")];\n"
-        "    } -> (y);\n"
-        "}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
+        "[name=tensor<string, []>(\"conv\")];\n"
+        "        tensor<string, []> to32 = const()[name=tensor<string, []>(\"to32\"), val=tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1,%d,1,%d]> y = cast(dtype=to32,x=y16)[name=tensor<string, []>(\"cout\")];\n"
+        "    } -> (y);\n}\n", ch, sp, ch, sp, ch, ch, ch, ch, ch, sp, ch, sp];
 }

 int main() {
@ -88,6 +100,9 @@ int main() {
        for (int i = 0; i < CH; i++) weightsB[i*CH+i] = (_Float16)3.0f;

        NSData *wdataA = build_weight_blob(weightsA, CH, CH);
+        NSFileManager *fm = [NSFileManager defaultManager];
+
+        retry_compile:;
        NSString *mil = gen_mil(CH, SP);
        NSDictionary *weights = @{
            @"@model_path/weights/weight.bin": @{@"offset": @0, @"data": wdataA}
@ -103,13 +118,18 @@ int main() {
        id mdl = ((id(*)(Class,SEL,id))objc_msgSend)(g_I, @selector(inMemoryModelWithDescriptor:), desc);
        id hx = ((id(*)(id,SEL))objc_msgSend)(mdl, @selector(hexStringIdentifier));
        NSString *td = [NSTemporaryDirectory() stringByAppendingPathComponent:hx];
-        NSFileManager *fm = [NSFileManager defaultManager];
        [fm createDirectoryAtPath:[td stringByAppendingPathComponent:@"weights"] withIntermediateDirectories:YES attributes:nil error:nil];
        [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
        [wdataA writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];

        NSError *e = nil;
        BOOL ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e);
+        if (!ok && !g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            [fm removeItemAtPath:td error:nil];
+            goto retry_compile;
+        }
        if (!ok) { printf("FAIL: compile: %s\n", [[e description] UTF8String]); return 1; }
        ok = ((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e);
        if (!ok) { printf("FAIL: load: %s\n", [[e description] UTF8String]); return 1; }
@ -117,9 +137,10 @@ int main() {
        printf("  Compile+load: %.1fms\n", compile_ms);
        printf("  tmpDir: %s\n", [td UTF8String]);

-        // Build request and IOSurfaces (fp32 I/O)
-        int inBytes = CH * SP * 4;  // fp32
-        int outBytes = CH * SP * 4;
+        // Build request and IOSurfaces
+        size_t bpe = g_fp16_io ? 2 : 4;
+        int inBytes = CH * SP * bpe;
+        int outBytes = CH * SP * bpe;
        IOSurfaceRef ioIn = make_surface(inBytes);
        IOSurfaceRef ioOut = make_surface(outBytes);
        id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioIn);
@ -130,10 +151,17 @@ int main() {

        // Write input: channel c, spatial s = (c*SP + s + 1) * 0.01
        IOSurfaceLock(ioIn, 0, NULL);
-        float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++)
-            for (int s = 0; s < SP; s++)
-                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        if (g_fp16_io) {
+            _Float16 *inp = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++)
+                for (int s = 0; s < SP; s++)
+                    inp[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
+        } else {
+            float *inp = (float*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++)
+                for (int s = 0; s < SP; s++)
+                    inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        }
        IOSurfaceUnlock(ioIn, 0, NULL);

        // Eval with weights A
@ -142,13 +170,17 @@ int main() {
        if (!ok) { printf("FAIL: eval: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }

        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
-        printf("  Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA[0], outA[1], outA[2], outA[3]);
+        float *outA_copy = (float*)malloc(CH * SP * sizeof(float));
+        if (g_fp16_io) {
+            _Float16 *outA = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+            for (int i = 0; i < CH*SP; i++) outA_copy[i] = (float)outA[i];
+        } else {
+            float *outA = (float*)IOSurfaceGetBaseAddress(ioOut);
+            memcpy(outA_copy, outA, CH * SP * sizeof(float));
+        }
+        printf("  Output A[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outA_copy[0], outA_copy[1], outA_copy[2], outA_copy[3]);
        printf("  Output A[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
-               outA[CH*SP-4], outA[CH*SP-3], outA[CH*SP-2], outA[CH*SP-1]);
-        // Save copy
-        float *outA_copy = (float*)malloc(outBytes);
-        memcpy(outA_copy, outA, outBytes);
+               outA_copy[CH*SP-4], outA_copy[CH*SP-3], outA_copy[CH*SP-2], outA_copy[CH*SP-1]);
        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);

        // === Step 3: Overwrite weight file with B, unload+load ===
@ -189,10 +221,17 @@ int main() {

        // Re-write same input
        IOSurfaceLock(ioIn, 0, NULL);
-        inp = (float*)IOSurfaceGetBaseAddress(ioIn);
-        for (int c = 0; c < CH; c++)
-            for (int s = 0; s < SP; s++)
-                inp[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        if (g_fp16_io) {
+            _Float16 *inp2 = (_Float16*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++)
+                for (int s = 0; s < SP; s++)
+                    inp2[c*SP+s] = (_Float16)((float)(c*SP + s + 1) * 0.01f);
+        } else {
+            float *inp2 = (float*)IOSurfaceGetBaseAddress(ioIn);
+            for (int c = 0; c < CH; c++)
+                for (int s = 0; s < SP; s++)
+                    inp2[c*SP+s] = (float)(c*SP + s + 1) * 0.01f;
+        }
        IOSurfaceUnlock(ioIn, 0, NULL);

        // Eval with (possibly reloaded) weights B
@ -201,16 +240,23 @@ int main() {
        if (!ok) { printf("FAIL: eval after reload: %s\n", e ? [[e description] UTF8String] : "?"); return 1; }

        IOSurfaceLock(ioOut, kIOSurfaceLockReadOnly, NULL);
-        float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
-        printf("  Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB[0], outB[1], outB[2], outB[3]);
+        float *outB_f = (float*)malloc(CH * SP * sizeof(float));
+        if (g_fp16_io) {
+            _Float16 *outB = (_Float16*)IOSurfaceGetBaseAddress(ioOut);
+            for (int i = 0; i < CH*SP; i++) outB_f[i] = (float)outB[i];
+        } else {
+            float *outB = (float*)IOSurfaceGetBaseAddress(ioOut);
+            memcpy(outB_f, outB, CH * SP * sizeof(float));
+        }
+        printf("  Output B[0..3]: [%.4f, %.4f, %.4f, %.4f]\n", outB_f[0], outB_f[1], outB_f[2], outB_f[3]);
        printf("  Output B[%d..%d]: [%.4f, %.4f, %.4f, %.4f]\n", CH*SP-4, CH*SP-1,
-               outB[CH*SP-4], outB[CH*SP-3], outB[CH*SP-2], outB[CH*SP-1]);
+               outB_f[CH*SP-4], outB_f[CH*SP-3], outB_f[CH*SP-2], outB_f[CH*SP-1]);

        // Check: did the output change?
        bool changed = false;
        float max_diff = 0;
        for (int i = 0; i < CH*SP; i++) {
-            float d = fabsf(outB[i] - outA_copy[i]);
+            float d = fabsf(outB_f[i] - outA_copy[i]);
            if (d > max_diff) max_diff = d;
            if (d > 0.001f) changed = true;
        }
@ -219,11 +265,12 @@ int main() {
        float max_3x_err = 0;
        for (int i = 0; i < CH*SP; i++) {
            float expected = outA_copy[i] * 3.0f;
-            float err = fabsf(outB[i] - expected);
+            float err = fabsf(outB_f[i] - expected);
            if (err > max_3x_err) max_3x_err = err;
            if (err > 0.1f) correct_3x = false;
        }
        IOSurfaceUnlock(ioOut, kIOSurfaceLockReadOnly, NULL);
+        free(outB_f);

        printf("\n=== RESULT ===\n");
        printf("  Max A-B diff: %.6f\n", max_diff);
--- a/training/tiny_train.m
+++ b/training/tiny_train.m
@@ -59,25 +59,43 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
 }

+static int g_fp16_io = 0;  // 0 = fp32 I/O with cast ops; 1 = fp16 I/O (M1/M2: cast op unsupported). Set to 1 by the compile retry path or restored from a checkpoint header.
+
 static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
+    if (g_fp16_io) {
+        // fp16 I/O path — no cast ops (M1/M2 compatible): x and y are declared fp16 and conv consumes x directly.
+        return [NSString stringWithFormat:
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
+            "    } -> (y);\n}\n",
+            in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];  // %d order: x shape, W declared shape, W value shape, y shape
+    }
+    // fp32 I/O path — cast to/from fp16 internally (M4+ native): x -> x16 (cast), conv in fp16, y16 -> y (cast).
    return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
+        "        tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
        "    } -> (y);\n}\n",
        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
 }
@@ -106,10 +124,19 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        if (!g_fp16_io) {
+            // M1/M2 ANE doesn't support cast op — retry with fp16 I/O
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            return compile_kern_with_blob(blob, in_ch, out_ch, sp);
+        }
+        return NULL;
+    }
    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
    __sync_fetch_and_add(&g_compile_count, 1);
-    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
+    size_t bpe = g_fp16_io ? 2 : 4;
+    size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
@@ -140,27 +167,43 @@ static void free_kern(Kern *k) {
 }

 static void ane_eval_k(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
-    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < in_ch; c++)
-            tmp[c*sp + t] = in[t*in_ch + c];
+    // Run kernel k on `in`: transpose in[t*in_ch + c] ([S,C]) -> dst[c*sp + t] ([C,S]) straight into the input IOSurface (no temp buffer).
    IOSurfaceLock(k->ioIn, 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
+    void *base_in = IOSurfaceGetBaseAddress(k->ioIn);
+    if (g_fp16_io) {  // surface is treated as fp16: narrow each float while transposing
+        _Float16 *dst = (_Float16*)base_in;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < in_ch; c++)
+                dst[c*sp + t] = (_Float16)in[t*in_ch + c];
+    } else {  // surface is treated as fp32: plain transposed copy
+        float *dst = (float*)base_in;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < in_ch; c++)
+                dst[c*sp + t] = in[t*in_ch + c];
+    }
    IOSurfaceUnlock(k->ioIn, 0, NULL);
-    free(tmp);
+
    NSError *e = nil;
    id mdl = (__bridge id)k->model;
    id req = (__bridge id)k->request;
    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);
-    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
+    // NOTE(review): the BOOL result of evaluateWithQoS: and `e` are not checked before the output is read.
+    // Read output from the IOSurface, transposing src[c*sp + t] ([C,S]) -> out[t*out_ch + c] ([S,C]).
    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
+    void *base_out = IOSurfaceGetBaseAddress(k->ioOut);
+    if (g_fp16_io) {  // widen fp16 -> float on the CPU side
+        _Float16 *src = (_Float16*)base_out;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < out_ch; c++)
+                out[t*out_ch + c] = (float)src[c*sp + t];
+    } else {  // fp32 surface: transposed copy only
+        float *src = (float*)base_out;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < out_ch; c++)
+                out[t*out_ch + c] = src[c*sp + t];
+    }
    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < out_ch; c++)
-            out[t*out_ch + c] = tmp2[c*sp + t];
-    free(tmp2);
 }

 // === Checkpoint: save/restore training state for exec() restart ===
@@ -173,6 +216,7 @@ typedef struct {
    float lr;
    double cum_compile_ms, cum_train_ms, cum_wall_ms;
    int cum_steps, cum_batches;
+    int fp16_io;  // persisted: 1 if ANE needs fp16 I/O (M1/M2)
 } CkptHeader;

 static void save_checkpoint(const char *path, int step, float loss,
@@ -180,7 +224,7 @@ static void save_checkpoint(const char *path, int step, float loss,
                            const float *W1, const float *W2,
                            double cc, double ct, double cw, int cs, int cb) {
    FILE *f = fopen(path, "wb");
-    CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb};
+    CkptHeader hdr = {step, loss, D, H, S, total_steps, lr, cc, ct, cw, cs, cb, g_fp16_io};
    fwrite(&hdr, sizeof(hdr), 1, f);
    fwrite(W1, sizeof(float), H * D, f);
    fwrite(W2, sizeof(float), D * H, f);
@@ -241,8 +285,9 @@ int main(int argc, char *argv[]) {
                start_step = hdr.step;
                total_steps = hdr.total_steps;
                lr = hdr.lr;
+                g_fp16_io = hdr.fp16_io;
                resuming = true;
-                printf("[RESUMED at step %d, loss=%.6f, compiles reset]\n", start_step, hdr.loss);
+                printf("[RESUMED at step %d, loss=%.6f, fp16_io=%d, compiles reset]\n", start_step, hdr.loss, g_fp16_io);
            }
        }

--- a/training/tiny_train_old.m
+++ b/training/tiny_train_old.m
@@ -59,34 +59,50 @@ static NSData *build_blob_transposed(const float *w, int rows, int cols) {
    return [NSData dataWithBytesNoCopy:buf length:total freeWhenDone:YES];
 }

+static int g_fp16_io = 0;  // 0 = fp32 I/O with cast ops; 1 = fp16 I/O (M1/M2: cast op unsupported). Set to 1 by the compile retry path.
+
 static NSString *gen_conv_mil(int in_ch, int out_ch, int sp) {
+    if (g_fp16_io) {  // fp16 I/O program: x and y are declared fp16 and no cast ops are emitted
+        return [NSString stringWithFormat:
+            @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+            "    func main<ios16>(tensor<fp16, [1, %d, 1, %d]> x) {\n"
+            "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+            "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+            "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+            "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+            "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+            "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
+            "        tensor<fp16, [1, %d, 1, %d]> y = conv(dilations = dl, groups = gr, pad = pd, "
+            "pad_type = pt, strides = st, weight = W, x = x)[name = tensor<string, []>(\"cv\")];\n"
+            "    } -> (y);\n}\n",
+            in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp];  // %d order: x shape, W declared shape, W value shape, y shape
+    }  // otherwise fall through to the fp32 I/O program below (cast -> conv -> cast)
    return [NSString stringWithFormat:
-        @"program(1.3)\n[buildInfo = dict<string, string>({{\"coremlc-component-MIL\", \"3510.2.1\"}, "
-        "{\"coremlc-version\", \"3505.4.1\"}, {\"coremltools-component-milinternal\", \"\"}, "
-        "{\"coremltools-version\", \"9.0\"}})]\n{\n"
-        "    func main<ios18>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
-        "        string d1 = const()[name = string(\"d1\"), val = string(\"fp16\")];\n"
-        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = string(\"cx\")];\n"
-        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = string(\"W\"), "
-        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = string(\"@model_path/weights/weight.bin\"), offset = uint64(64)))];\n"
-        "        string pt = const()[name = string(\"pt\"), val = string(\"valid\")];\n"
-        "        tensor<int32, [2]> st = const()[name = string(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        tensor<int32, [4]> pd = const()[name = string(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
-        "        tensor<int32, [2]> dl = const()[name = string(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
-        "        int32 gr = const()[name = string(\"gr\"), val = int32(1)];\n"
+        @"program(1.0)\n[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{\"coremlc-version\", \"3505.4.1\"}})]\n{\n"
+        "    func main<ios16>(tensor<fp32, [1, %d, 1, %d]> x) {\n"
+        "        tensor<string, []> d1 = const()[name = tensor<string, []>(\"d1\"), val = tensor<string, []>(\"fp16\")];\n"
+        "        tensor<fp16, [1, %d, 1, %d]> x16 = cast(dtype = d1, x = x)[name = tensor<string, []>(\"cx\")];\n"
+        "        tensor<fp16, [%d, %d, 1, 1]> W = const()[name = tensor<string, []>(\"W\"), "
+        "val = tensor<fp16, [%d, %d, 1, 1]>(BLOBFILE(path = tensor<string, []>(\"@model_path/weights/weight.bin\"), offset = tensor<uint64, []>(64)))];\n"
+        "        tensor<string, []> pt = const()[name = tensor<string, []>(\"pt\"), val = tensor<string, []>(\"valid\")];\n"
+        "        tensor<int32, [2]> st = const()[name = tensor<string, []>(\"st\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, [4]> pd = const()[name = tensor<string, []>(\"pd\"), val = tensor<int32, [4]>([0, 0, 0, 0])];\n"
+        "        tensor<int32, [2]> dl = const()[name = tensor<string, []>(\"dl\"), val = tensor<int32, [2]>([1, 1])];\n"
+        "        tensor<int32, []> gr = const()[name = tensor<string, []>(\"gr\"), val = tensor<int32, []>(1)];\n"
        "        tensor<fp16, [1, %d, 1, %d]> y16 = conv(dilations = dl, groups = gr, pad = pd, "
-        "pad_type = pt, strides = st, weight = W, x = x16)[name = string(\"cv\")];\n"
-        "        string d2 = const()[name = string(\"d2\"), val = string(\"fp32\")];\n"
-        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = string(\"co\")];\n"
+        "pad_type = pt, strides = st, weight = W, x = x16)[name = tensor<string, []>(\"cv\")];\n"
+        "        tensor<string, []> d2 = const()[name = tensor<string, []>(\"d2\"), val = tensor<string, []>(\"fp32\")];\n"
+        "        tensor<fp32, [1, %d, 1, %d]> y = cast(dtype = d2, x = y16)[name = tensor<string, []>(\"co\")];\n"
        "    } -> (y);\n}\n",
        in_ch, sp, in_ch, sp, out_ch, in_ch, out_ch, in_ch, out_ch, sp, out_ch, sp];
 }

 typedef struct {
-    id model;
+    void *model;    // CFBridgingRetain'd _ANEInMemoryModel; balanced by CFRelease in free_kern
    IOSurfaceRef ioIn, ioOut;
-    id request;
-    NSString *tmpDir;
+    void *request;  // CFBridgingRetain'd _ANERequest; balanced by CFRelease in free_kern
+    void *tmpDir;   // CFBridgingRetain'd NSString path; free_kern removes the directory, then releases it
 } Kern;

 static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp) {
@@ -103,9 +119,17 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
    [milData writeToFile:[td stringByAppendingPathComponent:@"model.mil"] atomically:YES];
    [blob writeToFile:[td stringByAppendingPathComponent:@"weights/weight.bin"] atomically:YES];
    NSError *e = nil;
-    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) return NULL;
+    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(compileWithQoS:options:error:), 21, @{}, &e)) {
+        if (!g_fp16_io) {
+            printf("[ANE] fp32 compile failed, retrying with fp16 I/O (M1/M2 fallback)\n");
+            g_fp16_io = 1;
+            return compile_kern_with_blob(blob, in_ch, out_ch, sp);
+        }
+        return NULL;
+    }
    if (!((BOOL(*)(id,SEL,unsigned int,id,NSError**))objc_msgSend)(mdl, @selector(loadWithQoS:options:error:), 21, @{}, &e)) return NULL;
-    size_t inB = in_ch * sp * 4, outB = out_ch * sp * 4;
+    size_t bpe = g_fp16_io ? 2 : 4;
+    size_t inB = in_ch * sp * bpe, outB = out_ch * sp * bpe;
    IOSurfaceRef ioI = make_surface(inB), ioO = make_surface(outB);
    id wI = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioI);
    id wO = ((id(*)(Class,SEL,IOSurfaceRef))objc_msgSend)(g_AIO, @selector(objectWithIOSurface:), ioO);
@@ -113,40 +137,60 @@ static Kern *compile_kern_with_blob(NSData *blob, int in_ch, int out_ch, int sp)
        @selector(requestWithInputs:inputIndices:outputs:outputIndices:weightsBuffer:perfStats:procedureIndex:),
        @[wI], @[@0], @[wO], @[@0], nil, nil, @0);
    Kern *k = calloc(1, sizeof(Kern));
-    k->model = mdl; k->ioIn = ioI; k->ioOut = ioO; k->request = req; k->tmpDir = td;
+    k->model = (void*)CFBridgingRetain(mdl);
+    k->ioIn = ioI; k->ioOut = ioO;
+    k->request = (void*)CFBridgingRetain(req);
+    k->tmpDir = (void*)CFBridgingRetain(td);
    return k;
 }

 static void free_kern(Kern *k) {
    if (!k) return;
+    id mdl = (__bridge id)k->model;  // plain __bridge: borrow the object; the retain held by Kern is dropped below
    NSError *e = nil;
-    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(k->model, @selector(unloadWithQoS:error:), 21, &e);
+    ((BOOL(*)(id,SEL,unsigned int,NSError**))objc_msgSend)(mdl, @selector(unloadWithQoS:error:), 21, &e);  // unload result is not checked
    CFRelease(k->ioIn); CFRelease(k->ioOut);
-    [[NSFileManager defaultManager] removeItemAtPath:k->tmpDir error:nil];
+    NSString *td = (__bridge id)k->tmpDir;  // borrowed view of the retained path string
+    [[NSFileManager defaultManager] removeItemAtPath:td error:nil];
+    CFRelease(k->model); CFRelease(k->request); CFRelease(k->tmpDir);  // balance the CFBridgingRetain calls made when the Kern was built
    free(k);
 }

 // ANE eval: input [S, in_ch] row-major ↔ [in_ch, S] channels-first
 static void ane_eval(Kern *k, const float *in, float *out, int in_ch, int out_ch, int sp) {
-    float *tmp = (float*)malloc(in_ch * sp * sizeof(float));
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < in_ch; c++)
-            tmp[c*sp + t] = in[t*in_ch + c];
    IOSurfaceLock(k->ioIn, 0, NULL);
-    memcpy(IOSurfaceGetBaseAddress(k->ioIn), tmp, in_ch * sp * sizeof(float));
+    void *base_in = IOSurfaceGetBaseAddress(k->ioIn);  // transpose directly into the locked surface (no temp buffer)
+    if (g_fp16_io) {  // surface is treated as fp16: narrow each float while transposing
+        _Float16 *dst = (_Float16*)base_in;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < in_ch; c++)
+                dst[c*sp + t] = (_Float16)in[t*in_ch + c];
+    } else {  // surface is treated as fp32: plain transposed copy
+        float *dst = (float*)base_in;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < in_ch; c++)
+                dst[c*sp + t] = in[t*in_ch + c];
+    }
    IOSurfaceUnlock(k->ioIn, 0, NULL);
-    free(tmp);
    NSError *e = nil;
+    id mdl = (__bridge id)k->model;  // __bridge: no ownership change; Kern keeps its retain
+    id req = (__bridge id)k->request;  // same for the request object
    ((BOOL(*)(id,SEL,unsigned int,id,id,NSError**))objc_msgSend)(
-        k->model, @selector(evaluateWithQoS:options:request:error:), 21, @{}, k->request, &e);
-    float *tmp2 = (float*)malloc(out_ch * sp * sizeof(float));
+        mdl, @selector(evaluateWithQoS:options:request:error:), 21, @{}, req, &e);  // NOTE(review): BOOL result and e are not checked
    IOSurfaceLock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    memcpy(tmp2, IOSurfaceGetBaseAddress(k->ioOut), out_ch * sp * sizeof(float));
+    void *base_out = IOSurfaceGetBaseAddress(k->ioOut);  // read src[c*sp + t] ([C,S]) back into out[t*out_ch + c] ([S,C])
+    if (g_fp16_io) {  // widen fp16 -> float on the CPU side
+        _Float16 *src = (_Float16*)base_out;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < out_ch; c++)
+                out[t*out_ch + c] = (float)src[c*sp + t];
+    } else {  // fp32 surface: transposed copy only
+        float *src = (float*)base_out;
+        for (int t = 0; t < sp; t++)
+            for (int c = 0; c < out_ch; c++)
+                out[t*out_ch + c] = src[c*sp + t];
+    }
    IOSurfaceUnlock(k->ioOut, kIOSurfaceLockReadOnly, NULL);
-    for (int t = 0; t < sp; t++)
-        for (int c = 0; c < out_ch; c++)
-            out[t*out_ch + c] = tmp2[c*sp + t];
-    free(tmp2);
 }

 int main(int argc, char *argv[]) {