From 2d2adacf09ec6352cf8fea53c7a916bd5257144f Mon Sep 17 00:00:00 2001
From: imperatormk <darko.simonovski@hotmail.com>
Date: Tue, 3 Mar 2026 18:26:12 +0100
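Subject: [PATCH] wire up fp16 I/O retry in train.m forward path

ane_conv_eval() now checks g_fp16_io. When the flag is set, the input is
transposed and converted from float to _Float16 before ane_write_input(),
and the output is read back as _Float16 and converted to float while it
is transposed into y. When the flag is clear, the previous fp32
transpose/write/eval/read sequence is kept as-is.

compile_conv_kernel() sizes the input and output buffers with 2 bytes per
element when g_fp16_io is set and 4 bytes otherwise, so the buffer sizes
passed to ane_compile() match what ane_conv_eval() writes and reads.

model_compile_kernels() compiles kern_q[0] first as a canary. If that
compile fails and g_fp16_io is not already set, it sets g_fp16_io = 1 and
retries once (logged as the "M1/M2 fallback"); every later kernel is
compiled with whichever mode the canary settled on. The remaining layer-0
kernels are compiled and checked before the loop, which now starts at
layer 1. The per-kernel "L%d kern_* fail" messages inside the loop are
replaced by a single "L%d compile fail" message per layer, and the
progress messages now report whether fp16 or fp32 I/O is in use.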
---
 training/forward.h | 45 +++++++++++++++++++++++++++++++++------------
 training/model.h   | 45 +++++++++++++++++++++++++++++++++------------
 2 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/training/forward.h b/training/forward.h
index adcf898..0f2ca9f 100644
--- a/training/forward.h
+++ b/training/forward.h
@@ -9,22 +9,43 @@
 // Transpose back to [S, out_dim] row-major
 static void ane_conv_eval(ANEKernel *kernel, const float *x, float *y,
                           int S, int in_dim, int out_dim) {
-    float *x_t = (float*)malloc(S * in_dim * sizeof(float));
-    for (int t = 0; t < S; t++)
-        for (int i = 0; i < in_dim; i++)
-            x_t[i*S + t] = x[t*in_dim + i];
+    if (g_fp16_io) {
+        // fp16 I/O path: transpose + convert float→fp16, write, eval, read fp16→float + transpose
+        _Float16 *x_t = (_Float16*)malloc(S * in_dim * sizeof(_Float16));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < in_dim; i++)
+                x_t[i*S + t] = (_Float16)x[t*in_dim + i];
 
-    ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float));
-    ane_eval(kernel);
+        ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(_Float16));
+        ane_eval(kernel);
 
-    float *y_t = (float*)malloc(S * out_dim * sizeof(float));
-    ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float));
+        _Float16 *y_t = (_Float16*)malloc(S * out_dim * sizeof(_Float16));
+        ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(_Float16));
 
-    for (int t = 0; t < S; t++)
-        for (int i = 0; i < out_dim; i++)
-            y[t*out_dim + i] = y_t[i*S + t];
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < out_dim; i++)
+                y[t*out_dim + i] = (float)y_t[i*S + t];
 
-    free(x_t); free(y_t);
+        free(x_t); free(y_t);
+    } else {
+        // fp32 I/O path: transpose, write, eval, read, transpose back
+        float *x_t = (float*)malloc(S * in_dim * sizeof(float));
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < in_dim; i++)
+                x_t[i*S + t] = x[t*in_dim + i];
+
+        ane_write_input(kernel, 0, x_t, S * in_dim * sizeof(float));
+        ane_eval(kernel);
+
+        float *y_t = (float*)malloc(S * out_dim * sizeof(float));
+        ane_read_output(kernel, 0, y_t, S * out_dim * sizeof(float));
+
+        for (int t = 0; t < S; t++)
+            for (int i = 0; i < out_dim; i++)
+                y[t*out_dim + i] = y_t[i*S + t];
+
+        free(x_t); free(y_t);
+    }
 }
 
 // CPU matmul fallback: y = W @ x, W[out_dim, in_dim], x[S, in_dim] → y[S, out_dim]
diff --git a/training/model.h b/training/model.h
index 6cee52f..b8da703 100644
--- a/training/model.h
+++ b/training/model.h
@@ -151,8 +151,9 @@ static int model_load_weights(Model *m, const char *path) {
 static ANEKernel *compile_conv_kernel(const float *weights, int in_ch, int out_ch, int spatial) {
     NSData *wb = mil_build_weight_blob(weights, out_ch, in_ch);
     NSString *mil = mil_gen_conv(in_ch, out_ch, spatial);
-    size_t inBytes = (size_t)in_ch * spatial * 4;
-    size_t outBytes = (size_t)out_ch * spatial * 4;
+    size_t bpe = g_fp16_io ? 2 : 4;
+    size_t inBytes = (size_t)in_ch * spatial * bpe;
+    size_t outBytes = (size_t)out_ch * spatial * bpe;
     return ane_compile([mil dataUsingEncoding:NSUTF8StringEncoding], wb, 1, &inBytes, 1, &outBytes);
 }
 
@@ -161,9 +162,31 @@ static int model_compile_kernels(Model *m, int seq_len) {
     m->seq_len = seq_len;
     int d = m->cfg.dim, hd = m->cfg.hidden_dim, vs = m->cfg.vocab_size;
     int S = seq_len;
-    printf("Compiling %d ANE conv kernels (S=%d)...\n", N_LAYERS * 7 + 1, S);
+    printf("Compiling %d ANE conv kernels (S=%d, %s I/O)...\n",
+           N_LAYERS * 7 + 1, S, g_fp16_io ? "fp16" : "fp32");
 
-    for (int l = 0; l < N_LAYERS; l++) {
+    // Try first layer as canary — if cast op fails, retry with fp16 I/O
+    m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S);
+    if (!m->kern_q[0] && !g_fp16_io) {
+        printf("  Compile failed, retrying with fp16 I/O (M1/M2 fallback)...\n");
+        g_fp16_io = 1;
+        m->kern_q[0] = compile_conv_kernel(m->wq[0], d, d, S);
+    }
+    if (!m->kern_q[0]) { fprintf(stderr, "L0 kern_q fail\n"); return -1; }
+
+    m->kern_k[0] = compile_conv_kernel(m->wk[0], d, d, S);
+    m->kern_v[0] = compile_conv_kernel(m->wv[0], d, d, S);
+    m->kern_o[0] = compile_conv_kernel(m->wo[0], d, d, S);
+    m->kern_w1[0] = compile_conv_kernel(m->w1[0], d, hd, S);
+    m->kern_w2[0] = compile_conv_kernel(m->w2[0], hd, d, S);
+    m->kern_w3[0] = compile_conv_kernel(m->w3[0], d, hd, S);
+    if (!m->kern_k[0] || !m->kern_v[0] || !m->kern_o[0] ||
+        !m->kern_w1[0] || !m->kern_w2[0] || !m->kern_w3[0]) {
+        fprintf(stderr, "L0 compile fail\n"); return -1;
+    }
+    printf("  Layer 0 OK\n");
+
+    for (int l = 1; l < N_LAYERS; l++) {
         m->kern_q[l] = compile_conv_kernel(m->wq[l], d, d, S);
         m->kern_k[l] = compile_conv_kernel(m->wk[l], d, d, S);
         m->kern_v[l] = compile_conv_kernel(m->wv[l], d, d, S);
@@ -171,20 +194,18 @@ static int model_compile_kernels(Model *m, int seq_len) {
         m->kern_w1[l] = compile_conv_kernel(m->w1[l], d, hd, S);
         m->kern_w2[l] = compile_conv_kernel(m->w2[l], hd, d, S);
         m->kern_w3[l] = compile_conv_kernel(m->w3[l], d, hd, S);
-        if (!m->kern_q[l]) { fprintf(stderr, "L%d kern_q fail\n",l); return -1; }
-        if (!m->kern_k[l]) { fprintf(stderr, "L%d kern_k fail\n",l); return -1; }
-        if (!m->kern_v[l]) { fprintf(stderr, "L%d kern_v fail\n",l); return -1; }
-        if (!m->kern_o[l]) { fprintf(stderr, "L%d kern_o fail\n",l); return -1; }
-        if (!m->kern_w1[l]) { fprintf(stderr, "L%d kern_w1 fail\n",l); return -1; }
-        if (!m->kern_w2[l]) { fprintf(stderr, "L%d kern_w2 fail\n",l); return -1; }
-        if (!m->kern_w3[l]) { fprintf(stderr, "L%d kern_w3 fail\n",l); return -1; }
+        if (!m->kern_q[l] || !m->kern_k[l] || !m->kern_v[l] || !m->kern_o[l] ||
+            !m->kern_w1[l] || !m->kern_w2[l] || !m->kern_w3[l]) {
+            fprintf(stderr, "L%d compile fail\n", l); return -1;
+        }
         printf("  Layer %d OK\n", l);
     }
     m->kern_cls = compile_conv_kernel(m->wcls, d, vs, S);
     if (!m->kern_cls) {
         fprintf(stderr, "Classifier kernel compile failed (dim=%d→vocab=%d too large?), using CPU for cls\n", d, vs);
     }
-    printf("  All kernels compiled (%d conv + %s)\n", N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU");
+    printf("  All kernels compiled (%d conv + %s, %s I/O)\n",
+           N_LAYERS * 7, m->kern_cls ? "cls" : "cls=CPU", g_fp16_io ? "fp16" : "fp32");
     return 0;
 }